aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/staging/rdma
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging/rdma')
-rw-r--r--drivers/staging/rdma/Kconfig33
-rw-r--r--drivers/staging/rdma/Makefile5
-rw-r--r--drivers/staging/rdma/amso1100/Kbuild6
-rw-r--r--drivers/staging/rdma/amso1100/Kconfig15
-rw-r--r--drivers/staging/rdma/amso1100/TODO4
-rw-r--r--drivers/staging/rdma/amso1100/c2.c1241
-rw-r--r--drivers/staging/rdma/amso1100/c2.h547
-rw-r--r--drivers/staging/rdma/amso1100/c2_ae.c327
-rw-r--r--drivers/staging/rdma/amso1100/c2_ae.h108
-rw-r--r--drivers/staging/rdma/amso1100/c2_alloc.c142
-rw-r--r--drivers/staging/rdma/amso1100/c2_cm.c461
-rw-r--r--drivers/staging/rdma/amso1100/c2_cq.c440
-rw-r--r--drivers/staging/rdma/amso1100/c2_intr.c219
-rw-r--r--drivers/staging/rdma/amso1100/c2_mm.c377
-rw-r--r--drivers/staging/rdma/amso1100/c2_mq.c174
-rw-r--r--drivers/staging/rdma/amso1100/c2_mq.h106
-rw-r--r--drivers/staging/rdma/amso1100/c2_pd.c90
-rw-r--r--drivers/staging/rdma/amso1100/c2_provider.c912
-rw-r--r--drivers/staging/rdma/amso1100/c2_provider.h182
-rw-r--r--drivers/staging/rdma/amso1100/c2_qp.c1024
-rw-r--r--drivers/staging/rdma/amso1100/c2_rnic.c655
-rw-r--r--drivers/staging/rdma/amso1100/c2_status.h158
-rw-r--r--drivers/staging/rdma/amso1100/c2_user.h82
-rw-r--r--drivers/staging/rdma/amso1100/c2_vq.c260
-rw-r--r--drivers/staging/rdma/amso1100/c2_vq.h63
-rw-r--r--drivers/staging/rdma/amso1100/c2_wr.h1520
-rw-r--r--drivers/staging/rdma/ehca/Kconfig10
-rw-r--r--drivers/staging/rdma/ehca/Makefile16
-rw-r--r--drivers/staging/rdma/ehca/TODO4
-rw-r--r--drivers/staging/rdma/ehca/ehca_av.c277
-rw-r--r--drivers/staging/rdma/ehca/ehca_classes.h482
-rw-r--r--drivers/staging/rdma/ehca/ehca_classes_pSeries.h208
-rw-r--r--drivers/staging/rdma/ehca/ehca_cq.c397
-rw-r--r--drivers/staging/rdma/ehca/ehca_eq.c189
-rw-r--r--drivers/staging/rdma/ehca/ehca_hca.c414
-rw-r--r--drivers/staging/rdma/ehca/ehca_irq.c870
-rw-r--r--drivers/staging/rdma/ehca/ehca_irq.h77
-rw-r--r--drivers/staging/rdma/ehca/ehca_iverbs.h218
-rw-r--r--drivers/staging/rdma/ehca/ehca_main.c1123
-rw-r--r--drivers/staging/rdma/ehca/ehca_mcast.c131
-rw-r--r--drivers/staging/rdma/ehca/ehca_mrmw.c2593
-rw-r--r--drivers/staging/rdma/ehca/ehca_mrmw.h132
-rw-r--r--drivers/staging/rdma/ehca/ehca_pd.c124
-rw-r--r--drivers/staging/rdma/ehca/ehca_qes.h260
-rw-r--r--drivers/staging/rdma/ehca/ehca_qp.c2257
-rw-r--r--drivers/staging/rdma/ehca/ehca_reqs.c953
-rw-r--r--drivers/staging/rdma/ehca/ehca_sqp.c245
-rw-r--r--drivers/staging/rdma/ehca/ehca_tools.h155
-rw-r--r--drivers/staging/rdma/ehca/ehca_uverbs.c309
-rw-r--r--drivers/staging/rdma/ehca/hcp_if.c949
-rw-r--r--drivers/staging/rdma/ehca/hcp_if.h265
-rw-r--r--drivers/staging/rdma/ehca/hcp_phyp.c82
-rw-r--r--drivers/staging/rdma/ehca/hcp_phyp.h90
-rw-r--r--drivers/staging/rdma/ehca/hipz_fns.h68
-rw-r--r--drivers/staging/rdma/ehca/hipz_fns_core.h100
-rw-r--r--drivers/staging/rdma/ehca/hipz_hw.h414
-rw-r--r--drivers/staging/rdma/ehca/ipz_pt_fn.c289
-rw-r--r--drivers/staging/rdma/ehca/ipz_pt_fn.h289
-rw-r--r--drivers/staging/rdma/hfi1/Kconfig37
-rw-r--r--drivers/staging/rdma/hfi1/Makefile19
-rw-r--r--drivers/staging/rdma/hfi1/TODO6
-rw-r--r--drivers/staging/rdma/hfi1/chip.c10798
-rw-r--r--drivers/staging/rdma/hfi1/chip.h1035
-rw-r--r--drivers/staging/rdma/hfi1/chip_registers.h1292
-rw-r--r--drivers/staging/rdma/hfi1/common.h415
-rw-r--r--drivers/staging/rdma/hfi1/cq.c558
-rw-r--r--drivers/staging/rdma/hfi1/debugfs.c899
-rw-r--r--drivers/staging/rdma/hfi1/debugfs.h78
-rw-r--r--drivers/staging/rdma/hfi1/device.c184
-rw-r--r--drivers/staging/rdma/hfi1/device.h62
-rw-r--r--drivers/staging/rdma/hfi1/diag.c1873
-rw-r--r--drivers/staging/rdma/hfi1/dma.c186
-rw-r--r--drivers/staging/rdma/hfi1/driver.c1241
-rw-r--r--drivers/staging/rdma/hfi1/eprom.c475
-rw-r--r--drivers/staging/rdma/hfi1/eprom.h55
-rw-r--r--drivers/staging/rdma/hfi1/file_ops.c2144
-rw-r--r--drivers/staging/rdma/hfi1/firmware.c1620
-rw-r--r--drivers/staging/rdma/hfi1/hfi.h1821
-rw-r--r--drivers/staging/rdma/hfi1/init.c1722
-rw-r--r--drivers/staging/rdma/hfi1/intr.c207
-rw-r--r--drivers/staging/rdma/hfi1/iowait.h186
-rw-r--r--drivers/staging/rdma/hfi1/keys.c411
-rw-r--r--drivers/staging/rdma/hfi1/mad.c4257
-rw-r--r--drivers/staging/rdma/hfi1/mad.h325
-rw-r--r--drivers/staging/rdma/hfi1/mmap.c192
-rw-r--r--drivers/staging/rdma/hfi1/mr.c551
-rw-r--r--drivers/staging/rdma/hfi1/opa_compat.h129
-rw-r--r--drivers/staging/rdma/hfi1/pcie.c1253
-rw-r--r--drivers/staging/rdma/hfi1/pio.c1771
-rw-r--r--drivers/staging/rdma/hfi1/pio.h224
-rw-r--r--drivers/staging/rdma/hfi1/pio_copy.c858
-rw-r--r--drivers/staging/rdma/hfi1/platform_config.h286
-rw-r--r--drivers/staging/rdma/hfi1/qp.c1687
-rw-r--r--drivers/staging/rdma/hfi1/qp.h235
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.c546
-rw-r--r--drivers/staging/rdma/hfi1/qsfp.h222
-rw-r--r--drivers/staging/rdma/hfi1/rc.c2426
-rw-r--r--drivers/staging/rdma/hfi1/ruc.c948
-rw-r--r--drivers/staging/rdma/hfi1/sdma.c2962
-rw-r--r--drivers/staging/rdma/hfi1/sdma.h1123
-rw-r--r--drivers/staging/rdma/hfi1/srq.c397
-rw-r--r--drivers/staging/rdma/hfi1/sysfs.c739
-rw-r--r--drivers/staging/rdma/hfi1/trace.c221
-rw-r--r--drivers/staging/rdma/hfi1/trace.h1409
-rw-r--r--drivers/staging/rdma/hfi1/twsi.c518
-rw-r--r--drivers/staging/rdma/hfi1/twsi.h68
-rw-r--r--drivers/staging/rdma/hfi1/uc.c585
-rw-r--r--drivers/staging/rdma/hfi1/ud.c885
-rw-r--r--drivers/staging/rdma/hfi1/user_pages.c156
-rw-r--r--drivers/staging/rdma/hfi1/user_sdma.c1444
-rw-r--r--drivers/staging/rdma/hfi1/user_sdma.h89
-rw-r--r--drivers/staging/rdma/hfi1/verbs.c2145
-rw-r--r--drivers/staging/rdma/hfi1/verbs.h1151
-rw-r--r--drivers/staging/rdma/hfi1/verbs_mcast.c385
-rw-r--r--drivers/staging/rdma/ipath/Kconfig16
-rw-r--r--drivers/staging/rdma/ipath/Makefile37
-rw-r--r--drivers/staging/rdma/ipath/TODO5
-rw-r--r--drivers/staging/rdma/ipath/ipath_common.h851
-rw-r--r--drivers/staging/rdma/ipath/ipath_cq.c483
-rw-r--r--drivers/staging/rdma/ipath/ipath_debug.h99
-rw-r--r--drivers/staging/rdma/ipath/ipath_diag.c551
-rw-r--r--drivers/staging/rdma/ipath/ipath_dma.c179
-rw-r--r--drivers/staging/rdma/ipath/ipath_driver.c2789
-rw-r--r--drivers/staging/rdma/ipath/ipath_eeprom.c1183
-rw-r--r--drivers/staging/rdma/ipath/ipath_file_ops.c2620
-rw-r--r--drivers/staging/rdma/ipath/ipath_fs.c422
-rw-r--r--drivers/staging/rdma/ipath/ipath_iba6110.c1940
-rw-r--r--drivers/staging/rdma/ipath/ipath_init_chip.c1066
-rw-r--r--drivers/staging/rdma/ipath/ipath_intr.c1273
-rw-r--r--drivers/staging/rdma/ipath/ipath_kernel.h1373
-rw-r--r--drivers/staging/rdma/ipath/ipath_keys.c270
-rw-r--r--drivers/staging/rdma/ipath/ipath_mad.c1521
-rw-r--r--drivers/staging/rdma/ipath/ipath_mmap.c174
-rw-r--r--drivers/staging/rdma/ipath/ipath_mr.c425
-rw-r--r--drivers/staging/rdma/ipath/ipath_qp.c1080
-rw-r--r--drivers/staging/rdma/ipath/ipath_rc.c1969
-rw-r--r--drivers/staging/rdma/ipath/ipath_registers.h512
-rw-r--r--drivers/staging/rdma/ipath/ipath_ruc.c734
-rw-r--r--drivers/staging/rdma/ipath/ipath_sdma.c818
-rw-r--r--drivers/staging/rdma/ipath/ipath_srq.c380
-rw-r--r--drivers/staging/rdma/ipath/ipath_stats.c347
-rw-r--r--drivers/staging/rdma/ipath/ipath_sysfs.c1238
-rw-r--r--drivers/staging/rdma/ipath/ipath_uc.c547
-rw-r--r--drivers/staging/rdma/ipath/ipath_ud.c580
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_pages.c229
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_sdma.c875
-rw-r--r--drivers/staging/rdma/ipath/ipath_user_sdma.h52
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.c2365
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs.h939
-rw-r--r--drivers/staging/rdma/ipath/ipath_verbs_mcast.c364
-rw-r--r--drivers/staging/rdma/ipath/ipath_wc_ppc64.c49
-rw-r--r--drivers/staging/rdma/ipath/ipath_wc_x86_64.c144
152 files changed, 111151 insertions, 0 deletions
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
new file mode 100644
index 000000000000..d7f62359d743
--- /dev/null
+++ b/drivers/staging/rdma/Kconfig
@@ -0,0 +1,33 @@
+menuconfig STAGING_RDMA
+ bool "RDMA staging drivers"
+ depends on INFINIBAND
+ depends on PCI || BROKEN
+ depends on HAS_IOMEM
+ depends on NET
+ depends on INET
+ default n
+ ---help---
+ This option allows you to select a number of RDMA drivers that
+ fall into one of two categories: deprecated drivers being held
+ here before finally being removed or new drivers that still need
+ some work before being moved to the normal RDMA driver area.
+
+ If you wish to work on these drivers, to help improve them, or
+ to report problems you have with them, please use the
+ linux-rdma@vger.kernel.org mailing list.
+
+ If in doubt, say N here.
+
+
+# Please keep entries in alphabetic order
+if STAGING_RDMA
+
+source "drivers/staging/rdma/amso1100/Kconfig"
+
+source "drivers/staging/rdma/ehca/Kconfig"
+
+source "drivers/staging/rdma/hfi1/Kconfig"
+
+source "drivers/staging/rdma/ipath/Kconfig"
+
+endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
new file mode 100644
index 000000000000..139d78ef2c24
--- /dev/null
+++ b/drivers/staging/rdma/Makefile
@@ -0,0 +1,5 @@
+# Entries for RDMA_STAGING tree
+obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/
+obj-$(CONFIG_INFINIBAND_EHCA) += ehca/
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
+obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
diff --git a/drivers/staging/rdma/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild
new file mode 100644
index 000000000000..950dfabcd89d
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/Kbuild
@@ -0,0 +1,6 @@
+ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
+
+iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
+ c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/staging/rdma/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig
new file mode 100644
index 000000000000..e6ce5f209e47
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_AMSO1100
+ tristate "Ammasso 1100 HCA support"
+ depends on PCI && INET
+ ---help---
+ This is a low-level driver for the Ammasso 1100 host
+ channel adapter (HCA).
+
+config INFINIBAND_AMSO1100_DEBUG
+ bool "Verbose debugging output"
+ depends on INFINIBAND_AMSO1100
+ default n
+ ---help---
+ This option causes the amso1100 driver to produce a bunch of
+ debug messages. Select this if you are developing the driver
+ or trying to diagnose a problem.
diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO
new file mode 100644
index 000000000000..18b00a5cb549
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/TODO
@@ -0,0 +1,4 @@
+7/2015
+
+The amso1100 driver has been deprecated and moved to drivers/staging.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c
new file mode 100644
index 000000000000..766a71ccefed
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2.c
@@ -0,0 +1,1241 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_provider.h"
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+ | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+
+static int debug = -1; /* defaults above */
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+static int c2_up(struct net_device *netdev);
+static int c2_down(struct net_device *netdev);
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+static void c2_tx_interrupt(struct net_device *netdev);
+static void c2_rx_interrupt(struct net_device *netdev);
+static irqreturn_t c2_interrupt(int irq, void *dev_id);
+static void c2_tx_timeout(struct net_device *netdev);
+static int c2_change_mtu(struct net_device *netdev, int new_mtu);
+static void c2_reset(struct c2_port *c2_port);
+
+static struct pci_device_id c2_pci_table[] = {
+ { PCI_DEVICE(0x18b8, 0xb001) },
+ { 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, c2_pci_table);
+
+static void c2_print_macaddr(struct net_device *netdev)
+{
+ pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+static void c2_set_rxbufsize(struct c2_port *c2_port)
+{
+ struct net_device *netdev = c2_port->netdev;
+
+ if (netdev->mtu > RX_BUF_SIZE)
+ c2_port->rx_buf_size =
+ netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
+ NET_IP_ALIGN;
+ else
+ c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
+}
+
+/*
+ * Allocate TX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
+ dma_addr_t base, void __iomem * mmio_txp_ring)
+{
+ struct c2_tx_desc *tx_desc;
+ struct c2_txp_desc __iomem *txp_desc;
+ struct c2_element *elem;
+ int i;
+
+ tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
+ if (!tx_ring->start)
+ return -ENOMEM;
+
+ elem = tx_ring->start;
+ tx_desc = vaddr;
+ txp_desc = mmio_txp_ring;
+ for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
+ tx_desc->len = 0;
+ tx_desc->status = 0;
+
+ /* Set TXP_HTXD_UNINIT */
+ __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+ (void __iomem *) txp_desc + C2_TXP_ADDR);
+ __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+ (void __iomem *) txp_desc + C2_TXP_FLAGS);
+
+ elem->skb = NULL;
+ elem->ht_desc = tx_desc;
+ elem->hw_desc = txp_desc;
+
+ if (i == tx_ring->count - 1) {
+ elem->next = tx_ring->start;
+ tx_desc->next_offset = base;
+ } else {
+ elem->next = elem + 1;
+ tx_desc->next_offset =
+ base + (i + 1) * sizeof(*tx_desc);
+ }
+ }
+
+ tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
+
+ return 0;
+}
+
+/*
+ * Allocate RX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
+ dma_addr_t base, void __iomem * mmio_rxp_ring)
+{
+ struct c2_rx_desc *rx_desc;
+ struct c2_rxp_desc __iomem *rxp_desc;
+ struct c2_element *elem;
+ int i;
+
+ rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
+ if (!rx_ring->start)
+ return -ENOMEM;
+
+ elem = rx_ring->start;
+ rx_desc = vaddr;
+ rxp_desc = mmio_rxp_ring;
+ for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
+ rx_desc->len = 0;
+ rx_desc->status = 0;
+
+ /* Set RXP_HRXD_UNINIT */
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
+ (void __iomem *) rxp_desc + C2_RXP_STATUS);
+ __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
+ __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+ (void __iomem *) rxp_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+ (void __iomem *) rxp_desc + C2_RXP_FLAGS);
+
+ elem->skb = NULL;
+ elem->ht_desc = rx_desc;
+ elem->hw_desc = rxp_desc;
+
+ if (i == rx_ring->count - 1) {
+ elem->next = rx_ring->start;
+ rx_desc->next_offset = base;
+ } else {
+ elem->next = elem + 1;
+ rx_desc->next_offset =
+ base + (i + 1) * sizeof(*rx_desc);
+ }
+ }
+
+ rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
+
+ return 0;
+}
+
+/* Setup buffer for receiving */
+static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_rx_desc *rx_desc = elem->ht_desc;
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen;
+ struct c2_rxp_hdr *rxp_hdr;
+
+ skb = dev_alloc_skb(c2_port->rx_buf_size);
+ if (unlikely(!skb)) {
+ pr_debug("%s: out of memory for receive\n",
+ c2_port->netdev->name);
+ return -ENOMEM;
+ }
+
+ /* Zero out the rxp hdr in the sk_buff */
+ memset(skb->data, 0, sizeof(*rxp_hdr));
+
+ skb->dev = c2_port->netdev;
+
+ maplen = c2_port->rx_buf_size;
+ mapaddr =
+ pci_map_single(c2dev->pcidev, skb->data, maplen,
+ PCI_DMA_FROMDEVICE);
+
+ /* Set the sk_buff RXP_header to RXP_HRXD_READY */
+ rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+ rxp_hdr->flags = RXP_HRXD_READY;
+
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
+ elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ elem->skb = skb;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+ rx_desc->len = maplen;
+
+ return 0;
+}
+
+/*
+ * Allocate buffers for the Rx ring
+ * For receive: rx_ring.to_clean is next received frame
+ */
+static int c2_rx_fill(struct c2_port *c2_port)
+{
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ int ret = 0;
+
+ elem = rx_ring->start;
+ do {
+ if (c2_rx_alloc(c2_port, elem)) {
+ ret = 1;
+ break;
+ }
+ } while ((elem = elem->next) != rx_ring->start);
+
+ rx_ring->to_clean = rx_ring->start;
+ return ret;
+}
+
+/* Free all buffers in RX ring, assumes receiver stopped */
+static void c2_rx_clean(struct c2_port *c2_port)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ struct c2_rx_desc *rx_desc;
+
+ elem = rx_ring->start;
+ do {
+ rx_desc = elem->ht_desc;
+ rx_desc->len = 0;
+
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+ __raw_writew(0, elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+ elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ if (elem->skb) {
+ pci_unmap_single(c2dev->pcidev, elem->mapaddr,
+ elem->maplen, PCI_DMA_FROMDEVICE);
+ dev_kfree_skb(elem->skb);
+ elem->skb = NULL;
+ }
+ } while ((elem = elem->next) != rx_ring->start);
+}
+
+static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
+{
+ struct c2_tx_desc *tx_desc = elem->ht_desc;
+
+ tx_desc->len = 0;
+
+ pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
+ PCI_DMA_TODEVICE);
+
+ if (elem->skb) {
+ dev_kfree_skb_any(elem->skb);
+ elem->skb = NULL;
+ }
+
+ return 0;
+}
+
+/* Free all buffers in TX ring, assumes transmitter stopped */
+static void c2_tx_clean(struct c2_port *c2_port)
+{
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ struct c2_txp_desc txp_htxd;
+ int retry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+ elem = tx_ring->start;
+
+ do {
+ retry = 0;
+ do {
+ txp_htxd.flags =
+ readw(elem->hw_desc + C2_TXP_FLAGS);
+
+ if (txp_htxd.flags == TXP_HTXD_READY) {
+ retry = 1;
+ __raw_writew(0,
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writeq(0,
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
+ elem->hw_desc + C2_TXP_FLAGS);
+ c2_port->netdev->stats.tx_dropped++;
+ break;
+ } else {
+ __raw_writew(0,
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+ elem->hw_desc + C2_TXP_FLAGS);
+ }
+
+ c2_tx_free(c2_port->c2dev, elem);
+
+ } while ((elem = elem->next) != tx_ring->start);
+ } while (retry);
+
+ c2_port->tx_avail = c2_port->tx_ring.count - 1;
+ c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
+
+ if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+ netif_wake_queue(c2_port->netdev);
+
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+}
+
+/*
+ * Process transmit descriptors marked 'DONE' by the firmware,
+ * freeing up their unneeded sk_buffs.
+ */
+static void c2_tx_interrupt(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ struct c2_txp_desc txp_htxd;
+
+ spin_lock(&c2_port->tx_lock);
+
+ for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
+ elem = elem->next) {
+ txp_htxd.flags =
+ be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
+
+ if (txp_htxd.flags != TXP_HTXD_DONE)
+ break;
+
+ if (netif_msg_tx_done(c2_port)) {
+ /* PCI reads are expensive in fast path */
+ txp_htxd.len =
+ be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
+ pr_debug("%s: tx done slot %3Zu status 0x%x len "
+ "%5u bytes\n",
+ netdev->name, elem - tx_ring->start,
+ txp_htxd.flags, txp_htxd.len);
+ }
+
+ c2_tx_free(c2dev, elem);
+ ++(c2_port->tx_avail);
+ }
+
+ tx_ring->to_clean = elem;
+
+ if (netif_queue_stopped(netdev)
+ && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+ netif_wake_queue(netdev);
+
+ spin_unlock(&c2_port->tx_lock);
+}
+
+static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
+{
+ struct c2_rx_desc *rx_desc = elem->ht_desc;
+ struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+
+ if (rxp_hdr->status != RXP_HRXD_OK ||
+ rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
+ pr_debug("BAD RXP_HRXD\n");
+ pr_debug(" rx_desc : %p\n", rx_desc);
+ pr_debug(" index : %Zu\n",
+ elem - c2_port->rx_ring.start);
+ pr_debug(" len : %u\n", rx_desc->len);
+ pr_debug(" rxp_hdr : %p [PA %p]\n", rxp_hdr,
+ (void *) __pa((unsigned long) rxp_hdr));
+ pr_debug(" flags : 0x%x\n", rxp_hdr->flags);
+ pr_debug(" status: 0x%x\n", rxp_hdr->status);
+ pr_debug(" len : %u\n", rxp_hdr->len);
+ pr_debug(" rsvd : 0x%x\n", rxp_hdr->rsvd);
+ }
+
+ /* Setup the skb for reuse since we're dropping this pkt */
+ elem->skb->data = elem->skb->head;
+ skb_reset_tail_pointer(elem->skb);
+
+ /* Zero out the rxp hdr in the sk_buff */
+ memset(elem->skb->data, 0, sizeof(*rxp_hdr));
+
+ /* Write the descriptor to the adapter's rx ring */
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+ __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
+ elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
+ elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ pr_debug("packet dropped\n");
+ c2_port->netdev->stats.rx_dropped++;
+}
+
+static void c2_rx_interrupt(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ struct c2_rx_desc *rx_desc;
+ struct c2_rxp_hdr *rxp_hdr;
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen, buflen;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2dev->lock, flags);
+
+ /* Begin where we left off */
+ rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
+
+ for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
+ elem = elem->next) {
+ rx_desc = elem->ht_desc;
+ mapaddr = elem->mapaddr;
+ maplen = elem->maplen;
+ skb = elem->skb;
+ rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+
+ if (rxp_hdr->flags != RXP_HRXD_DONE)
+ break;
+ buflen = rxp_hdr->len;
+
+ /* Sanity check the RXP header */
+ if (rxp_hdr->status != RXP_HRXD_OK ||
+ buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
+ c2_rx_error(c2_port, elem);
+ continue;
+ }
+
+ /*
+ * Allocate and map a new skb for replenishing the host
+ * RX desc
+ */
+ if (c2_rx_alloc(c2_port, elem)) {
+ c2_rx_error(c2_port, elem);
+ continue;
+ }
+
+ /* Unmap the old skb */
+ pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
+ PCI_DMA_FROMDEVICE);
+
+ prefetch(skb->data);
+
+ /*
+ * Skip past the leading 8 bytes comprising of the
+ * "struct c2_rxp_hdr", prepended by the adapter
+ * to the usual Ethernet header ("struct ethhdr"),
+ * to the start of the raw Ethernet packet.
+ *
+ * Fix up the various fields in the sk_buff before
+ * passing it up to netif_rx(). The transfer size
+ * (in bytes) specified by the adapter len field of
+ * the "struct rxp_hdr_t" does NOT include the
+ * "sizeof(struct c2_rxp_hdr)".
+ */
+ skb->data += sizeof(*rxp_hdr);
+ skb_set_tail_pointer(skb, buflen);
+ skb->len = buflen;
+ skb->protocol = eth_type_trans(skb, netdev);
+
+ netif_rx(skb);
+
+ netdev->stats.rx_packets++;
+ netdev->stats.rx_bytes += buflen;
+ }
+
+ /* Save where we left off */
+ rx_ring->to_clean = elem;
+ c2dev->cur_rx = elem - rx_ring->start;
+ C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+}
+
+/*
+ * Handle netisr0 TX & RX interrupts.
+ */
+static irqreturn_t c2_interrupt(int irq, void *dev_id)
+{
+ unsigned int netisr0, dmaisr;
+ int handled = 0;
+ struct c2_dev *c2dev = (struct c2_dev *) dev_id;
+
+ /* Process CCILNET interrupts */
+ netisr0 = readl(c2dev->regs + C2_NISR0);
+ if (netisr0) {
+
+ /*
+ * There is an issue with the firmware that always
+ * provides the status of RX for both TX & RX
+ * interrupts. So process both queues here.
+ */
+ c2_rx_interrupt(c2dev->netdev);
+ c2_tx_interrupt(c2dev->netdev);
+
+ /* Clear the interrupt */
+ writel(netisr0, c2dev->regs + C2_NISR0);
+ handled++;
+ }
+
+ /* Process RNIC interrupts */
+ dmaisr = readl(c2dev->regs + C2_DISR);
+ if (dmaisr) {
+ writel(dmaisr, c2dev->regs + C2_DISR);
+ c2_rnic_interrupt(c2dev);
+ handled++;
+ }
+
+ if (handled) {
+ return IRQ_HANDLED;
+ } else {
+ return IRQ_NONE;
+ }
+}
+
+static int c2_up(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_element *elem;
+ struct c2_rxp_hdr *rxp_hdr;
+ struct in_device *in_dev;
+ size_t rx_size, tx_size;
+ int ret, i;
+ unsigned int netimr0;
+
+ if (netif_msg_ifup(c2_port))
+ pr_debug("%s: enabling interface\n", netdev->name);
+
+ /* Set the Rx buffer size based on MTU */
+ c2_set_rxbufsize(c2_port);
+
+ /* Allocate DMA'able memory for Tx/Rx host descriptor rings */
+ rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
+ tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
+
+ c2_port->mem_size = tx_size + rx_size;
+ c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
+ &c2_port->dma);
+ if (c2_port->mem == NULL) {
+ pr_debug("Unable to allocate memory for "
+ "host descriptor rings\n");
+ return -ENOMEM;
+ }
+
+ /* Create the Rx host descriptor ring */
+ if ((ret =
+ c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
+ c2dev->mmio_rxp_ring))) {
+ pr_debug("Unable to create RX ring\n");
+ goto bail0;
+ }
+
+ /* Allocate Rx buffers for the host descriptor ring */
+ if (c2_rx_fill(c2_port)) {
+ pr_debug("Unable to fill RX ring\n");
+ goto bail1;
+ }
+
+ /* Create the Tx host descriptor ring */
+ if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
+ c2_port->dma + rx_size,
+ c2dev->mmio_txp_ring))) {
+ pr_debug("Unable to create TX ring\n");
+ goto bail1;
+ }
+
+ /* Set the TX pointer to where we left off */
+ c2_port->tx_avail = c2_port->tx_ring.count - 1;
+ c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
+ c2_port->tx_ring.start + c2dev->cur_tx;
+
+ /* missing: Initialize MAC */
+
+ BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
+
+ /* Reset the adapter, ensures the driver is in sync with the RXP */
+ c2_reset(c2_port);
+
+ /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
+ for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
+ i++, elem++) {
+ rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+ rxp_hdr->flags = 0;
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+ }
+
+ /* Enable network packets */
+ netif_start_queue(netdev);
+
+ /* Enable IRQ */
+ writel(0, c2dev->regs + C2_IDIS);
+ netimr0 = readl(c2dev->regs + C2_NIMR0);
+ netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
+ writel(netimr0, c2dev->regs + C2_NIMR0);
+
+ /* Tell the stack to ignore arp requests for ipaddrs bound to
+ * other interfaces. This is needed to prevent the host stack
+ * from responding to arp requests to the ipaddr bound on the
+ * rdma interface.
+ */
+ in_dev = in_dev_get(netdev);
+ IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
+ in_dev_put(in_dev);
+
+ return 0;
+
+ bail1:
+ c2_rx_clean(c2_port);
+ kfree(c2_port->rx_ring.start);
+
+ bail0:
+ pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+ c2_port->dma);
+
+ return ret;
+}
+
+static int c2_down(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+
+ if (netif_msg_ifdown(c2_port))
+ pr_debug("%s: disabling interface\n",
+ netdev->name);
+
+ /* Wait for all the queued packets to get sent */
+ c2_tx_interrupt(netdev);
+
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+
+ /* Disable IRQs by clearing the interrupt mask */
+ writel(1, c2dev->regs + C2_IDIS);
+ writel(0, c2dev->regs + C2_NIMR0);
+
+ /* missing: Stop transmitter */
+
+ /* missing: Stop receiver */
+
+ /* Reset the adapter, ensures the driver is in sync with the RXP */
+ c2_reset(c2_port);
+
+ /* missing: Turn off LEDs here */
+
+ /* Free all buffers in the host descriptor rings */
+ c2_tx_clean(c2_port);
+ c2_rx_clean(c2_port);
+
+ /* Free the host descriptor rings */
+ kfree(c2_port->rx_ring.start);
+ kfree(c2_port->tx_ring.start);
+ pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+ c2_port->dma);
+
+ return 0;
+}
+
+static void c2_reset(struct c2_port *c2_port)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ unsigned int cur_rx = c2dev->cur_rx;
+
+ /* Tell the hardware to quiesce */
+ C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
+
+ /*
+ * The hardware will reset the C2_PCI_HRX_QUI bit once
+ * the RXP is quiesced. Wait 2 seconds for this.
+ */
+ ssleep(2);
+
+ cur_rx = C2_GET_CUR_RX(c2dev);
+
+ if (cur_rx & C2_PCI_HRX_QUI)
+ pr_debug("c2_reset: failed to quiesce the hardware!\n");
+
+ cur_rx &= ~C2_PCI_HRX_QUI;
+
+ c2dev->cur_rx = cur_rx;
+
+ pr_debug("Current RX: %u\n", c2dev->cur_rx);
+}
+
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ dma_addr_t mapaddr;
+ u32 maplen;
+ unsigned long flags;
+ unsigned int i;
+
+ spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+ if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
+ netif_stop_queue(netdev);
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+ pr_debug("%s: Tx ring full when queue awake!\n",
+ netdev->name);
+ return NETDEV_TX_BUSY;
+ }
+
+ maplen = skb_headlen(skb);
+ mapaddr =
+ pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
+
+ elem = tx_ring->to_use;
+ elem->skb = skb;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+
+ /* Tell HW to xmit */
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(maplen),
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+ elem->hw_desc + C2_TXP_FLAGS);
+
+ netdev->stats.tx_packets++;
+ netdev->stats.tx_bytes += maplen;
+
+ /* Loop thru additional data fragments and queue them */
+ if (skb_shinfo(skb)->nr_frags) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ maplen = skb_frag_size(frag);
+ mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
+ 0, maplen, DMA_TO_DEVICE);
+ elem = elem->next;
+ elem->skb = NULL;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+
+ /* Tell HW to xmit */
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(maplen),
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+ elem->hw_desc + C2_TXP_FLAGS);
+
+ netdev->stats.tx_packets++;
+ netdev->stats.tx_bytes += maplen;
+ }
+ }
+
+ tx_ring->to_use = elem->next;
+ c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
+
+ if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
+ netif_stop_queue(netdev);
+ if (netif_msg_tx_queued(c2_port))
+ pr_debug("%s: transmit queue full\n",
+ netdev->name);
+ }
+
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+ netdev->trans_start = jiffies;
+
+ return NETDEV_TX_OK;
+}
+
+static void c2_tx_timeout(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+
+ if (netif_msg_timer(c2_port))
+ pr_debug("%s: tx timeout\n", netdev->name);
+
+ c2_tx_clean(c2_port);
+}
+
+static int c2_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ int ret = 0;
+
+ if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+
+ if (netif_running(netdev)) {
+ c2_down(netdev);
+
+ c2_up(netdev);
+ }
+
+ return ret;
+}
+
+static const struct net_device_ops c2_netdev = {
+ .ndo_open = c2_up,
+ .ndo_stop = c2_down,
+ .ndo_start_xmit = c2_xmit_frame,
+ .ndo_tx_timeout = c2_tx_timeout,
+ .ndo_change_mtu = c2_change_mtu,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+/* Initialize network device */
+static struct net_device *c2_devinit(struct c2_dev *c2dev,
+ void __iomem * mmio_addr)
+{
+ struct c2_port *c2_port = NULL;
+ struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
+
+ if (!netdev) {
+ pr_debug("c2_port etherdev alloc failed");
+ return NULL;
+ }
+
+ SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+ netdev->netdev_ops = &c2_netdev;
+ netdev->watchdog_timeo = C2_TX_TIMEOUT;
+ netdev->irq = c2dev->pcidev->irq;
+
+ c2_port = netdev_priv(netdev);
+ c2_port->netdev = netdev;
+ c2_port->c2dev = c2dev;
+ c2_port->msg_enable = netif_msg_init(debug, default_msg);
+ c2_port->tx_ring.count = C2_NUM_TX_DESC;
+ c2_port->rx_ring.count = C2_NUM_RX_DESC;
+
+ spin_lock_init(&c2_port->tx_lock);
+
+ /* Copy our 48-bit ethernet hardware address */
+ memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
+
+ /* Validate the MAC address */
+ if (!is_valid_ether_addr(netdev->dev_addr)) {
+ pr_debug("Invalid MAC Address\n");
+ c2_print_macaddr(netdev);
+ free_netdev(netdev);
+ return NULL;
+ }
+
+ c2dev->netdev = netdev;
+
+ return netdev;
+}
+
+static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+ int ret = 0, i;
+ unsigned long reg0_start, reg0_flags, reg0_len;
+ unsigned long reg2_start, reg2_flags, reg2_len;
+ unsigned long reg4_start, reg4_flags, reg4_len;
+ unsigned kva_map_size;
+ struct net_device *netdev = NULL;
+ struct c2_dev *c2dev = NULL;
+ void __iomem *mmio_regs = NULL;
+
+ printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
+ DRV_VERSION);
+
+ /* Enable PCI device */
+ ret = pci_enable_device(pcidev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
+ pci_name(pcidev));
+ goto bail0;
+ }
+
+ reg0_start = pci_resource_start(pcidev, BAR_0);
+ reg0_len = pci_resource_len(pcidev, BAR_0);
+ reg0_flags = pci_resource_flags(pcidev, BAR_0);
+
+ reg2_start = pci_resource_start(pcidev, BAR_2);
+ reg2_len = pci_resource_len(pcidev, BAR_2);
+ reg2_flags = pci_resource_flags(pcidev, BAR_2);
+
+ reg4_start = pci_resource_start(pcidev, BAR_4);
+ reg4_len = pci_resource_len(pcidev, BAR_4);
+ reg4_flags = pci_resource_flags(pcidev, BAR_4);
+
+ pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
+ pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
+ pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
+
+ /* Make sure PCI base addr are MMIO */
+ if (!(reg0_flags & IORESOURCE_MEM) ||
+ !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
+ printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+ ret = -ENODEV;
+ goto bail1;
+ }
+
+ /* Check for weird/broken PCI region reporting */
+ if ((reg0_len < C2_REG0_SIZE) ||
+ (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
+ printk(KERN_ERR PFX "Invalid PCI region sizes\n");
+ ret = -ENODEV;
+ goto bail1;
+ }
+
+ /* Reserve PCI I/O and memory resources */
+ ret = pci_request_regions(pcidev, DRV_NAME);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: Unable to request regions\n",
+ pci_name(pcidev));
+ goto bail1;
+ }
+
+ if ((sizeof(dma_addr_t) > 4)) {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "64b DMA configuration failed\n");
+ goto bail2;
+ }
+ } else {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "32b DMA configuration failed\n");
+ goto bail2;
+ }
+ }
+
+ /* Enables bus-mastering on the device */
+ pci_set_master(pcidev);
+
+ /* Remap the adapter PCI registers in BAR4 */
+ mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+ sizeof(struct c2_adapter_pci_regs));
+ if (!mmio_regs) {
+ printk(KERN_ERR PFX
+ "Unable to remap adapter PCI registers in BAR4\n");
+ ret = -EIO;
+ goto bail2;
+ }
+
+ /* Validate PCI regs magic */
+ for (i = 0; i < sizeof(c2_magic); i++) {
+ if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
+ printk(KERN_ERR PFX "Downlevel Firmware boot loader "
+ "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
+ "utility to update your boot loader\n",
+ i + 1, sizeof(c2_magic),
+ readb(mmio_regs + C2_REGS_MAGIC + i),
+ c2_magic[i]);
+ printk(KERN_ERR PFX "Adapter not claimed\n");
+ iounmap(mmio_regs);
+ ret = -EIO;
+ goto bail2;
+ }
+ }
+
+ /* Validate the adapter version */
+ if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
+ printk(KERN_ERR PFX "Version mismatch "
+ "[fw=%u, c2=%u], Adapter not claimed\n",
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
+ C2_VERSION);
+ ret = -EINVAL;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ /* Validate the adapter IVN */
+ if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
+ printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using "
+ "the OpenIB device support kit. "
+ "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
+ C2_IVN);
+ ret = -EINVAL;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ /* Allocate hardware structure */
+ c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
+ if (!c2dev) {
+ printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
+ pci_name(pcidev));
+ ret = -ENOMEM;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ memset(c2dev, 0, sizeof(*c2dev));
+ spin_lock_init(&c2dev->lock);
+ c2dev->pcidev = pcidev;
+ c2dev->cur_tx = 0;
+
+ /* Get the last RX index */
+ c2dev->cur_rx =
+ (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
+ 0xffffc000) / sizeof(struct c2_rxp_desc);
+
+ /* Request an interrupt line for the driver */
+ ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+ pci_name(pcidev), pcidev->irq);
+ iounmap(mmio_regs);
+ goto bail3;
+ }
+
+ /* Set driver specific data */
+ pci_set_drvdata(pcidev, c2dev);
+
+ /* Initialize network device */
+ if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
+ ret = -ENOMEM;
+ iounmap(mmio_regs);
+ goto bail4;
+ }
+
+ /* Save off the actual size prior to unmapping mmio_regs */
+ kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
+
+ /* Unmap the adapter PCI registers in BAR4 */
+ iounmap(mmio_regs);
+
+ /* Register network device */
+ ret = register_netdev(netdev);
+ if (ret) {
+ printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
+ ret);
+ goto bail5;
+ }
+
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+
+ /* Remap the adapter HRXDQ PA space to kernel VA space */
+ c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
+ C2_RXP_HRXDQ_SIZE);
+ if (!c2dev->mmio_rxp_ring) {
+ printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
+ ret = -EIO;
+ goto bail6;
+ }
+
+ /* Remap the adapter HTXDQ PA space to kernel VA space */
+ c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
+ C2_TXP_HTXDQ_SIZE);
+ if (!c2dev->mmio_txp_ring) {
+ printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
+ ret = -EIO;
+ goto bail7;
+ }
+
+ /* Save off the current RX index in the last 4 bytes of the TXP Ring */
+ C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+ /* Remap the PCI registers in adapter BAR0 to kernel VA space */
+ c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
+ if (!c2dev->regs) {
+ printk(KERN_ERR PFX "Unable to remap BAR0\n");
+ ret = -EIO;
+ goto bail8;
+ }
+
+ /* Remap the PCI registers in adapter BAR4 to kernel VA space */
+ c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
+ c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+ kva_map_size);
+ if (!c2dev->kva) {
+ printk(KERN_ERR PFX "Unable to remap BAR4\n");
+ ret = -EIO;
+ goto bail9;
+ }
+
+ /* Print out the MAC address */
+ c2_print_macaddr(netdev);
+
+ ret = c2_rnic_init(c2dev);
+ if (ret) {
+ printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
+ goto bail10;
+ }
+
+ ret = c2_register_device(c2dev);
+ if (ret)
+ goto bail10;
+
+ return 0;
+
+ bail10:
+ iounmap(c2dev->kva);
+
+ bail9:
+ iounmap(c2dev->regs);
+
+ bail8:
+ iounmap(c2dev->mmio_txp_ring);
+
+ bail7:
+ iounmap(c2dev->mmio_rxp_ring);
+
+ bail6:
+ unregister_netdev(netdev);
+
+ bail5:
+ free_netdev(netdev);
+
+ bail4:
+ free_irq(pcidev->irq, c2dev);
+
+ bail3:
+ ib_dealloc_device(&c2dev->ibdev);
+
+ bail2:
+ pci_release_regions(pcidev);
+
+ bail1:
+ pci_disable_device(pcidev);
+
+ bail0:
+ return ret;
+}
+
+static void c2_remove(struct pci_dev *pcidev)
+{
+ struct c2_dev *c2dev = pci_get_drvdata(pcidev);
+ struct net_device *netdev = c2dev->netdev;
+
+ /* Unregister with OpenIB */
+ c2_unregister_device(c2dev);
+
+ /* Clean up the RNIC resources */
+ c2_rnic_term(c2dev);
+
+ /* Remove network device from the kernel */
+ unregister_netdev(netdev);
+
+ /* Free network device */
+ free_netdev(netdev);
+
+ /* Free the interrupt line */
+ free_irq(pcidev->irq, c2dev);
+
+ /* missing: Turn LEDs off here */
+
+ /* Unmap adapter PA space */
+ iounmap(c2dev->kva);
+ iounmap(c2dev->regs);
+ iounmap(c2dev->mmio_txp_ring);
+ iounmap(c2dev->mmio_rxp_ring);
+
+ /* Free the hardware structure */
+ ib_dealloc_device(&c2dev->ibdev);
+
+ /* Release reserved PCI I/O and memory resources */
+ pci_release_regions(pcidev);
+
+ /* Disable PCI device */
+ pci_disable_device(pcidev);
+
+ /* Clear driver specific data */
+ pci_set_drvdata(pcidev, NULL);
+}
+
+static struct pci_driver c2_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = c2_pci_table,
+ .probe = c2_probe,
+ .remove = c2_remove,
+};
+
+module_pci_driver(c2_pci_driver);
diff --git a/drivers/staging/rdma/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h
new file mode 100644
index 000000000000..d619d735838b
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2.h
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __C2_H
+#define __C2_H
+
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+
+#include "c2_provider.h"
+#include "c2_mq.h"
+#include "c2_status.h"
+
+#define DRV_NAME "c2"
+#define DRV_VERSION "1.1"
+#define PFX DRV_NAME ": "
+
+#define BAR_0 0
+#define BAR_2 2
+#define BAR_4 4
+
+#define RX_BUF_SIZE (1536 + 8)
+#define ETH_JUMBO_MTU 9000
+#define C2_MAGIC "CEPHEUS"
+#define C2_VERSION 4
+#define C2_IVN (18 & 0x7fffffff)
+
+#define C2_REG0_SIZE (16 * 1024)
+#define C2_REG2_SIZE (2 * 1024 * 1024)
+#define C2_REG4_SIZE (256 * 1024 * 1024)
+#define C2_NUM_TX_DESC 341
+#define C2_NUM_RX_DESC 256
+#define C2_PCI_REGS_OFFSET (0x10000)
+#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
+#define C2_RXP_HRXDQ_SIZE (4096)
+#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
+#define C2_TXP_HTXDQ_SIZE (4096)
+#define C2_TX_TIMEOUT (6*HZ)
+
+/* CEPHEUS */
+static const u8 c2_magic[] = {
+ 0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
+};
+
+enum adapter_pci_regs {
+ C2_REGS_MAGIC = 0x0000,
+ C2_REGS_VERS = 0x0008,
+ C2_REGS_IVN = 0x000C,
+ C2_REGS_PCI_WINSIZE = 0x0010,
+ C2_REGS_Q0_QSIZE = 0x0014,
+ C2_REGS_Q0_MSGSIZE = 0x0018,
+ C2_REGS_Q0_POOLSTART = 0x001C,
+ C2_REGS_Q0_SHARED = 0x0020,
+ C2_REGS_Q1_QSIZE = 0x0024,
+ C2_REGS_Q1_MSGSIZE = 0x0028,
+ C2_REGS_Q1_SHARED = 0x0030,
+ C2_REGS_Q2_QSIZE = 0x0034,
+ C2_REGS_Q2_MSGSIZE = 0x0038,
+ C2_REGS_Q2_SHARED = 0x0040,
+ C2_REGS_ENADDR = 0x004C,
+ C2_REGS_RDMA_ENADDR = 0x0054,
+ C2_REGS_HRX_CUR = 0x006C,
+};
+
+struct c2_adapter_pci_regs {
+ char reg_magic[8];
+ u32 version;
+ u32 ivn;
+ u32 pci_window_size;
+ u32 q0_q_size;
+ u32 q0_msg_size;
+ u32 q0_pool_start;
+ u32 q0_shared;
+ u32 q1_q_size;
+ u32 q1_msg_size;
+ u32 q1_pool_start;
+ u32 q1_shared;
+ u32 q2_q_size;
+ u32 q2_msg_size;
+ u32 q2_pool_start;
+ u32 q2_shared;
+ u32 log_start;
+ u32 log_size;
+ u8 host_enaddr[8];
+ u8 rdma_enaddr[8];
+ u32 crash_entry;
+ u32 crash_ready[2];
+ u32 fw_txd_cur;
+ u32 fw_hrxd_cur;
+ u32 fw_rxd_cur;
+};
+
+enum pci_regs {
+ C2_HISR = 0x0000,
+ C2_DISR = 0x0004,
+ C2_HIMR = 0x0008,
+ C2_DIMR = 0x000C,
+ C2_NISR0 = 0x0010,
+ C2_NISR1 = 0x0014,
+ C2_NIMR0 = 0x0018,
+ C2_NIMR1 = 0x001C,
+ C2_IDIS = 0x0020,
+};
+
+enum {
+ C2_PCI_HRX_INT = 1 << 8,
+ C2_PCI_HTX_INT = 1 << 17,
+ C2_PCI_HRX_QUI = 1 << 31,
+};
+
+/*
+ * Cepheus registers in BAR0.
+ */
+struct c2_pci_regs {
+ u32 hostisr;
+ u32 dmaisr;
+ u32 hostimr;
+ u32 dmaimr;
+ u32 netisr0;
+ u32 netisr1;
+ u32 netimr0;
+ u32 netimr1;
+ u32 int_disable;
+};
+
+/* TXP flags */
+enum c2_txp_flags {
+ TXP_HTXD_DONE = 0,
+ TXP_HTXD_READY = 1 << 0,
+ TXP_HTXD_UNINIT = 1 << 1,
+};
+
+/* RXP flags */
+enum c2_rxp_flags {
+ RXP_HRXD_UNINIT = 0,
+ RXP_HRXD_READY = 1 << 0,
+ RXP_HRXD_DONE = 1 << 1,
+};
+
+/* RXP status */
+enum c2_rxp_status {
+ RXP_HRXD_ZERO = 0,
+ RXP_HRXD_OK = 1 << 0,
+ RXP_HRXD_BUF_OV = 1 << 1,
+};
+
+/* TXP descriptor fields */
+enum txp_desc {
+ C2_TXP_FLAGS = 0x0000,
+ C2_TXP_LEN = 0x0002,
+ C2_TXP_ADDR = 0x0004,
+};
+
+/* RXP descriptor fields */
+enum rxp_desc {
+ C2_RXP_FLAGS = 0x0000,
+ C2_RXP_STATUS = 0x0002,
+ C2_RXP_COUNT = 0x0004,
+ C2_RXP_LEN = 0x0006,
+ C2_RXP_ADDR = 0x0008,
+};
+
+struct c2_txp_desc {
+ u16 flags;
+ u16 len;
+ u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_desc {
+ u16 flags;
+ u16 status;
+ u16 count;
+ u16 len;
+ u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_hdr {
+ u16 flags;
+ u16 status;
+ u16 len;
+ u16 rsvd;
+} __attribute__ ((packed));
+
+struct c2_tx_desc {
+ u32 len;
+ u32 status;
+ dma_addr_t next_offset;
+};
+
+struct c2_rx_desc {
+ u32 len;
+ u32 status;
+ dma_addr_t next_offset;
+};
+
+struct c2_alloc {
+ u32 last;
+ u32 max;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct c2_array {
+ struct {
+ void **page;
+ int used;
+ } *page_list;
+};
+
+/*
+ * The MQ shared pointer pool is organized as a linked list of
+ * chunks. Each chunk contains a linked list of free shared pointers
+ * that can be allocated to a given user mode client.
+ *
+ */
+struct sp_chunk {
+ struct sp_chunk *next;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ u16 head;
+ u16 shared_ptr[0];
+};
+
+struct c2_pd_table {
+ u32 last;
+ u32 max;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct c2_qp_table {
+ struct idr idr;
+ spinlock_t lock;
+};
+
+struct c2_element {
+ struct c2_element *next;
+ void *ht_desc; /* host descriptor */
+ void __iomem *hw_desc; /* hardware descriptor */
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen;
+};
+
+struct c2_ring {
+ struct c2_element *to_clean;
+ struct c2_element *to_use;
+ struct c2_element *start;
+ unsigned long count;
+};
+
+struct c2_dev {
+ struct ib_device ibdev;
+ void __iomem *regs;
+ void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
+ void __iomem *mmio_rxp_ring;
+ spinlock_t lock;
+ struct pci_dev *pcidev;
+ struct net_device *netdev;
+ struct net_device *pseudo_netdev;
+ unsigned int cur_tx;
+ unsigned int cur_rx;
+ u32 adapter_handle;
+ int device_cap_flags;
+ void __iomem *kva; /* KVA device memory */
+ unsigned long pa; /* PA device memory */
+ void **qptr_array;
+
+ struct kmem_cache *host_msg_cache;
+
+ struct list_head cca_link; /* adapter list */
+ struct list_head eh_wakeup_list; /* event wakeup list */
+ wait_queue_head_t req_vq_wo;
+
+ /* Cached RNIC properties */
+ struct ib_device_attr props;
+
+ struct c2_pd_table pd_table;
+ struct c2_qp_table qp_table;
+ int ports; /* num of GigE ports */
+ int devnum;
+ spinlock_t vqlock; /* sync vbs req MQ */
+
+ /* Verbs Queues */
+ struct c2_mq req_vq; /* Verbs Request MQ */
+ struct c2_mq rep_vq; /* Verbs Reply MQ */
+ struct c2_mq aeq; /* Async Events MQ */
+
+ /* Kernel client MQs */
+ struct sp_chunk *kern_mqsp_pool;
+
+ /* Device updates these values when posting messages to a host
+ * target queue */
+ u16 req_vq_shared;
+ u16 rep_vq_shared;
+ u16 aeq_shared;
+ u16 irq_claimed;
+
+ /*
+ * Shared host target pages for user-accessible MQs.
+ */
+ int hthead; /* index of first free entry */
+ void *htpages; /* kernel vaddr */
+ int htlen; /* length of htpages memory */
+ void *htuva; /* user mapped vaddr */
+ spinlock_t htlock; /* serialize allocation */
+
+ u64 adapter_hint_uva; /* access to the activity FIFO */
+
+ // spinlock_t aeq_lock;
+ // spinlock_t rnic_lock;
+
+ __be16 *hint_count;
+ dma_addr_t hint_count_dma;
+ u16 hints_read;
+
+ int init; /* TRUE if it's ready */
+ char ae_cache_name[16];
+ char vq_cache_name[16];
+};
+
+struct c2_port {
+ u32 msg_enable;
+ struct c2_dev *c2dev;
+ struct net_device *netdev;
+
+ spinlock_t tx_lock;
+ u32 tx_avail;
+ struct c2_ring tx_ring;
+ struct c2_ring rx_ring;
+
+ void *mem; /* PCI memory for host rings */
+ dma_addr_t dma;
+ unsigned long mem_size;
+
+ u32 rx_buf_size;
+};
+
+/*
+ * Activity FIFO registers in BAR0.
+ */
+#define PCI_BAR0_HOST_HINT 0x100
+#define PCI_BAR0_ADAPTER_HINT 0x2000
+
+/*
+ * Ammasso PCI vendor id and Cepheus PCI device id.
+ */
+#define CQ_ARMED 0x01
+#define CQ_WAIT_FOR_DMA 0x80
+
+/*
+ * The format of a hint is as follows:
+ * Lower 16 bits are the count of hints for the queue.
+ * Next 15 bits are the qp_index
+ * Upper most bit depends on who reads it:
+ * If read by producer, then it means Full (1) or Not-Full (0)
+ * If read by consumer, then it means Empty (1) or Not-Empty (0)
+ */
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+
+/*
+ * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
+ * struct.
+ */
+#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
+
+#ifndef readq
+static inline u64 readq(const void __iomem * addr)
+{
+ u64 ret = readl(addr + 4);
+ ret <<= 32;
+ ret |= readl(addr);
+
+ return ret;
+}
+#endif
+
+#ifndef writeq
+static inline void __raw_writeq(u64 val, void __iomem * addr)
+{
+ __raw_writel((u32) (val), addr);
+ __raw_writel((u32) (val >> 32), (addr + 4));
+}
+#endif
+
+#define C2_SET_CUR_RX(c2dev, cur_rx) \
+ __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
+
+#define C2_GET_CUR_RX(c2dev) \
+ be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
+
+static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct c2_dev, ibdev);
+}
+
+static inline int c2_errno(void *reply)
+{
+ switch (c2_wr_get_result(reply)) {
+ case C2_OK:
+ return 0;
+ case CCERR_NO_BUFS:
+ case CCERR_INSUFFICIENT_RESOURCES:
+ case CCERR_ZERO_RDMA_READ_RESOURCES:
+ return -ENOMEM;
+ case CCERR_MR_IN_USE:
+ case CCERR_QP_IN_USE:
+ return -EBUSY;
+ case CCERR_ADDR_IN_USE:
+ return -EADDRINUSE;
+ case CCERR_ADDR_NOT_AVAIL:
+ return -EADDRNOTAVAIL;
+ case CCERR_CONN_RESET:
+ return -ECONNRESET;
+ case CCERR_NOT_IMPLEMENTED:
+ case CCERR_INVALID_WQE:
+ return -ENOSYS;
+ case CCERR_QP_NOT_PRIVILEGED:
+ return -EPERM;
+ case CCERR_STACK_ERROR:
+ return -EPROTO;
+ case CCERR_ACCESS_VIOLATION:
+ case CCERR_BASE_AND_BOUNDS_VIOLATION:
+ return -EFAULT;
+ case CCERR_STAG_STATE_NOT_INVALID:
+ case CCERR_INVALID_ADDRESS:
+ case CCERR_INVALID_CQ:
+ case CCERR_INVALID_EP:
+ case CCERR_INVALID_MODIFIER:
+ case CCERR_INVALID_MTU:
+ case CCERR_INVALID_PD_ID:
+ case CCERR_INVALID_QP:
+ case CCERR_INVALID_RNIC:
+ case CCERR_INVALID_STAG:
+ return -EINVAL;
+ default:
+ return -EAGAIN;
+ }
+}
+
+/* Device */
+extern int c2_register_device(struct c2_dev *c2dev);
+extern void c2_unregister_device(struct c2_dev *c2dev);
+extern int c2_rnic_init(struct c2_dev *c2dev);
+extern void c2_rnic_term(struct c2_dev *c2dev);
+extern void c2_rnic_interrupt(struct c2_dev *c2dev);
+extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+
+/* QPs */
+extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
+ struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
+extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
+extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
+extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+ struct ib_qp_attr *attr, int attr_mask);
+extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+ int ord, int ird);
+extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+ struct ib_send_wr **bad_wr);
+extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+ struct ib_recv_wr **bad_wr);
+extern void c2_init_qp_table(struct c2_dev *c2dev);
+extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
+extern void c2_set_qp_state(struct c2_qp *, int);
+extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
+
+/* PDs */
+extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
+extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
+extern int c2_init_pd_table(struct c2_dev *c2dev);
+extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
+
+/* CQs */
+extern int c2_init_cq(struct c2_dev *c2dev, int entries,
+ struct c2_ucontext *ctx, struct c2_cq *cq);
+extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
+extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
+extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
+extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+
+/* CM */
+extern int c2_llp_connect(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param);
+extern int c2_llp_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param);
+extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
+ u8 pdata_len);
+extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
+extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
+
+/* MM */
+extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ int page_size, int pbl_depth, u32 length,
+ u32 off, u64 *va, enum c2_acf acf,
+ struct c2_mr *mr);
+extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
+
+/* AE */
+extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
+
+/* MQSP Allocator */
+extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **root);
+extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
+extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+ dma_addr_t *dma_addr, gfp_t gfp_mask);
+extern void c2_free_mqsp(__be16* mqsp);
+#endif
diff --git a/drivers/staging/rdma/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c
new file mode 100644
index 000000000000..cedda25232be
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_ae.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_status.h"
+#include "c2_ae.h"
+
+static int c2_convert_cm_status(u32 c2_status)
+{
+ switch (c2_status) {
+ case C2_CONN_STATUS_SUCCESS:
+ return 0;
+ case C2_CONN_STATUS_REJECTED:
+ return -ENETRESET;
+ case C2_CONN_STATUS_REFUSED:
+ return -ECONNREFUSED;
+ case C2_CONN_STATUS_TIMEDOUT:
+ return -ETIMEDOUT;
+ case C2_CONN_STATUS_NETUNREACH:
+ return -ENETUNREACH;
+ case C2_CONN_STATUS_HOSTUNREACH:
+ return -EHOSTUNREACH;
+ case C2_CONN_STATUS_INVALID_RNIC:
+ return -EINVAL;
+ case C2_CONN_STATUS_INVALID_QP:
+ return -EINVAL;
+ case C2_CONN_STATUS_INVALID_QP_STATE:
+ return -EINVAL;
+ case C2_CONN_STATUS_ADDR_NOT_AVAIL:
+ return -EADDRNOTAVAIL;
+ default:
+ printk(KERN_ERR PFX
+ "%s - Unable to convert CM status: %d\n",
+ __func__, c2_status);
+ return -EIO;
+ }
+}
+
+static const char* to_event_str(int event)
+{
+ static const char* event_str[] = {
+ "CCAE_REMOTE_SHUTDOWN",
+ "CCAE_ACTIVE_CONNECT_RESULTS",
+ "CCAE_CONNECTION_REQUEST",
+ "CCAE_LLP_CLOSE_COMPLETE",
+ "CCAE_TERMINATE_MESSAGE_RECEIVED",
+ "CCAE_LLP_CONNECTION_RESET",
+ "CCAE_LLP_CONNECTION_LOST",
+ "CCAE_LLP_SEGMENT_SIZE_INVALID",
+ "CCAE_LLP_INVALID_CRC",
+ "CCAE_LLP_BAD_FPDU",
+ "CCAE_INVALID_DDP_VERSION",
+ "CCAE_INVALID_RDMA_VERSION",
+ "CCAE_UNEXPECTED_OPCODE",
+ "CCAE_INVALID_DDP_QUEUE_NUMBER",
+ "CCAE_RDMA_READ_NOT_ENABLED",
+ "CCAE_RDMA_WRITE_NOT_ENABLED",
+ "CCAE_RDMA_READ_TOO_SMALL",
+ "CCAE_NO_L_BIT",
+ "CCAE_TAGGED_INVALID_STAG",
+ "CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
+ "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
+ "CCAE_TAGGED_INVALID_PD",
+ "CCAE_WRAP_ERROR",
+ "CCAE_BAD_CLOSE",
+ "CCAE_BAD_LLP_CLOSE",
+ "CCAE_INVALID_MSN_RANGE",
+ "CCAE_INVALID_MSN_GAP",
+ "CCAE_IRRQ_OVERFLOW",
+ "CCAE_IRRQ_MSN_GAP",
+ "CCAE_IRRQ_MSN_RANGE",
+ "CCAE_IRRQ_INVALID_STAG",
+ "CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
+ "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
+ "CCAE_IRRQ_INVALID_PD",
+ "CCAE_IRRQ_WRAP_ERROR",
+ "CCAE_CQ_SQ_COMPLETION_OVERFLOW",
+ "CCAE_CQ_RQ_COMPLETION_ERROR",
+ "CCAE_QP_SRQ_WQE_ERROR",
+ "CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
+ "CCAE_CQ_OVERFLOW",
+ "CCAE_CQ_OPERATION_ERROR",
+ "CCAE_SRQ_LIMIT_REACHED",
+ "CCAE_QP_RQ_LIMIT_REACHED",
+ "CCAE_SRQ_CATASTROPHIC_ERROR",
+ "CCAE_RNIC_CATASTROPHIC_ERROR"
+ };
+
+ if (event < CCAE_REMOTE_SHUTDOWN ||
+ event > CCAE_RNIC_CATASTROPHIC_ERROR)
+ return "<invalid event>";
+
+ event -= CCAE_REMOTE_SHUTDOWN;
+ return event_str[event];
+}
+
+static const char *to_qp_state_str(int state)
+{
+ switch (state) {
+ case C2_QP_STATE_IDLE:
+ return "C2_QP_STATE_IDLE";
+ case C2_QP_STATE_CONNECTING:
+ return "C2_QP_STATE_CONNECTING";
+ case C2_QP_STATE_RTS:
+ return "C2_QP_STATE_RTS";
+ case C2_QP_STATE_CLOSING:
+ return "C2_QP_STATE_CLOSING";
+ case C2_QP_STATE_TERMINATE:
+ return "C2_QP_STATE_TERMINATE";
+ case C2_QP_STATE_ERROR:
+ return "C2_QP_STATE_ERROR";
+ default:
+ return "<invalid QP state>";
+ }
+}
+
+void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
+{
+ struct c2_mq *mq = c2dev->qptr_array[mq_index];
+ union c2wr *wr;
+ void *resource_user_context;
+ struct iw_cm_event cm_event;
+ struct ib_event ib_event;
+ enum c2_resource_indicator resource_indicator;
+ enum c2_event_id event_id;
+ unsigned long flags;
+ int status;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
+
+ /*
+ * retrieve the message
+ */
+ wr = c2_mq_consume(mq);
+ if (!wr)
+ return;
+
+ memset(&ib_event, 0, sizeof(ib_event));
+ memset(&cm_event, 0, sizeof(cm_event));
+
+ event_id = c2_wr_get_id(wr);
+ resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
+ resource_user_context =
+ (void *) (unsigned long) wr->ae.ae_generic.user_context;
+
+ status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
+
+ pr_debug("event received c2_dev=%p, event_id=%d, "
+ "resource_indicator=%d, user_context=%p, status = %d\n",
+ c2dev, event_id, resource_indicator, resource_user_context,
+ status);
+
+ switch (resource_indicator) {
+ case C2_RES_IND_QP:{
+
+ struct c2_qp *qp = (struct c2_qp *)resource_user_context;
+ struct iw_cm_id *cm_id = qp->cm_id;
+ struct c2wr_ae_active_connect_results *res;
+
+ if (!cm_id) {
+ pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
+ qp);
+ goto ignore_it;
+ }
+ pr_debug("%s: event = %s, user_context=%llx, "
+ "resource_type=%x, "
+ "resource=%x, qp_state=%s\n",
+ __func__,
+ to_event_str(event_id),
+ (unsigned long long) wr->ae.ae_generic.user_context,
+ be32_to_cpu(wr->ae.ae_generic.resource_type),
+ be32_to_cpu(wr->ae.ae_generic.resource),
+ to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
+
+ c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
+
+ switch (event_id) {
+ case CCAE_ACTIVE_CONNECT_RESULTS:
+ res = &wr->ae.ae_active_connect_results;
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ laddr->sin_addr.s_addr = res->laddr;
+ raddr->sin_addr.s_addr = res->raddr;
+ laddr->sin_port = res->lport;
+ raddr->sin_port = res->rport;
+ if (status == 0) {
+ cm_event.private_data_len =
+ be32_to_cpu(res->private_data_length);
+ cm_event.private_data = res->private_data;
+ } else {
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ cm_event.private_data_len = 0;
+ cm_event.private_data = NULL;
+ }
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ case CCAE_TERMINATE_MESSAGE_RECEIVED:
+ case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
+ ib_event.device = &c2dev->ibdev;
+ ib_event.element.qp = &qp->ibqp;
+ ib_event.event = IB_EVENT_QP_REQ_ERR;
+
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&ib_event,
+ qp->ibqp.
+ qp_context);
+ break;
+ case CCAE_BAD_CLOSE:
+ case CCAE_LLP_CLOSE_COMPLETE:
+ case CCAE_LLP_CONNECTION_RESET:
+ case CCAE_LLP_CONNECTION_LOST:
+ BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ cm_event.event = IW_CM_EVENT_CLOSE;
+ cm_event.status = 0;
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ default:
+ BUG_ON(1);
+ pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
+ "CM_ID=%p\n",
+ __func__, __LINE__,
+ event_id, qp, cm_id);
+ break;
+ }
+ break;
+ }
+
+ case C2_RES_IND_EP:{
+
+ struct c2wr_ae_connection_request *req =
+ &wr->ae.ae_connection_request;
+ struct iw_cm_id *cm_id =
+ (struct iw_cm_id *)resource_user_context;
+
+ pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
+ if (event_id != CCAE_CONNECTION_REQUEST) {
+ pr_debug("%s: Invalid event_id: %d\n",
+ __func__, event_id);
+ break;
+ }
+ cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+ cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
+ laddr->sin_addr.s_addr = req->laddr;
+ raddr->sin_addr.s_addr = req->raddr;
+ laddr->sin_port = req->lport;
+ raddr->sin_port = req->rport;
+ cm_event.private_data_len =
+ be32_to_cpu(req->private_data_length);
+ cm_event.private_data = req->private_data;
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send
+ * max supported values
+ */
+ cm_event.ird = cm_event.ord = 128;
+
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ }
+
+ case C2_RES_IND_CQ:{
+ struct c2_cq *cq =
+ (struct c2_cq *) resource_user_context;
+
+ pr_debug("IB_EVENT_CQ_ERR\n");
+ ib_event.device = &c2dev->ibdev;
+ ib_event.element.cq = &cq->ibcq;
+ ib_event.event = IB_EVENT_CQ_ERR;
+
+ if (cq->ibcq.event_handler)
+ cq->ibcq.event_handler(&ib_event,
+ cq->ibcq.cq_context);
+ break;
+ }
+
+ default:
+ printk("Bad resource indicator = %d\n",
+ resource_indicator);
+ break;
+ }
+
+ ignore_it:
+ c2_mq_free(mq);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h
new file mode 100644
index 000000000000..3a065c33b83b
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_ae.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_AE_H_
+#define _C2_AE_H_
+
+/*
+ * WARNING: If you change this file, also bump C2_IVN_BASE
+ * in common/include/clustercore/c2_ivn.h.
+ */
+
+/*
+ * Asynchronous Event Identifiers
+ *
+ * These start at 0x80 only so it's obvious from inspection that
+ * they are not work-request statuses. This isn't critical.
+ *
+ * NOTE: these event id's must fit in eight bits.
+ */
+enum c2_event_id {
+ CCAE_REMOTE_SHUTDOWN = 0x80,
+ CCAE_ACTIVE_CONNECT_RESULTS,
+ CCAE_CONNECTION_REQUEST,
+ CCAE_LLP_CLOSE_COMPLETE,
+ CCAE_TERMINATE_MESSAGE_RECEIVED,
+ CCAE_LLP_CONNECTION_RESET,
+ CCAE_LLP_CONNECTION_LOST,
+ CCAE_LLP_SEGMENT_SIZE_INVALID,
+ CCAE_LLP_INVALID_CRC,
+ CCAE_LLP_BAD_FPDU,
+ CCAE_INVALID_DDP_VERSION,
+ CCAE_INVALID_RDMA_VERSION,
+ CCAE_UNEXPECTED_OPCODE,
+ CCAE_INVALID_DDP_QUEUE_NUMBER,
+ CCAE_RDMA_READ_NOT_ENABLED,
+ CCAE_RDMA_WRITE_NOT_ENABLED,
+ CCAE_RDMA_READ_TOO_SMALL,
+ CCAE_NO_L_BIT,
+ CCAE_TAGGED_INVALID_STAG,
+ CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
+ CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
+ CCAE_TAGGED_INVALID_PD,
+ CCAE_WRAP_ERROR,
+ CCAE_BAD_CLOSE,
+ CCAE_BAD_LLP_CLOSE,
+ CCAE_INVALID_MSN_RANGE,
+ CCAE_INVALID_MSN_GAP,
+ CCAE_IRRQ_OVERFLOW,
+ CCAE_IRRQ_MSN_GAP,
+ CCAE_IRRQ_MSN_RANGE,
+ CCAE_IRRQ_INVALID_STAG,
+ CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
+ CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
+ CCAE_IRRQ_INVALID_PD,
+ CCAE_IRRQ_WRAP_ERROR,
+ CCAE_CQ_SQ_COMPLETION_OVERFLOW,
+ CCAE_CQ_RQ_COMPLETION_ERROR,
+ CCAE_QP_SRQ_WQE_ERROR,
+ CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
+ CCAE_CQ_OVERFLOW,
+ CCAE_CQ_OPERATION_ERROR,
+ CCAE_SRQ_LIMIT_REACHED,
+ CCAE_QP_RQ_LIMIT_REACHED,
+ CCAE_SRQ_CATASTROPHIC_ERROR,
+ CCAE_RNIC_CATASTROPHIC_ERROR
+/* WARNING If you add more id's, make sure their values fit in eight bits. */
+};
+
+/*
+ * Resource Indicators and Identifiers
+ */
+enum c2_resource_indicator {
+ C2_RES_IND_QP = 1,
+ C2_RES_IND_EP,
+ C2_RES_IND_CQ,
+ C2_RES_IND_SRQ,
+};
+
+#endif /* _C2_AE_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c
new file mode 100644
index 000000000000..78d247ec6961
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_alloc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/bitmap.h>
+
+#include "c2.h"
+
+static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **head)
+{
+ int i;
+ struct sp_chunk *new_head;
+ dma_addr_t dma_addr;
+
+ new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
+ &dma_addr, gfp_mask);
+ if (new_head == NULL)
+ return -ENOMEM;
+
+ new_head->dma_addr = dma_addr;
+ dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
+
+ new_head->next = NULL;
+ new_head->head = 0;
+
+ /* build list where each index is the next free slot */
+ for (i = 0;
+ i < (PAGE_SIZE - sizeof(struct sp_chunk) -
+ sizeof(u16)) / sizeof(u16) - 1;
+ i++) {
+ new_head->shared_ptr[i] = i + 1;
+ }
+ /* terminate list */
+ new_head->shared_ptr[i] = 0xFFFF;
+
+ *head = new_head;
+ return 0;
+}
+
+int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **root)
+{
+ return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
+}
+
+void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
+{
+ struct sp_chunk *next;
+
+ while (root) {
+ next = root->next;
+ dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
+ dma_unmap_addr(root, mapping));
+ root = next;
+ }
+}
+
+__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+ dma_addr_t *dma_addr, gfp_t gfp_mask)
+{
+ u16 mqsp;
+
+ while (head) {
+ mqsp = head->head;
+ if (mqsp != 0xFFFF) {
+ head->head = head->shared_ptr[mqsp];
+ break;
+ } else if (head->next == NULL) {
+ if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
+ 0) {
+ head = head->next;
+ mqsp = head->head;
+ head->head = head->shared_ptr[mqsp];
+ break;
+ } else
+ return NULL;
+ } else
+ head = head->next;
+ }
+ if (head) {
+ *dma_addr = head->dma_addr +
+ ((unsigned long) &(head->shared_ptr[mqsp]) -
+ (unsigned long) head);
+ pr_debug("%s addr %p dma_addr %llx\n", __func__,
+ &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
+ return (__force __be16 *) &(head->shared_ptr[mqsp]);
+ }
+ return NULL;
+}
+
+void c2_free_mqsp(__be16 *mqsp)
+{
+ struct sp_chunk *head;
+ u16 idx;
+
+ /* The chunk containing this ptr begins at the page boundary */
+ head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
+
+ /* Link head to new mqsp */
+ *mqsp = (__force __be16) head->head;
+
+ /* Compute the shared_ptr index */
+ idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
+ idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
+
+ /* Point this index at the head */
+ head->shared_ptr[idx] = head->head;
+
+ /* Point head at this index */
+ head->head = idx;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c
new file mode 100644
index 000000000000..23bfa94fbd4e
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_cm.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_vq.h"
+#include <rdma/iw_cm.h>
+
+int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct c2_dev *c2dev = to_c2dev(cm_id->device);
+ struct ib_qp *ibqp;
+ struct c2_qp *qp;
+ struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */
+ struct c2_vq_req *vq_req;
+ int err;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ if (cm_id->remote_addr.ss_family != AF_INET)
+ return -ENOSYS;
+
+ ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+ qp = to_c2qp(ibqp);
+
+ /* Associate QP <--> CM_ID */
+ cm_id->provider_data = qp;
+ cm_id->add_ref(cm_id);
+ qp->cm_id = cm_id;
+
+ /*
+ * only support the max private_data length
+ */
+ if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+ err = -EINVAL;
+ goto bail0;
+ }
+ /*
+ * Set the rdma read limits
+ */
+ err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+ if (err)
+ goto bail0;
+
+ /*
+ * Create and send a WR_QP_CONNECT...
+ */
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ c2_wr_set_id(wr, CCWR_QP_CONNECT);
+ wr->hdr.context = 0;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->qp_handle = qp->adapter_handle;
+
+ wr->remote_addr = raddr->sin_addr.s_addr;
+ wr->remote_port = raddr->sin_port;
+
+ /*
+ * Move any private data from the callers's buf into
+ * the WR.
+ */
+ if (iw_param->private_data) {
+ wr->private_data_length =
+ cpu_to_be32(iw_param->private_data_len);
+ memcpy(&wr->private_data[0], iw_param->private_data,
+ iw_param->private_data_len);
+ } else
+ wr->private_data_length = 0;
+
+ /*
+ * Send WR to adapter. NOTE: There is no synch reply from
+ * the adapter.
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ vq_req_free(c2dev, vq_req);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ if (err) {
+ /*
+ * If we fail, release reference on QP and
+ * disassociate QP from CM_ID
+ */
+ cm_id->provider_data = NULL;
+ qp->cm_id = NULL;
+ cm_id->rem_ref(cm_id);
+ }
+ return err;
+}
+
+int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+ struct c2_dev *c2dev;
+ struct c2wr_ep_listen_create_req wr;
+ struct c2wr_ep_listen_create_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+ if (cm_id->local_addr.ss_family != AF_INET)
+ return -ENOSYS;
+
+ c2dev = to_c2dev(cm_id->device);
+ if (c2dev == NULL)
+ return -EINVAL;
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
+ wr.hdr.context = (u64) (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.local_addr = laddr->sin_addr.s_addr;
+ wr.local_port = laddr->sin_port;
+ wr.backlog = cpu_to_be32(backlog);
+ wr.user_context = (u64) (unsigned long) cm_id;
+
+ /*
+ * Reference the request struct. Dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply =
+ (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ if ((err = c2_errno(reply)) != 0)
+ goto bail1;
+
+ /*
+ * Keep the adapter handle. Used in subsequent destroy
+ */
+ cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
+
+ /*
+ * free vq stuff
+ */
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ return 0;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+
+int c2_llp_service_destroy(struct iw_cm_id *cm_id)
+{
+
+ struct c2_dev *c2dev;
+ struct c2wr_ep_listen_destroy_req wr;
+ struct c2wr_ep_listen_destroy_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ c2dev = to_c2dev(cm_id->device);
+ if (c2dev == NULL)
+ return -EINVAL;
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ if ((err = c2_errno(reply)) != 0)
+ goto bail1;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct c2_dev *c2dev = to_c2dev(cm_id->device);
+ struct c2_qp *qp;
+ struct ib_qp *ibqp;
+ struct c2wr_cr_accept_req *wr; /* variable length WR */
+ struct c2_vq_req *vq_req;
+ struct c2wr_cr_accept_rep *reply; /* VQ Reply msg ptr. */
+ int err;
+
+ ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+ qp = to_c2qp(ibqp);
+
+ /* Set the RDMA read limits */
+ err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+ if (err)
+ goto bail0;
+
+ /* Allocate verbs request. */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ vq_req->qp = qp;
+ vq_req->cm_id = cm_id;
+ vq_req->event = IW_CM_EVENT_ESTABLISHED;
+
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ /* Build the WR */
+ c2_wr_set_id(wr, CCWR_CR_ACCEPT);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
+ wr->qp_handle = qp->adapter_handle;
+
+ /* Replace the cr_handle with the QP after accept */
+ cm_id->provider_data = qp;
+ cm_id->add_ref(cm_id);
+ qp->cm_id = cm_id;
+
+ cm_id->provider_data = qp;
+
+ /* Validate private_data length */
+ if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+ err = -EINVAL;
+ goto bail1;
+ }
+
+ if (iw_param->private_data) {
+ wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
+ memcpy(&wr->private_data[0],
+ iw_param->private_data, iw_param->private_data_len);
+ } else
+ wr->private_data_length = 0;
+
+ /* Reference the request struct. Dereferenced in the int handler. */
+ vq_req_get(c2dev, vq_req);
+
+ /* Send WR to adapter */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ /* Wait for reply from adapter */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ /* Check that reply is present */
+ reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ if (!err)
+ c2_set_qp_state(qp, C2_QP_STATE_RTS);
+ bail1:
+ kfree(wr);
+ vq_req_free(c2dev, vq_req);
+ bail0:
+ if (err) {
+ /*
+ * If we fail, release reference on QP and
+ * disassociate QP from CM_ID
+ */
+ cm_id->provider_data = NULL;
+ qp->cm_id = NULL;
+ cm_id->rem_ref(cm_id);
+ }
+ return err;
+}
+
+int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ struct c2_dev *c2dev;
+ struct c2wr_cr_reject_req wr;
+ struct c2_vq_req *vq_req;
+ struct c2wr_cr_reject_rep *reply;
+ int err;
+
+ c2dev = to_c2dev(cm_id->device);
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_CR_REJECT);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_cr_reject_rep *) (unsigned long)
+ vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ err = c2_errno(reply);
+ /*
+ * free vq stuff
+ */
+ vq_repbuf_free(c2dev, reply);
+
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
new file mode 100644
index 000000000000..1b63185b4ad4
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_cq.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
+
+static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
+{
+ struct c2_cq *cq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2dev->lock, flags);
+ cq = c2dev->qptr_array[cqn];
+ if (!cq) {
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+ return NULL;
+ }
+ atomic_inc(&cq->refcount);
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+ return cq;
+}
+
+static void c2_cq_put(struct c2_cq *cq)
+{
+ if (atomic_dec_and_test(&cq->refcount))
+ wake_up(&cq->wait);
+}
+
+void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
+{
+ struct c2_cq *cq;
+
+ cq = c2_cq_get(c2dev, mq_index);
+ if (!cq) {
+ printk("discarding events on destroyed CQN=%d\n", mq_index);
+ return;
+ }
+
+ (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+ c2_cq_put(cq);
+}
+
+void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
+{
+ struct c2_cq *cq;
+ struct c2_mq *q;
+
+ cq = c2_cq_get(c2dev, mq_index);
+ if (!cq)
+ return;
+
+ spin_lock_irq(&cq->lock);
+ q = &cq->mq;
+ if (q && !c2_mq_empty(q)) {
+ u16 priv = q->priv;
+ struct c2wr_ce *msg;
+
+ while (priv != be16_to_cpu(*q->shared)) {
+ msg = (struct c2wr_ce *)
+ (q->msg_pool.host + priv * q->msg_size);
+ if (msg->qp_user_context == (u64) (unsigned long) qp) {
+ msg->qp_user_context = (u64) 0;
+ }
+ priv = (priv + 1) % q->q_size;
+ }
+ }
+ spin_unlock_irq(&cq->lock);
+ c2_cq_put(cq);
+}
+
+static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
+{
+ switch (status) {
+ case C2_OK:
+ return IB_WC_SUCCESS;
+ case CCERR_FLUSHED:
+ return IB_WC_WR_FLUSH_ERR;
+ case CCERR_BASE_AND_BOUNDS_VIOLATION:
+ return IB_WC_LOC_PROT_ERR;
+ case CCERR_ACCESS_VIOLATION:
+ return IB_WC_LOC_ACCESS_ERR;
+ case CCERR_TOTAL_LENGTH_TOO_BIG:
+ return IB_WC_LOC_LEN_ERR;
+ case CCERR_INVALID_WINDOW:
+ return IB_WC_MW_BIND_ERR;
+ default:
+ return IB_WC_GENERAL_ERR;
+ }
+}
+
+
+static inline int c2_poll_one(struct c2_dev *c2dev,
+ struct c2_cq *cq, struct ib_wc *entry)
+{
+ struct c2wr_ce *ce;
+ struct c2_qp *qp;
+ int is_recv = 0;
+
+ ce = c2_mq_consume(&cq->mq);
+ if (!ce) {
+ return -EAGAIN;
+ }
+
+ /*
+ * if the qp returned is null then this qp has already
+ * been freed and we are unable process the completion.
+ * try pulling the next message
+ */
+ while ((qp =
+ (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
+ c2_mq_free(&cq->mq);
+ ce = c2_mq_consume(&cq->mq);
+ if (!ce)
+ return -EAGAIN;
+ }
+
+ entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
+ entry->wr_id = ce->hdr.context;
+ entry->qp = &qp->ibqp;
+ entry->wc_flags = 0;
+ entry->slid = 0;
+ entry->sl = 0;
+ entry->src_qp = 0;
+ entry->dlid_path_bits = 0;
+ entry->pkey_index = 0;
+
+ switch (c2_wr_get_id(ce)) {
+ case C2_WR_TYPE_SEND:
+ entry->opcode = IB_WC_SEND;
+ break;
+ case C2_WR_TYPE_RDMA_WRITE:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case C2_WR_TYPE_RDMA_READ:
+ entry->opcode = IB_WC_RDMA_READ;
+ break;
+ case C2_WR_TYPE_BIND_MW:
+ entry->opcode = IB_WC_BIND_MW;
+ break;
+ case C2_WR_TYPE_RECV:
+ entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
+ entry->opcode = IB_WC_RECV;
+ is_recv = 1;
+ break;
+ default:
+ break;
+ }
+
+ /* consume the WQEs */
+ if (is_recv)
+ c2_mq_lconsume(&qp->rq_mq, 1);
+ else
+ c2_mq_lconsume(&qp->sq_mq,
+ be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
+
+ /* free the message */
+ c2_mq_free(&cq->mq);
+
+ return 0;
+}
+
+int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct c2_dev *c2dev = to_c2dev(ibcq->device);
+ struct c2_cq *cq = to_c2cq(ibcq);
+ unsigned long flags;
+ int npolled, err;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+
+ err = c2_poll_one(c2dev, cq, entry + npolled);
+ if (err)
+ break;
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return npolled;
+}
+
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct c2_mq_shared __iomem *shared;
+ struct c2_cq *cq;
+ unsigned long flags;
+ int ret = 0;
+
+ cq = to_c2cq(ibcq);
+ shared = cq->mq.peer;
+
+ if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+ writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
+ else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+ writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
+ else
+ return -EINVAL;
+
+ writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
+
+ /*
+ * Now read back shared->armed to make the PCI
+ * write synchronous. This is necessary for
+ * correct cq notification semantics.
+ */
+ readb(&shared->armed);
+
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ spin_lock_irqsave(&cq->lock, flags);
+ ret = !c2_mq_empty(&cq->mq);
+ spin_unlock_irqrestore(&cq->lock, flags);
+ }
+
+ return ret;
+}
+
+static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
+{
+ dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
+ mq->msg_pool.host, dma_unmap_addr(mq, mapping));
+}
+
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+ size_t q_size, size_t msg_size)
+{
+ u8 *pool_start;
+
+ if (q_size > SIZE_MAX / msg_size)
+ return -EINVAL;
+
+ pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
+ &mq->host_dma, GFP_KERNEL);
+ if (!pool_start)
+ return -ENOMEM;
+
+ c2_mq_rep_init(mq,
+ 0, /* index (currently unknown) */
+ q_size,
+ msg_size,
+ pool_start,
+ NULL, /* peer (currently unknown) */
+ C2_MQ_HOST_TARGET);
+
+ dma_unmap_addr_set(mq, mapping, mq->host_dma);
+
+ return 0;
+}
+
+int c2_init_cq(struct c2_dev *c2dev, int entries,
+ struct c2_ucontext *ctx, struct c2_cq *cq)
+{
+ struct c2wr_cq_create_req wr;
+ struct c2wr_cq_create_rep *reply;
+ unsigned long peer_pa;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ might_sleep();
+
+ cq->ibcq.cqe = entries - 1;
+ cq->is_kernel = !ctx;
+
+ /* Allocate a shared pointer */
+ cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &cq->mq.shared_dma, GFP_KERNEL);
+ if (!cq->mq.shared)
+ return -ENOMEM;
+
+ /* Allocate pages for the message pool */
+ err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
+ if (err)
+ goto bail0;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_CQ_CREATE);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.msg_size = cpu_to_be32(cq->mq.msg_size);
+ wr.depth = cpu_to_be32(cq->mq.q_size);
+ wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
+ wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
+ wr.user_context = (u64) (unsigned long) (cq);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail2;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail2;
+
+ reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+
+ if ((err = c2_errno(reply)) != 0)
+ goto bail3;
+
+ cq->adapter_handle = reply->cq_handle;
+ cq->mq.index = be32_to_cpu(reply->mq_index);
+
+ peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
+ cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
+ if (!cq->mq.peer) {
+ err = -ENOMEM;
+ goto bail3;
+ }
+
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ spin_lock_init(&cq->lock);
+ atomic_set(&cq->refcount, 1);
+ init_waitqueue_head(&cq->wait);
+
+ /*
+ * Use the MQ index allocated by the adapter to
+ * store the CQ in the qptr_array
+ */
+ cq->cqn = cq->mq.index;
+ c2dev->qptr_array[cq->cqn] = cq;
+
+ return 0;
+
+ bail3:
+ vq_repbuf_free(c2dev, reply);
+ bail2:
+ vq_req_free(c2dev, vq_req);
+ bail1:
+ c2_free_cq_buf(c2dev, &cq->mq);
+ bail0:
+ c2_free_mqsp(cq->mq.shared);
+
+ return err;
+}
+
+void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
+{
+ int err;
+ struct c2_vq_req *vq_req;
+ struct c2wr_cq_destroy_req wr;
+ struct c2wr_cq_destroy_rep *reply;
+
+ might_sleep();
+
+ /* Clear CQ from the qptr array */
+ spin_lock_irq(&c2dev->lock);
+ c2dev->qptr_array[cq->mq.index] = NULL;
+ atomic_dec(&cq->refcount);
+ spin_unlock_irq(&c2dev->lock);
+
+ wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ goto bail0;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.cq_handle = cq->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+ if (reply)
+ vq_repbuf_free(c2dev, reply);
+ bail1:
+ vq_req_free(c2dev, vq_req);
+ bail0:
+ if (cq->is_kernel) {
+ c2_free_cq_buf(c2dev, &cq->mq);
+ }
+
+ return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c
new file mode 100644
index 000000000000..3a17d9b36dba
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_intr.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_vq.h"
+
+static void handle_mq(struct c2_dev *c2dev, u32 index);
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
+
+/*
+ * Handle RNIC interrupts
+ */
+void c2_rnic_interrupt(struct c2_dev *c2dev)
+{
+ unsigned int mq_index;
+
+ while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
+ mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
+ if (mq_index & 0x80000000) {
+ break;
+ }
+
+ c2dev->hints_read++;
+ handle_mq(c2dev, mq_index);
+ }
+
+}
+
+/*
+ * Top level MQ handler
+ */
+static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
+{
+ if (c2dev->qptr_array[mq_index] == NULL) {
+ pr_debug("handle_mq: stray activity for mq_index=%d\n",
+ mq_index);
+ return;
+ }
+
+ switch (mq_index) {
+ case (0):
+ /*
+ * An index of 0 in the activity queue
+ * indicates the req vq now has messages
+ * available...
+ *
+ * Wake up any waiters waiting on req VQ
+ * message availability.
+ */
+ wake_up(&c2dev->req_vq_wo);
+ break;
+ case (1):
+ handle_vq(c2dev, mq_index);
+ break;
+ case (2):
+ /* We have to purge the VQ in case there are pending
+ * accept reply requests that would result in the
+ * generation of an ESTABLISHED event. If we don't
+ * generate these first, a CLOSE event could end up
+ * being delivered before the ESTABLISHED event.
+ */
+ handle_vq(c2dev, 1);
+
+ c2_ae_event(c2dev, mq_index);
+ break;
+ default:
+ /* There is no event synchronization between CQ events
+ * and AE or CM events. In fact, CQE could be
+ * delivered for all of the I/O up to and including the
+ * FLUSH for a peer disconenct prior to the ESTABLISHED
+ * event being delivered to the app. The reason for this
+ * is that CM events are delivered on a thread, while AE
+ * and CM events are delivered on interrupt context.
+ */
+ c2_cq_event(c2dev, mq_index);
+ break;
+ }
+
+ return;
+}
+
+/*
+ * Handles verbs WR replies.
+ */
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
+{
+ void *adapter_msg, *reply_msg;
+ struct c2wr_hdr *host_msg;
+ struct c2wr_hdr tmp;
+ struct c2_mq *reply_vq;
+ struct c2_vq_req *req;
+ struct iw_cm_event cm_event;
+ int err;
+
+ reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
+
+ /*
+ * get next msg from mq_index into adapter_msg.
+ * don't free it yet.
+ */
+ adapter_msg = c2_mq_consume(reply_vq);
+ if (adapter_msg == NULL) {
+ return;
+ }
+
+ host_msg = vq_repbuf_alloc(c2dev);
+
+ /*
+ * If we can't get a host buffer, then we'll still
+ * wakeup the waiter, we just won't give him the msg.
+ * It is assumed the waiter will deal with this...
+ */
+ if (!host_msg) {
+ pr_debug("handle_vq: no repbufs!\n");
+
+ /*
+ * just copy the WR header into a local variable.
+ * this allows us to still demux on the context
+ */
+ host_msg = &tmp;
+ memcpy(host_msg, adapter_msg, sizeof(tmp));
+ reply_msg = NULL;
+ } else {
+ memcpy(host_msg, adapter_msg, reply_vq->msg_size);
+ reply_msg = host_msg;
+ }
+
+ /*
+ * consume the msg from the MQ
+ */
+ c2_mq_free(reply_vq);
+
+ /*
+ * wakeup the waiter.
+ */
+ req = (struct c2_vq_req *) (unsigned long) host_msg->context;
+ if (req == NULL) {
+ /*
+ * We should never get here, as the adapter should
+ * never send us a reply that we're not expecting.
+ */
+ if (reply_msg != NULL)
+ vq_repbuf_free(c2dev, host_msg);
+ pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
+ return;
+ }
+
+ if (reply_msg)
+ err = c2_errno(reply_msg);
+ else
+ err = -ENOMEM;
+
+ if (!err) switch (req->event) {
+ case IW_CM_EVENT_ESTABLISHED:
+ c2_set_qp_state(req->qp,
+ C2_QP_STATE_RTS);
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send
+ * max supported values
+ */
+ cm_event.ird = cm_event.ord = 128;
+ case IW_CM_EVENT_CLOSE:
+
+ /*
+ * Move the QP to RTS if this is
+ * the established event
+ */
+ cm_event.event = req->event;
+ cm_event.status = 0;
+ cm_event.local_addr = req->cm_id->local_addr;
+ cm_event.remote_addr = req->cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+ req->cm_id->event_handler(req->cm_id, &cm_event);
+ break;
+ default:
+ break;
+ }
+
+ req->reply_msg = (u64) (unsigned long) (reply_msg);
+ atomic_set(&req->reply_ready, 1);
+ wake_up(&req->wait_object);
+
+ /*
+ * If the request was cancelled, then this put will
+ * free the vq_req memory...and reply_msg!!!
+ */
+ vq_req_put(c2dev, req);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c
new file mode 100644
index 000000000000..119c4f3d9791
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mm.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+
+#define PBL_VIRT 1
+#define PBL_PHYS 2
+
+/*
+ * Send all the PBL messages to convey the remainder of the PBL
+ * Wait for the adapter's reply on the last one.
+ * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
+ *
+ * NOTE: vq_req is _not_ freed by this function. The VQ Host
+ * Reply buffer _is_ freed by this function.
+ */
+static int
+send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
+ unsigned long va, u32 pbl_depth,
+ struct c2_vq_req *vq_req, int pbl_type)
+{
+ u32 pbe_count; /* amt that fits in a PBL msg */
+ u32 count; /* amt in this PBL MSG. */
+ struct c2wr_nsmr_pbl_req *wr; /* PBL WR ptr */
+ struct c2wr_nsmr_pbl_rep *reply; /* reply ptr */
+ int err, pbl_virt, pbl_index, i;
+
+ switch (pbl_type) {
+ case PBL_VIRT:
+ pbl_virt = 1;
+ break;
+ case PBL_PHYS:
+ pbl_virt = 0;
+ break;
+ default:
+ return -EINVAL;
+ break;
+ }
+
+ pbe_count = (c2dev->req_vq.msg_size -
+ sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ return -ENOMEM;
+ }
+ c2_wr_set_id(wr, CCWR_NSMR_PBL);
+
+ /*
+ * Only the last PBL message will generate a reply from the verbs,
+ * so we set the context to 0 indicating there is no kernel verbs
+ * handler blocked awaiting this reply.
+ */
+ wr->hdr.context = 0;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->stag_index = stag_index; /* already swapped */
+ wr->flags = 0;
+ pbl_index = 0;
+ while (pbl_depth) {
+ count = min(pbe_count, pbl_depth);
+ wr->addrs_length = cpu_to_be32(count);
+
+ /*
+ * If this is the last message, then reference the
+ * vq request struct cuz we're gonna wait for a reply.
+ * also make this PBL msg as the last one.
+ */
+ if (count == pbl_depth) {
+ /*
+ * reference the request struct. dereferenced in the
+ * int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+ wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
+
+ /*
+ * This is the last PBL message.
+ * Set the context to our VQ Request Object so we can
+ * wait for the reply.
+ */
+ wr->hdr.context = (unsigned long) vq_req;
+ }
+
+ /*
+ * If pbl_virt is set then va is a virtual address
+ * that describes a virtually contiguous memory
+ * allocation. The wr needs the start of each virtual page
+ * to be converted to the corresponding physical address
+ * of the page. If pbl_virt is not set then va is an array
+ * of physical addresses and there is no conversion to do.
+ * Just fill in the wr with what is in the array.
+ */
+ for (i = 0; i < count; i++) {
+ if (pbl_virt) {
+ va += PAGE_SIZE;
+ } else {
+ wr->paddrs[i] =
+ cpu_to_be64(((u64 *)va)[pbl_index + i]);
+ }
+ }
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ if (count <= pbe_count) {
+ vq_req_put(c2dev, vq_req);
+ }
+ goto bail0;
+ }
+ pbl_depth -= count;
+ pbl_index += count;
+ }
+
+ /*
+ * Now wait for the reply...
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ kfree(wr);
+ return err;
+}
+
+#define C2_PBL_MAX_DEPTH 131072
+int
+c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ int page_size, int pbl_depth, u32 length,
+ u32 offset, u64 *va, enum c2_acf acf,
+ struct c2_mr *mr)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_nsmr_register_req *wr;
+ struct c2wr_nsmr_register_rep *reply;
+ u16 flags;
+ int i, pbe_count, count;
+ int err;
+
+ if (!va || !length || !addr_list || !pbl_depth)
+ return -EINTR;
+
+ /*
+ * Verify PBL depth is within rnic max
+ */
+ if (pbl_depth > C2_PBL_MAX_DEPTH) {
+ return -EINTR;
+ }
+
+ /*
+ * allocate verbs request object
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ /*
+ * build the WR
+ */
+ c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+
+ flags = (acf | MEM_VA_BASED | MEM_REMOTE);
+
+ /*
+ * compute how many pbes can fit in the message
+ */
+ pbe_count = (c2dev->req_vq.msg_size -
+ sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
+
+ if (pbl_depth <= pbe_count) {
+ flags |= MEM_PBL_COMPLETE;
+ }
+ wr->flags = cpu_to_be16(flags);
+ wr->stag_key = 0; //stag_key;
+ wr->va = cpu_to_be64(*va);
+ wr->pd_id = mr->pd->pd_id;
+ wr->pbe_size = cpu_to_be32(page_size);
+ wr->length = cpu_to_be32(length);
+ wr->pbl_depth = cpu_to_be32(pbl_depth);
+ wr->fbo = cpu_to_be32(offset);
+ count = min(pbl_depth, pbe_count);
+ wr->addrs_length = cpu_to_be32(count);
+
+ /*
+ * fill out the PBL for this message
+ */
+ for (i = 0; i < count; i++) {
+ wr->paddrs[i] = cpu_to_be64(addr_list[i]);
+ }
+
+ /*
+ * regerence the request struct
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * send the WR to the adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ /*
+ * wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail1;
+ }
+
+ /*
+ * process reply
+ */
+ reply =
+ (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+ if ((err = c2_errno(reply))) {
+ goto bail2;
+ }
+ //*p_pb_entries = be32_to_cpu(reply->pbl_depth);
+ mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
+ vq_repbuf_free(c2dev, reply);
+
+ /*
+ * if there are still more PBEs we need to send them to
+ * the adapter and wait for a reply on the final one.
+ * reuse vq_req for this purpose.
+ */
+ pbl_depth -= count;
+ if (pbl_depth) {
+
+ vq_req->reply_msg = (unsigned long) NULL;
+ atomic_set(&vq_req->reply_ready, 0);
+ err = send_pbl_messages(c2dev,
+ cpu_to_be32(mr->ibmr.lkey),
+ (unsigned long) &addr_list[i],
+ pbl_depth, vq_req, PBL_PHYS);
+ if (err) {
+ goto bail1;
+ }
+ }
+
+ vq_req_free(c2dev, vq_req);
+ kfree(wr);
+
+ return err;
+
+ bail2:
+ vq_repbuf_free(c2dev, reply);
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
+{
+ struct c2_vq_req *vq_req; /* verbs request object */
+ struct c2wr_stag_dealloc_req wr; /* work request */
+ struct c2wr_stag_dealloc_rep *reply; /* WR reply */
+ int err;
+
+
+ /*
+ * allocate verbs request object
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
+ wr.hdr.context = (u64) (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.stag_index = cpu_to_be32(stag_index);
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c
new file mode 100644
index 000000000000..0cddc49beae1
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mq.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include "c2_mq.h"
+
+void *c2_mq_alloc(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ if (c2_mq_full(q)) {
+ return NULL;
+ } else {
+#ifdef DEBUG
+ struct c2wr_hdr *m =
+ (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+ BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
+ m->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+ return m;
+#else
+ return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+ }
+}
+
+void c2_mq_produce(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ if (!c2_mq_full(q)) {
+ q->priv = (q->priv + 1) % q->q_size;
+ q->hint_count++;
+ /* Update peer's offset. */
+ __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+ }
+}
+
+void *c2_mq_consume(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+ if (c2_mq_empty(q)) {
+ return NULL;
+ } else {
+#ifdef DEBUG
+ struct c2wr_hdr *m = (struct c2wr_hdr *)
+ (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+ BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
+#endif
+ return m;
+#else
+ return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+ }
+}
+
+void c2_mq_free(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+ if (!c2_mq_empty(q)) {
+
+#ifdef CCMSGMAGIC
+ {
+ struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
+ (q->msg_pool.adapter + q->priv * q->msg_size);
+ __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
+ }
+#endif
+ q->priv = (q->priv + 1) % q->q_size;
+ /* Update peer's offset. */
+ __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+ }
+}
+
+
+void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ while (wqe_count--) {
+ BUG_ON(c2_mq_empty(q));
+ *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
+ }
+}
+
+#if 0
+u32 c2_mq_count(struct c2_mq *q)
+{
+ s32 count;
+
+ if (q->type == C2_MQ_HOST_TARGET)
+ count = be16_to_cpu(*q->shared) - q->priv;
+ else
+ count = q->priv - be16_to_cpu(*q->shared);
+
+ if (count < 0)
+ count += q->q_size;
+
+ return (u32) count;
+}
+#endif /* 0 */
+
+void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
+{
+ BUG_ON(!q->shared);
+
+ /* This code assumes the byte swapping has already been done! */
+ q->index = index;
+ q->q_size = q_size;
+ q->msg_size = msg_size;
+ q->msg_pool.adapter = pool_start;
+ q->peer = (struct c2_mq_shared __iomem *) peer;
+ q->magic = C2_MQ_MAGIC;
+ q->type = type;
+ q->priv = 0;
+ q->hint_count = 0;
+ return;
+}
+void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 *pool_start, u16 __iomem *peer, u32 type)
+{
+ BUG_ON(!q->shared);
+
+ /* This code assumes the byte swapping has already been done! */
+ q->index = index;
+ q->q_size = q_size;
+ q->msg_size = msg_size;
+ q->msg_pool.host = pool_start;
+ q->peer = (struct c2_mq_shared __iomem *) peer;
+ q->magic = C2_MQ_MAGIC;
+ q->type = type;
+ q->priv = 0;
+ q->hint_count = 0;
+ return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h
new file mode 100644
index 000000000000..fc1b9a7cec4b
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_mq.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _C2_MQ_H_
+#define _C2_MQ_H_
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "c2_wr.h"
+
+enum c2_shared_regs {
+
+ C2_SHARED_ARMED = 0x10,
+ C2_SHARED_NOTIFY = 0x18,
+ C2_SHARED_SHARED = 0x40,
+};
+
+struct c2_mq_shared {
+ u16 unused1;
+ u8 armed;
+ u8 notification_type;
+ u32 unused2;
+ u16 shared;
+ /* Pad to 64 bytes. */
+ u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
+};
+
+enum c2_mq_type {
+ C2_MQ_HOST_TARGET = 1,
+ C2_MQ_ADAPTER_TARGET = 2,
+};
+
+/*
+ * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ.
+ * c2_user_mq_t (which is the same format) is for user-mode MQs...
+ */
+#define C2_MQ_MAGIC 0x4d512020 /* 'MQ ' */
+struct c2_mq {
+ u32 magic;
+ union {
+ u8 *host;
+ u8 __iomem *adapter;
+ } msg_pool;
+ dma_addr_t host_dma;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ u16 hint_count;
+ u16 priv;
+ struct c2_mq_shared __iomem *peer;
+ __be16 *shared;
+ dma_addr_t shared_dma;
+ u32 q_size;
+ u32 msg_size;
+ u32 index;
+ enum c2_mq_type type;
+};
+
+static __inline__ int c2_mq_empty(struct c2_mq *q)
+{
+ return q->priv == be16_to_cpu(*q->shared);
+}
+
+static __inline__ int c2_mq_full(struct c2_mq *q)
+{
+ return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
+}
+
+extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
+extern void *c2_mq_alloc(struct c2_mq *q);
+extern void c2_mq_produce(struct c2_mq *q);
+extern void *c2_mq_consume(struct c2_mq *q);
+extern void c2_mq_free(struct c2_mq *q);
+extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
+extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 *pool_start, u16 __iomem *peer, u32 type);
+
+#endif /* _C2_MQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c
new file mode 100644
index 000000000000..f3e81dc357bb
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_pd.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "c2.h"
+#include "c2_provider.h"
+
+int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
+{
+ u32 obj;
+ int ret = 0;
+
+ spin_lock(&c2dev->pd_table.lock);
+ obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
+ c2dev->pd_table.last);
+ if (obj >= c2dev->pd_table.max)
+ obj = find_first_zero_bit(c2dev->pd_table.table,
+ c2dev->pd_table.max);
+ if (obj < c2dev->pd_table.max) {
+ pd->pd_id = obj;
+ __set_bit(obj, c2dev->pd_table.table);
+ c2dev->pd_table.last = obj+1;
+ if (c2dev->pd_table.last >= c2dev->pd_table.max)
+ c2dev->pd_table.last = 0;
+ } else
+ ret = -ENOMEM;
+ spin_unlock(&c2dev->pd_table.lock);
+ return ret;
+}
+
+void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
+{
+ spin_lock(&c2dev->pd_table.lock);
+ __clear_bit(pd->pd_id, c2dev->pd_table.table);
+ spin_unlock(&c2dev->pd_table.lock);
+}
+
+int c2_init_pd_table(struct c2_dev *c2dev)
+{
+
+ c2dev->pd_table.last = 0;
+ c2dev->pd_table.max = c2dev->props.max_pd;
+ spin_lock_init(&c2dev->pd_table.lock);
+ c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
+ sizeof(long), GFP_KERNEL);
+ if (!c2dev->pd_table.table)
+ return -ENOMEM;
+ bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
+ return 0;
+}
+
+void c2_cleanup_pd_table(struct c2_dev *c2dev)
+{
+ kfree(c2dev->pd_table.table);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
new file mode 100644
index 000000000000..25c3f0085563
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_provider.c
@@ -0,0 +1,912 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_arp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "c2.h"
+#include "c2_provider.h"
+#include "c2_user.h"
+
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ struct c2_dev *c2dev = to_c2dev(ibdev);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+
+ *props = c2dev->props;
+ return 0;
+}
+
+static int c2_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ props->max_mtu = IB_MTU_4096;
+ props->lid = 0;
+ props->lmc = 0;
+ props->sm_lid = 0;
+ props->sm_sl = 0;
+ props->state = IB_PORT_ACTIVE;
+ props->phys_state = 0;
+ props->port_cap_flags =
+ IB_PORT_CM_SUP |
+ IB_PORT_REINIT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->qkey_viol_cntr = 0;
+ props->active_width = 1;
+ props->active_speed = IB_SPEED_SDR;
+
+ return 0;
+}
+
+static int c2_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 * pkey)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ *pkey = 0;
+ return 0;
+}
+
+static int c2_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct c2_dev *c2dev = to_c2dev(ibdev);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
+
+ return 0;
+}
+
+/* Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct c2_ucontext *context;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ context = kmalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ return &context->ibucontext;
+}
+
+static int c2_dealloc_ucontext(struct ib_ucontext *context)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ kfree(context);
+ return 0;
+}
+
+static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct c2_pd *pd;
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context) {
+ if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
+ c2_pd_free(to_c2dev(ibdev), pd);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ return &pd->ibpd;
+}
+
+static int c2_dealloc_pd(struct ib_pd *pd)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return ERR_PTR(-ENOSYS);
+}
+
+static int c2_ah_destroy(struct ib_ah *ah)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static void c2_add_ref(struct ib_qp *ibqp)
+{
+ struct c2_qp *qp;
+ BUG_ON(!ibqp);
+ qp = to_c2qp(ibqp);
+ atomic_inc(&qp->refcount);
+}
+
+static void c2_rem_ref(struct ib_qp *ibqp)
+{
+ struct c2_qp *qp;
+ BUG_ON(!ibqp);
+ qp = to_c2qp(ibqp);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
+{
+ struct c2_dev* c2dev = to_c2dev(device);
+ struct c2_qp *qp;
+
+ qp = c2_find_qpn(c2dev, qpn);
+ pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
+ __func__, qp, qpn, device,
+ (qp?atomic_read(&qp->refcount):0));
+
+ return (qp?&qp->ibqp:NULL);
+}
+
+static struct ib_qp *c2_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct c2_qp *qp;
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp) {
+ pr_debug("%s: Unable to allocate QP\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+ spin_lock_init(&qp->lock);
+ if (pd->uobject) {
+ /* userspace specific */
+ }
+
+ err = c2_alloc_qp(to_c2dev(pd->device),
+ to_c2pd(pd), init_attr, qp);
+
+ if (err && pd->uobject) {
+ /* userspace specific */
+ }
+
+ break;
+ default:
+ pr_debug("%s: Invalid QP type: %d\n", __func__,
+ init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ return &qp->ibqp;
+}
+
+static int c2_destroy_qp(struct ib_qp *ib_qp)
+{
+ struct c2_qp *qp = to_c2qp(ib_qp);
+
+ pr_debug("%s:%u qp=%p,qp->state=%d\n",
+ __func__, __LINE__, ib_qp, qp->state);
+ c2_free_qp(to_c2dev(ib_qp->device), qp);
+ kfree(qp);
+ return 0;
+}
+
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ int entries = attr->cqe;
+ struct c2_cq *cq;
+ int err;
+
+ if (attr->flags)
+ return ERR_PTR(-EINVAL);
+
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ pr_debug("%s: Unable to allocate CQ\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
+ if (err) {
+ pr_debug("%s: error initializing CQ\n", __func__);
+ kfree(cq);
+ return ERR_PTR(err);
+ }
+
+ return &cq->ibcq;
+}
+
+static int c2_destroy_cq(struct ib_cq *ib_cq)
+{
+ struct c2_cq *cq = to_c2cq(ib_cq);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ c2_free_cq(to_c2dev(ib_cq->device), cq);
+ kfree(cq);
+
+ return 0;
+}
+
+static inline u32 c2_convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
+ C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
+}
+
+static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 * iova_start)
+{
+ struct c2_mr *mr;
+ u64 *page_list;
+ u32 total_len;
+ int err, i, j, k, page_shift, pbl_depth;
+
+ pbl_depth = 0;
+ total_len = 0;
+
+ page_shift = PAGE_SHIFT;
+ /*
+ * If there is only 1 buffer we assume this could
+ * be a map of all phy mem...use a 32k page_shift.
+ */
+ if (num_phys_buf == 1)
+ page_shift += 3;
+
+ for (i = 0; i < num_phys_buf; i++) {
+
+ if (buffer_list[i].addr & ~PAGE_MASK) {
+ pr_debug("Unaligned Memory Buffer: 0x%x\n",
+ (unsigned int) buffer_list[i].addr);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!buffer_list[i].size) {
+ pr_debug("Invalid Buffer Size\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ total_len += buffer_list[i].size;
+ pbl_depth += ALIGN(buffer_list[i].size,
+ (1 << page_shift)) >> page_shift;
+ }
+
+ page_list = vmalloc(sizeof(u64) * pbl_depth);
+ if (!page_list) {
+ pr_debug("couldn't vmalloc page_list of size %zd\n",
+ (sizeof(u64) * pbl_depth));
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0, j = 0; i < num_phys_buf; i++) {
+
+ int naddrs;
+
+ naddrs = ALIGN(buffer_list[i].size,
+ (1 << page_shift)) >> page_shift;
+ for (k = 0; k < naddrs; k++)
+ page_list[j++] = (buffer_list[i].addr +
+ (k << page_shift));
+ }
+
+ mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ vfree(page_list);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mr->pd = to_c2pd(ib_pd);
+ mr->umem = NULL;
+ pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
+ "*iova_start %llx, first pa %llx, last pa %llx\n",
+ __func__, page_shift, pbl_depth, total_len,
+ (unsigned long long) *iova_start,
+ (unsigned long long) page_list[0],
+ (unsigned long long) page_list[pbl_depth-1]);
+ err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+ (1 << page_shift), pbl_depth,
+ total_len, 0, iova_start,
+ c2_convert_access(acc), mr);
+ vfree(page_list);
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ib_phys_buf bl;
+ u64 kva = 0;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* AMSO1100 limit */
+ bl.size = 0xffffffff;
+ bl.addr = 0;
+ return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ u64 *pages;
+ u64 kva = 0;
+ int shift, n, len;
+ int i, k, entry;
+ int err = 0;
+ struct scatterlist *sg;
+ struct c2_pd *c2pd = to_c2pd(pd);
+ struct c2_mr *c2mr;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
+ if (!c2mr)
+ return ERR_PTR(-ENOMEM);
+ c2mr->pd = c2pd;
+
+ c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ if (IS_ERR(c2mr->umem)) {
+ err = PTR_ERR(c2mr->umem);
+ kfree(c2mr);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(c2mr->umem->page_size) - 1;
+ n = c2mr->umem->nmap;
+
+ pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ i = 0;
+ for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] =
+ sg_dma_address(sg) +
+ (c2mr->umem->page_size * k);
+ }
+ }
+
+ kva = virt;
+ err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
+ pages,
+ c2mr->umem->page_size,
+ i,
+ length,
+ ib_umem_offset(c2mr->umem),
+ &kva,
+ c2_convert_access(acc),
+ c2mr);
+ kfree(pages);
+ if (err)
+ goto err;
+ return &c2mr->ibmr;
+
+err:
+ ib_umem_release(c2mr->umem);
+ kfree(c2mr);
+ return ERR_PTR(err);
+}
+
+static int c2_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct c2_mr *mr = to_c2mr(ib_mr);
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
+ if (err)
+ pr_debug("c2_stag_dealloc failed: %d\n", err);
+ else {
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+ }
+
+ return err;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%x\n", c2dev->props.hw_ver);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%x.%x.%x\n",
+ (int) (c2dev->props.fw_ver >> 32),
+ (int) (c2dev->props.fw_ver >> 16) & 0xffff,
+ (int) (c2dev->props.fw_ver & 0xffff));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "AMSO1100\n");
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c2_dev_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ int err;
+
+ err =
+ c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
+ attr_mask);
+
+ return err;
+}
+
+static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad,
+ size_t in_mad_size,
+ struct ib_mad_hdr *out_mad,
+ size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* Request a connection */
+ return c2_llp_connect(cm_id, iw_param);
+}
+
+static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* Accept the new connection */
+ return c2_llp_accept(cm_id, iw_param);
+}
+
+static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_llp_reject(cm_id, pdata, pdata_len);
+ return err;
+}
+
+static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ err = c2_llp_service_create(cm_id, backlog);
+ pr_debug("%s:%u err=%d\n",
+ __func__, __LINE__,
+ err);
+ return err;
+}
+
+static int c2_service_destroy(struct iw_cm_id *cm_id)
+{
+ int err;
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_llp_service_destroy(cm_id);
+
+ return err;
+}
+
+static int c2_pseudo_up(struct net_device *netdev)
+{
+ struct in_device *ind;
+ struct c2_dev *c2dev = netdev->ml_priv;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ pr_debug("adding...\n");
+ for_ifa(ind) {
+#ifdef DEBUG
+ u8 *ip = (u8 *) & ifa->ifa_address;
+
+ pr_debug("%s: %d.%d.%d.%d\n",
+ ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+ c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+
+ return 0;
+}
+
+static int c2_pseudo_down(struct net_device *netdev)
+{
+ struct in_device *ind;
+ struct c2_dev *c2dev = netdev->ml_priv;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ pr_debug("deleting...\n");
+ for_ifa(ind) {
+#ifdef DEBUG
+ u8 *ip = (u8 *) & ifa->ifa_address;
+
+ pr_debug("%s: %d.%d.%d.%d\n",
+ ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+ c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+
+ return 0;
+}
+
+static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+
+ /* TODO: Tell rnic about new rmda interface mtu */
+ return 0;
+}
+
+static const struct net_device_ops c2_pseudo_netdev_ops = {
+ .ndo_open = c2_pseudo_up,
+ .ndo_stop = c2_pseudo_down,
+ .ndo_start_xmit = c2_pseudo_xmit_frame,
+ .ndo_change_mtu = c2_pseudo_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static void setup(struct net_device *netdev)
+{
+ netdev->netdev_ops = &c2_pseudo_netdev_ops;
+
+ netdev->watchdog_timeo = 0;
+ netdev->type = ARPHRD_ETHER;
+ netdev->mtu = 1500;
+ netdev->hard_header_len = ETH_HLEN;
+ netdev->addr_len = ETH_ALEN;
+ netdev->tx_queue_len = 0;
+ netdev->flags |= IFF_NOARP;
+}
+
+static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
+{
+ char name[IFNAMSIZ];
+ struct net_device *netdev;
+
+ /* change ethxxx to iwxxx */
+ strcpy(name, "iw");
+ strcat(name, &c2dev->netdev->name[3]);
+ netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
+ if (!netdev) {
+ printk(KERN_ERR PFX "%s - etherdev alloc failed",
+ __func__);
+ return NULL;
+ }
+
+ netdev->ml_priv = c2dev;
+
+ SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+ memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
+
+ /* Print out the MAC address */
+ pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
+
+#if 0
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+#endif
+ return netdev;
+}
+
+static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ int err;
+
+ err = c2_query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+ return 0;
+}
+
+int c2_register_device(struct c2_dev *dev)
+{
+ int ret = -ENOMEM;
+ int i;
+
+ /* Register pseudo network device */
+ dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
+ if (!dev->pseudo_netdev)
+ goto out;
+
+ ret = register_netdev(dev->pseudo_netdev);
+ if (ret)
+ goto out_free_netdev;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
+ dev->ibdev.owner = THIS_MODULE;
+ dev->ibdev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV);
+
+ dev->ibdev.node_type = RDMA_NODE_RNIC;
+ memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+ memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
+ dev->ibdev.phys_port_cnt = 1;
+ dev->ibdev.num_comp_vectors = 1;
+ dev->ibdev.dma_device = &dev->pcidev->dev;
+ dev->ibdev.query_device = c2_query_device;
+ dev->ibdev.query_port = c2_query_port;
+ dev->ibdev.query_pkey = c2_query_pkey;
+ dev->ibdev.query_gid = c2_query_gid;
+ dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
+ dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
+ dev->ibdev.mmap = c2_mmap_uar;
+ dev->ibdev.alloc_pd = c2_alloc_pd;
+ dev->ibdev.dealloc_pd = c2_dealloc_pd;
+ dev->ibdev.create_ah = c2_ah_create;
+ dev->ibdev.destroy_ah = c2_ah_destroy;
+ dev->ibdev.create_qp = c2_create_qp;
+ dev->ibdev.modify_qp = c2_modify_qp;
+ dev->ibdev.destroy_qp = c2_destroy_qp;
+ dev->ibdev.create_cq = c2_create_cq;
+ dev->ibdev.destroy_cq = c2_destroy_cq;
+ dev->ibdev.poll_cq = c2_poll_cq;
+ dev->ibdev.get_dma_mr = c2_get_dma_mr;
+ dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
+ dev->ibdev.reg_user_mr = c2_reg_user_mr;
+ dev->ibdev.dereg_mr = c2_dereg_mr;
+ dev->ibdev.get_port_immutable = c2_port_immutable;
+
+ dev->ibdev.alloc_fmr = NULL;
+ dev->ibdev.unmap_fmr = NULL;
+ dev->ibdev.dealloc_fmr = NULL;
+ dev->ibdev.map_phys_fmr = NULL;
+
+ dev->ibdev.attach_mcast = c2_multicast_attach;
+ dev->ibdev.detach_mcast = c2_multicast_detach;
+ dev->ibdev.process_mad = c2_process_mad;
+
+ dev->ibdev.req_notify_cq = c2_arm_cq;
+ dev->ibdev.post_send = c2_post_send;
+ dev->ibdev.post_recv = c2_post_receive;
+
+ dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
+ if (dev->ibdev.iwcm == NULL) {
+ ret = -ENOMEM;
+ goto out_unregister_netdev;
+ }
+ dev->ibdev.iwcm->add_ref = c2_add_ref;
+ dev->ibdev.iwcm->rem_ref = c2_rem_ref;
+ dev->ibdev.iwcm->get_qp = c2_get_qp;
+ dev->ibdev.iwcm->connect = c2_connect;
+ dev->ibdev.iwcm->accept = c2_accept;
+ dev->ibdev.iwcm->reject = c2_reject;
+ dev->ibdev.iwcm->create_listen = c2_service_create;
+ dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
+
+ ret = ib_register_device(&dev->ibdev, NULL);
+ if (ret)
+ goto out_free_iwcm;
+
+ for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
+ ret = device_create_file(&dev->ibdev.dev,
+ c2_dev_attributes[i]);
+ if (ret)
+ goto out_unregister_ibdev;
+ }
+ goto out;
+
+out_unregister_ibdev:
+ ib_unregister_device(&dev->ibdev);
+out_free_iwcm:
+ kfree(dev->ibdev.iwcm);
+out_unregister_netdev:
+ unregister_netdev(dev->pseudo_netdev);
+out_free_netdev:
+ free_netdev(dev->pseudo_netdev);
+out:
+ pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
+ return ret;
+}
+
+void c2_unregister_device(struct c2_dev *dev)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ unregister_netdev(dev->pseudo_netdev);
+ free_netdev(dev->pseudo_netdev);
+ ib_unregister_device(&dev->ibdev);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h
new file mode 100644
index 000000000000..bf189987711f
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_provider.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_PROVIDER_H
+#define C2_PROVIDER_H
+#include <linux/inetdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#include "c2_mq.h"
+#include <rdma/iw_cm.h>
+
+#define C2_MPT_FLAG_ATOMIC (1 << 14)
+#define C2_MPT_FLAG_REMOTE_WRITE (1 << 13)
+#define C2_MPT_FLAG_REMOTE_READ (1 << 12)
+#define C2_MPT_FLAG_LOCAL_WRITE (1 << 11)
+#define C2_MPT_FLAG_LOCAL_READ (1 << 10)
+
+struct c2_buf_list {
+ void *buf;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+
+/* The user context keeps track of objects allocated for a
+ * particular user-mode client. */
+struct c2_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+struct c2_mtt;
+
+/* All objects associated with a PD are kept in the
+ * associated user context if present.
+ */
+struct c2_pd {
+ struct ib_pd ibpd;
+ u32 pd_id;
+};
+
+struct c2_mr {
+ struct ib_mr ibmr;
+ struct c2_pd *pd;
+ struct ib_umem *umem;
+};
+
+struct c2_av;
+
+enum c2_ah_type {
+ C2_AH_ON_HCA,
+ C2_AH_PCI_POOL,
+ C2_AH_KMALLOC
+};
+
+struct c2_ah {
+ struct ib_ah ibah;
+};
+
+struct c2_cq {
+ struct ib_cq ibcq;
+ spinlock_t lock;
+ atomic_t refcount;
+ int cqn;
+ int is_kernel;
+ wait_queue_head_t wait;
+
+ u32 adapter_handle;
+ struct c2_mq mq;
+};
+
+struct c2_wq {
+ spinlock_t lock;
+};
+struct iw_cm_id;
+struct c2_qp {
+ struct ib_qp ibqp;
+ struct iw_cm_id *cm_id;
+ spinlock_t lock;
+ atomic_t refcount;
+ wait_queue_head_t wait;
+ int qpn;
+
+ u32 adapter_handle;
+ u32 send_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u8 state;
+
+ struct c2_mq sq_mq;
+ struct c2_mq rq_mq;
+};
+
+struct c2_cr_query_attrs {
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+};
+
+static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct c2_pd, ibpd);
+}
+
+static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct c2_ucontext, ibucontext);
+}
+
+static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct c2_mr, ibmr);
+}
+
+
+static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct c2_ah, ibah);
+}
+
+static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct c2_cq, ibcq);
+}
+
+static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct c2_qp, ibqp);
+}
+
+static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
+{
+ struct in_device *ind;
+ int ret = 0;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ for_ifa(ind) {
+ if (ifa->ifa_address == addr) {
+ ret = 1;
+ break;
+ }
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+ return ret;
+}
+#endif /* C2_PROVIDER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c
new file mode 100644
index 000000000000..86708dee58b1
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_qp.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_MAX_ORD_PER_QP 128
+#define C2_MAX_IRD_PER_QP 128
+
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+#define NO_SUPPORT -1
+static const u8 c2_opcode[] = {
+ [IB_WR_SEND] = C2_WR_TYPE_SEND,
+ [IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
+ [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
+ [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
+};
+
+static int to_c2_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET:
+ return C2_QP_STATE_IDLE;
+ case IB_QPS_RTS:
+ return C2_QP_STATE_RTS;
+ case IB_QPS_SQD:
+ return C2_QP_STATE_CLOSING;
+ case IB_QPS_SQE:
+ return C2_QP_STATE_CLOSING;
+ case IB_QPS_ERR:
+ return C2_QP_STATE_ERROR;
+ default:
+ return -1;
+ }
+}
+
+static int to_ib_state(enum c2_qp_state c2_state)
+{
+ switch (c2_state) {
+ case C2_QP_STATE_IDLE:
+ return IB_QPS_RESET;
+ case C2_QP_STATE_CONNECTING:
+ return IB_QPS_RTR;
+ case C2_QP_STATE_RTS:
+ return IB_QPS_RTS;
+ case C2_QP_STATE_CLOSING:
+ return IB_QPS_SQD;
+ case C2_QP_STATE_ERROR:
+ return IB_QPS_ERR;
+ case C2_QP_STATE_TERMINATE:
+ return IB_QPS_SQE;
+ default:
+ return -1;
+ }
+}
+
+static const char *to_ib_state_str(int ib_state)
+{
+ static const char *state_str[] = {
+ "IB_QPS_RESET",
+ "IB_QPS_INIT",
+ "IB_QPS_RTR",
+ "IB_QPS_RTS",
+ "IB_QPS_SQD",
+ "IB_QPS_SQE",
+ "IB_QPS_ERR"
+ };
+ if (ib_state < IB_QPS_RESET ||
+ ib_state > IB_QPS_ERR)
+ return "<invalid IB QP state>";
+
+ ib_state -= IB_QPS_RESET;
+ return state_str[ib_state];
+}
+
+void c2_set_qp_state(struct c2_qp *qp, int c2_state)
+{
+ int new_state = to_ib_state(c2_state);
+
+ pr_debug("%s: qp[%p] state modify %s --> %s\n",
+ __func__,
+ qp,
+ to_ib_state_str(qp->state),
+ to_ib_state_str(new_state));
+ qp->state = new_state;
+}
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+ struct ib_qp_attr *attr, int attr_mask)
+{
+ struct c2wr_qp_modify_req wr;
+ struct c2wr_qp_modify_rep *reply;
+ struct c2_vq_req *vq_req;
+ unsigned long flags;
+ u8 next_state;
+ int err;
+
+ pr_debug("%s:%d qp=%p, %s --> %s\n",
+ __func__, __LINE__,
+ qp,
+ to_ib_state_str(qp->state),
+ to_ib_state_str(attr->qp_state));
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+ wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+ if (attr_mask & IB_QP_STATE) {
+ /* Ensure the state is valid */
+ if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
+ err = -EINVAL;
+ goto bail0;
+ }
+
+ wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
+
+ if (attr->qp_state == IB_QPS_ERR) {
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id && qp->state == IB_QPS_RTS) {
+ pr_debug("Generating CLOSE event for QP-->ERR, "
+ "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+ /* Generate an CLOSE event */
+ vq_req->cm_id = qp->cm_id;
+ vq_req->event = IW_CM_EVENT_CLOSE;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ }
+ next_state = attr->qp_state;
+
+ } else if (attr_mask & IB_QP_CUR_STATE) {
+
+ if (attr->cur_qp_state != IB_QPS_RTR &&
+ attr->cur_qp_state != IB_QPS_RTS &&
+ attr->cur_qp_state != IB_QPS_SQD &&
+ attr->cur_qp_state != IB_QPS_SQE) {
+ err = -EINVAL;
+ goto bail0;
+ } else
+ wr.next_qp_state =
+ cpu_to_be32(to_c2_state(attr->cur_qp_state));
+
+ next_state = attr->cur_qp_state;
+
+ } else {
+ err = 0;
+ goto bail0;
+ }
+
+ /* reference the request struct */
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+ if (!err)
+ qp->state = next_state;
+#ifdef DEBUG
+ else
+ pr_debug("%s: c2_errno=%d\n", __func__, err);
+#endif
+ /*
+ * If we're going to error and generating the event here, then
+ * we need to remove the reference because there will be no
+ * close event generated by the adapter
+ */
+ spin_lock_irqsave(&qp->lock, flags);
+ if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+
+ pr_debug("%s:%d qp=%p, cur_state=%s\n",
+ __func__, __LINE__,
+ qp,
+ to_ib_state_str(qp->state));
+ return err;
+}
+
+int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+ int ord, int ird)
+{
+ struct c2wr_qp_modify_req wr;
+ struct c2wr_qp_modify_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+ wr.ord = cpu_to_be32(ord);
+ wr.ird = cpu_to_be32(ird);
+ wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+ /* reference the request struct */
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ reply = (struct c2wr_qp_modify_rep *) (unsigned long)
+ vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_qp_destroy_req wr;
+ struct c2wr_qp_destroy_rep *reply;
+ unsigned long flags;
+ int err;
+
+ /*
+ * Allocate a verb request message
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Initialize the WR
+ */
+ c2_wr_set_id(&wr, CCWR_QP_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id && qp->state == IB_QPS_RTS) {
+ pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
+ "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+ /* Generate an CLOSE event */
+ vq_req->qp = qp;
+ vq_req->cm_id = qp->cm_id;
+ vq_req->event = IW_CM_EVENT_CLOSE;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&c2dev->qp_table.lock);
+
+ ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ qp->qpn = ret;
+
+ spin_unlock_irq(&c2dev->qp_table.lock);
+ idr_preload_end();
+ return ret < 0 ? ret : 0;
+}
+
+static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
+{
+ spin_lock_irq(&c2dev->qp_table.lock);
+ idr_remove(&c2dev->qp_table.idr, qpn);
+ spin_unlock_irq(&c2dev->qp_table.lock);
+}
+
+struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
+{
+ unsigned long flags;
+ struct c2_qp *qp;
+
+ spin_lock_irqsave(&c2dev->qp_table.lock, flags);
+ qp = idr_find(&c2dev->qp_table.idr, qpn);
+ spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
+ return qp;
+}
+
+int c2_alloc_qp(struct c2_dev *c2dev,
+ struct c2_pd *pd,
+ struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
+{
+ struct c2wr_qp_create_req wr;
+ struct c2wr_qp_create_rep *reply;
+ struct c2_vq_req *vq_req;
+ struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
+ struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
+ unsigned long peer_pa;
+ u32 q_size, msg_size, mmap_size;
+ void __iomem *mmap;
+ int err;
+
+ err = c2_alloc_qpn(c2dev, qp);
+ if (err)
+ return err;
+ qp->ibqp.qp_num = qp->qpn;
+ qp->ibqp.qp_type = IB_QPT_RC;
+
+ /* Allocate the SQ and RQ shared pointers */
+ qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &qp->sq_mq.shared_dma, GFP_KERNEL);
+ if (!qp->sq_mq.shared) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &qp->rq_mq.shared_dma, GFP_KERNEL);
+ if (!qp->rq_mq.shared) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ /* Allocate the verbs request */
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+
+ /* Initialize the work request */
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_QP_CREATE);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.sq_cq_handle = send_cq->adapter_handle;
+ wr.rq_cq_handle = recv_cq->adapter_handle;
+ wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
+ wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
+ wr.srq_handle = 0;
+ wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
+ QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
+ wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+ wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
+ wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+ wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
+ wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
+ wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
+ wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
+ wr.pd_id = pd->pd_id;
+ wr.user_context = (unsigned long) qp;
+
+ vq_req_get(c2dev, vq_req);
+
+ /* Send the WR to the adapter */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail3;
+ }
+
+ /* Wait for the verb reply */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail3;
+ }
+
+ /* Process the reply */
+ reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail3;
+ }
+
+ if ((err = c2_wr_get_result(reply)) != 0) {
+ goto bail4;
+ }
+
+ /* Fill in the kernel QP struct */
+ atomic_set(&qp->refcount, 1);
+ qp->adapter_handle = reply->qp_handle;
+ qp->state = IB_QPS_RESET;
+ qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
+ qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
+ qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
+ init_waitqueue_head(&qp->wait);
+
+ /* Initialize the SQ MQ */
+ q_size = be32_to_cpu(reply->sq_depth);
+ msg_size = be32_to_cpu(reply->sq_msg_size);
+ peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
+ mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+ mmap = ioremap_nocache(peer_pa, mmap_size);
+ if (!mmap) {
+ err = -ENOMEM;
+ goto bail5;
+ }
+
+ c2_mq_req_init(&qp->sq_mq,
+ be32_to_cpu(reply->sq_mq_index),
+ q_size,
+ msg_size,
+ mmap + sizeof(struct c2_mq_shared), /* pool start */
+ mmap, /* peer */
+ C2_MQ_ADAPTER_TARGET);
+
+ /* Initialize the RQ mq */
+ q_size = be32_to_cpu(reply->rq_depth);
+ msg_size = be32_to_cpu(reply->rq_msg_size);
+ peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
+ mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+ mmap = ioremap_nocache(peer_pa, mmap_size);
+ if (!mmap) {
+ err = -ENOMEM;
+ goto bail6;
+ }
+
+ c2_mq_req_init(&qp->rq_mq,
+ be32_to_cpu(reply->rq_mq_index),
+ q_size,
+ msg_size,
+ mmap + sizeof(struct c2_mq_shared), /* pool start */
+ mmap, /* peer */
+ C2_MQ_ADAPTER_TARGET);
+
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ return 0;
+
+ bail6:
+ iounmap(qp->sq_mq.peer);
+ bail5:
+ destroy_qp(c2dev, qp);
+ bail4:
+ vq_repbuf_free(c2dev, reply);
+ bail3:
+ vq_req_free(c2dev, vq_req);
+ bail2:
+ c2_free_mqsp(qp->rq_mq.shared);
+ bail1:
+ c2_free_mqsp(qp->sq_mq.shared);
+ bail0:
+ c2_free_qpn(c2dev, qp->qpn);
+ return err;
+}
+
+static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_lock_irq(&send_cq->lock);
+ else if (send_cq > recv_cq) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_unlock_irq(&send_cq->lock);
+ else if (send_cq > recv_cq) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
+
+void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ struct c2_cq *send_cq;
+ struct c2_cq *recv_cq;
+
+ send_cq = to_c2cq(qp->ibqp.send_cq);
+ recv_cq = to_c2cq(qp->ibqp.recv_cq);
+
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ c2_lock_cqs(send_cq, recv_cq);
+ c2_free_qpn(c2dev, qp->qpn);
+ c2_unlock_cqs(send_cq, recv_cq);
+
+ /*
+ * Destroy qp in the rnic...
+ */
+ destroy_qp(c2dev, qp);
+
+ /*
+ * Mark any unreaped CQEs as null and void.
+ */
+ c2_cq_clean(c2dev, qp, send_cq->cqn);
+ if (send_cq != recv_cq)
+ c2_cq_clean(c2dev, qp, recv_cq->cqn);
+ /*
+ * Unmap the MQs and return the shared pointers
+ * to the message pool.
+ */
+ iounmap(qp->sq_mq.peer);
+ iounmap(qp->rq_mq.peer);
+ c2_free_mqsp(qp->sq_mq.shared);
+ c2_free_mqsp(qp->rq_mq.shared);
+
+ atomic_dec(&qp->refcount);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Function: move_sgl
+ *
+ * Description:
+ * Move an SGL from the user's work request struct into a CCIL Work Request
+ * message, swapping to WR byte order and ensure the total length doesn't
+ * overflow.
+ *
+ * IN:
+ * dst - ptr to CCIL Work Request message SGL memory.
+ * src - ptr to the consumers SGL memory.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int
+move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
+ u8 * actual_count)
+{
+ u32 tot = 0; /* running total */
+ u8 acount = 0; /* running total non-0 len sge's */
+
+ while (count > 0) {
+ /*
+ * If the addition of this SGE causes the
+ * total SGL length to exceed 2^32-1, then
+ * fail-n-bail.
+ *
+ * If the current total plus the next element length
+ * wraps, then it will go negative and be less than the
+ * current total...
+ */
+ if ((tot + src->length) < tot) {
+ return -EINVAL;
+ }
+ /*
+ * Bug: 1456 (as well as 1498 & 1643)
+ * Skip over any sge's supplied with len=0
+ */
+ if (src->length) {
+ tot += src->length;
+ dst->stag = cpu_to_be32(src->lkey);
+ dst->to = cpu_to_be64(src->addr);
+ dst->length = cpu_to_be32(src->length);
+ dst++;
+ acount++;
+ }
+ src++;
+ count--;
+ }
+
+ if (acount == 0) {
+ /*
+ * Bug: 1476 (as well as 1498, 1456 and 1643)
+ * Setup the SGL in the WR to make it easier for the RNIC.
+ * This way, the FW doesn't have to deal with special cases.
+ * Setting length=0 should be sufficient.
+ */
+ dst->stag = 0;
+ dst->to = 0;
+ dst->length = 0;
+ }
+
+ *p_len = tot;
+ *actual_count = acount;
+ return 0;
+}
+
+/*
+ * Function: c2_activity (private function)
+ *
+ * Description:
+ * Post an mq index to the host->adapter activity fifo.
+ *
+ * IN:
+ * c2dev - ptr to c2dev structure
+ * mq_index - mq index to post
+ * shared - value most recently written to shared
+ *
+ * OUT:
+ *
+ * Return:
+ * none
+ */
+static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
+{
+ /*
+ * First read the register to see if the FIFO is full, and if so,
+ * spin until it's not. This isn't perfect -- there is no
+ * synchronization among the clients of the register, but in
+ * practice it prevents multiple CPU from hammering the bus
+ * with PCI RETRY. Note that when this does happen, the card
+ * cannot get on the bus and the card and system hang in a
+ * deadlock -- thus the need for this code. [TOT]
+ */
+ while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
+ udelay(10);
+
+ __raw_writel(C2_HINT_MAKE(mq_index, shared),
+ c2dev->regs + PCI_BAR0_ADAPTER_HINT);
+}
+
+/*
+ * Function: qp_wr_post
+ *
+ * Description:
+ * This in-line function allocates a MQ msg, then moves the host-copy of
+ * the completed WR into msg. Then it posts the message.
+ *
+ * IN:
+ * q - ptr to user MQ.
+ * wr - ptr to host-copy of the WR.
+ * qp - ptr to user qp
+ * size - Number of bytes to post. Assumed to be divisible by 4.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
+{
+ union c2wr *msg;
+
+ msg = c2_mq_alloc(q);
+ if (msg == NULL) {
+ return -EINVAL;
+ }
+#ifdef CCMSGMAGIC
+ ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+
+ /*
+ * Since all header fields in the WR are the same as the
+ * CQE, set the following so the adapter need not.
+ */
+ c2_wr_set_result(wr, CCERR_PENDING);
+
+ /*
+ * Copy the wr down to the adapter
+ */
+ memcpy((void *) msg, (void *) wr, size);
+
+ c2_mq_produce(q);
+ return 0;
+}
+
+
+int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct c2_dev *c2dev = to_c2dev(ibqp->device);
+ struct c2_qp *qp = to_c2qp(ibqp);
+ union c2wr wr;
+ unsigned long lock_flags;
+ int err = 0;
+
+ u32 flags;
+ u32 tot_len;
+ u8 actual_sge_count;
+ u32 msg_size;
+
+ if (qp->state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ while (ib_wr) {
+
+ flags = 0;
+ wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+ if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+ flags |= SQ_SIGNALED;
+ }
+
+ switch (ib_wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_INV:
+ if (ib_wr->opcode == IB_WR_SEND) {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
+ else
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
+ wr.sqwr.send.remote_stag = 0;
+ } else {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
+ else
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
+ wr.sqwr.send.remote_stag =
+ cpu_to_be32(ib_wr->ex.invalidate_rkey);
+ }
+
+ msg_size = sizeof(struct c2wr_send_req) +
+ sizeof(struct c2_data_addr) * ib_wr->num_sge;
+ if (ib_wr->num_sge > qp->send_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+ if (ib_wr->send_flags & IB_SEND_FENCE) {
+ flags |= SQ_READ_FENCE;
+ }
+ err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge,
+ &tot_len, &actual_sge_count);
+ wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+ break;
+ case IB_WR_RDMA_WRITE:
+ c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
+ msg_size = sizeof(struct c2wr_rdma_write_req) +
+ (sizeof(struct c2_data_addr) * ib_wr->num_sge);
+ if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+ if (ib_wr->send_flags & IB_SEND_FENCE) {
+ flags |= SQ_READ_FENCE;
+ }
+ wr.sqwr.rdma_write.remote_stag =
+ cpu_to_be32(ib_wr->wr.rdma.rkey);
+ wr.sqwr.rdma_write.remote_to =
+ cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+ err = move_sgl((struct c2_data_addr *)
+ & (wr.sqwr.rdma_write.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge,
+ &tot_len, &actual_sge_count);
+ wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+ break;
+ case IB_WR_RDMA_READ:
+ c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
+ msg_size = sizeof(struct c2wr_rdma_read_req);
+
+ /* IWarp only suppots 1 sge for RDMA reads */
+ if (ib_wr->num_sge > 1) {
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * Move the local and remote stag/to/len into the WR.
+ */
+ wr.sqwr.rdma_read.local_stag =
+ cpu_to_be32(ib_wr->sg_list->lkey);
+ wr.sqwr.rdma_read.local_to =
+ cpu_to_be64(ib_wr->sg_list->addr);
+ wr.sqwr.rdma_read.remote_stag =
+ cpu_to_be32(ib_wr->wr.rdma.rkey);
+ wr.sqwr.rdma_read.remote_to =
+ cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+ wr.sqwr.rdma_read.length =
+ cpu_to_be32(ib_wr->sg_list->length);
+ break;
+ default:
+ /* error */
+ msg_size = 0;
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * If we had an error on the last wr build, then
+ * break out. Possible errors include bogus WR
+ * type, and a bogus SGL length...
+ */
+ if (err) {
+ break;
+ }
+
+ /*
+ * Store flags
+ */
+ c2_wr_set_flags(&wr, flags);
+
+ /*
+ * Post the puppy!
+ */
+ spin_lock_irqsave(&qp->lock, lock_flags);
+ err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
+ if (err) {
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+ break;
+ }
+
+ /*
+ * Enqueue mq index to activity FIFO.
+ */
+ c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+ ib_wr = ib_wr->next;
+ }
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct c2_dev *c2dev = to_c2dev(ibqp->device);
+ struct c2_qp *qp = to_c2qp(ibqp);
+ union c2wr wr;
+ unsigned long lock_flags;
+ int err = 0;
+
+ if (qp->state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Try and post each work request
+ */
+ while (ib_wr) {
+ u32 tot_len;
+ u8 actual_sge_count;
+
+ if (ib_wr->num_sge > qp->recv_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * Create local host-copy of the WR
+ */
+ wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+ c2_wr_set_id(&wr, CCWR_RECV);
+ c2_wr_set_flags(&wr, 0);
+
+ /* sge_count is limited to eight bits. */
+ BUG_ON(ib_wr->num_sge >= 256);
+ err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge, &tot_len, &actual_sge_count);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+
+ /*
+ * If we had an error on the last wr build, then
+ * break out. Possible errors include bogus WR
+ * type, and a bogus SGL length...
+ */
+ if (err) {
+ break;
+ }
+
+ spin_lock_irqsave(&qp->lock, lock_flags);
+ err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
+ if (err) {
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+ break;
+ }
+
+ /*
+ * Enqueue mq index to activity FIFO
+ */
+ c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+ ib_wr = ib_wr->next;
+ }
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+void c2_init_qp_table(struct c2_dev *c2dev)
+{
+ spin_lock_init(&c2dev->qp_table.lock);
+ idr_init(&c2dev->qp_table.idr);
+}
+
+void c2_cleanup_qp_table(struct c2_dev *c2dev)
+{
+ idr_destroy(&c2dev->qp_table.idr);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c
new file mode 100644
index 000000000000..d2a6d961344b
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_rnic.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/route.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_vq.h"
+
+/* Device capabilities */
+#define C2_MIN_PAGESIZE 1024
+
+#define C2_MAX_MRS 32768
+#define C2_MAX_QPS 16000
+#define C2_MAX_WQE_SZ 256
+#define C2_MAX_QP_WR ((128*1024)/C2_MAX_WQE_SZ)
+#define C2_MAX_SGES 4
+#define C2_MAX_SGE_RD 1
+#define C2_MAX_CQS 32768
+#define C2_MAX_CQES 4096
+#define C2_MAX_PDS 16384
+
+/*
+ * Send the adapter INIT message to the amso1100
+ */
+static int c2_adapter_init(struct c2_dev *c2dev)
+{
+ struct c2wr_init_req wr;
+ int err;
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_INIT);
+ wr.hdr.context = 0;
+ wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
+ wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
+ wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
+ wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
+ wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
+ wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
+
+ /* Post the init message */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+
+ return err;
+}
+
+/*
+ * Send the adapter TERM message to the amso1100
+ */
+static void c2_adapter_term(struct c2_dev *c2dev)
+{
+ struct c2wr_init_req wr;
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_TERM);
+ wr.hdr.context = 0;
+
+ /* Post the init message */
+ vq_send_wr(c2dev, (union c2wr *) & wr);
+ c2dev->init = 0;
+
+ return;
+}
+
+/*
+ * Query the adapter
+ */
+static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_query_req wr;
+ struct c2wr_rnic_query_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply)
+ err = -ENOMEM;
+ else
+ err = c2_errno(reply);
+ if (err)
+ goto bail2;
+
+ props->fw_ver =
+ ((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
+ ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
+ (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
+ memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
+ props->max_mr_size = 0xFFFFFFFF;
+ props->page_size_cap = ~(C2_MIN_PAGESIZE-1);
+ props->vendor_id = be32_to_cpu(reply->vendor_id);
+ props->vendor_part_id = be32_to_cpu(reply->part_number);
+ props->hw_ver = be32_to_cpu(reply->hw_version);
+ props->max_qp = be32_to_cpu(reply->max_qps);
+ props->max_qp_wr = be32_to_cpu(reply->max_qp_depth);
+ props->device_cap_flags = c2dev->device_cap_flags;
+ props->max_sge = C2_MAX_SGES;
+ props->max_sge_rd = C2_MAX_SGE_RD;
+ props->max_cq = be32_to_cpu(reply->max_cqs);
+ props->max_cqe = be32_to_cpu(reply->max_cq_depth);
+ props->max_mr = be32_to_cpu(reply->max_mrs);
+ props->max_pd = be32_to_cpu(reply->max_pds);
+ props->max_qp_rd_atom = be32_to_cpu(reply->max_qp_ird);
+ props->max_ee_rd_atom = 0;
+ props->max_res_rd_atom = be32_to_cpu(reply->max_global_ird);
+ props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
+ props->max_ee_init_rd_atom = 0;
+ props->atomic_cap = IB_ATOMIC_NONE;
+ props->max_ee = 0;
+ props->max_rdd = 0;
+ props->max_mw = be32_to_cpu(reply->max_mws);
+ props->max_raw_ipv6_qp = 0;
+ props->max_raw_ethy_qp = 0;
+ props->max_mcast_grp = 0;
+ props->max_mcast_qp_attach = 0;
+ props->max_total_mcast_qp_attach = 0;
+ props->max_ah = 0;
+ props->max_fmr = 0;
+ props->max_map_per_fmr = 0;
+ props->max_srq = 0;
+ props->max_srq_wr = 0;
+ props->max_srq_sge = 0;
+ props->max_pkeys = 0;
+ props->local_ca_ack_delay = 0;
+
+ bail2:
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Add an IP address to the RNIC interface
+ */
+int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_setconfig_req *wr;
+ struct c2wr_rnic_setconfig_rep *reply;
+ struct c2_netaddr netaddr;
+ int err, len;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ len = sizeof(struct c2_netaddr);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+
+ netaddr.ip_addr = inaddr;
+ netaddr.netmask = inmask;
+ netaddr.mtu = 0;
+
+ memcpy(wr->data, &netaddr, len);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Delete an IP address from the RNIC interface
+ */
+int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_setconfig_req *wr;
+ struct c2wr_rnic_setconfig_rep *reply;
+ struct c2_netaddr netaddr;
+ int err, len;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ len = sizeof(struct c2_netaddr);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
+
+ netaddr.ip_addr = inaddr;
+ netaddr.netmask = inmask;
+ netaddr.mtu = 0;
+
+ memcpy(wr->data, &netaddr, len);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Open a single RNIC instance to use with all
+ * low level openib calls
+ */
+static int c2_rnic_open(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *vq_req;
+ union c2wr wr;
+ struct c2wr_rnic_open_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ return -ENOMEM;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
+ wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
+ wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
+ wr.rnic_open.req.port_num = cpu_to_be16(0);
+ wr.rnic_open.req.user_context = (unsigned long) c2dev;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ if ((err = c2_errno(reply)) != 0) {
+ goto bail1;
+ }
+
+ c2dev->adapter_handle = reply->rnic_handle;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Close the RNIC instance
+ */
+static int c2_rnic_close(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *vq_req;
+ union c2wr wr;
+ struct c2wr_rnic_close_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ return -ENOMEM;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
+ wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
+ wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ if ((err = c2_errno(reply)) != 0) {
+ goto bail1;
+ }
+
+ c2dev->adapter_handle = 0;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Called by c2_probe to initialize the RNIC. This principally
+ * involves initializing the various limits and resource pools that
+ * comprise the RNIC instance.
+ */
+int c2_rnic_init(struct c2_dev *c2dev)
+{
+ int err;
+ u32 qsize, msgsize;
+ void *q1_pages;
+ void *q2_pages;
+ void __iomem *mmio_regs;
+
+ /* Device capabilities */
+ c2dev->device_cap_flags =
+ (IB_DEVICE_RESIZE_MAX_WR |
+ IB_DEVICE_CURR_QP_STATE_MOD |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_LOCAL_DMA_LKEY |
+ IB_DEVICE_MEM_WINDOW);
+
+ /* Allocate the qptr_array */
+ c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
+ if (!c2dev->qptr_array) {
+ return -ENOMEM;
+ }
+
+ /* Initialize the qptr_array */
+ c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
+ c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
+ c2dev->qptr_array[2] = (void *) &c2dev->aeq;
+
+ /* Initialize data structures */
+ init_waitqueue_head(&c2dev->req_vq_wo);
+ spin_lock_init(&c2dev->vqlock);
+ spin_lock_init(&c2dev->lock);
+
+ /* Allocate MQ shared pointer pool for kernel clients. User
+ * mode client pools are hung off the user context
+ */
+ err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
+ if (err) {
+ goto bail0;
+ }
+
+ /* Allocate shared pointers for Q0, Q1, and Q2 from
+ * the shared pointer pool.
+ */
+
+ c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->hint_count_dma,
+ GFP_KERNEL);
+ c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->req_vq.shared_dma,
+ GFP_KERNEL);
+ c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->rep_vq.shared_dma,
+ GFP_KERNEL);
+ c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->aeq.shared_dma, GFP_KERNEL);
+ if (!c2dev->hint_count || !c2dev->req_vq.shared ||
+ !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ mmio_regs = c2dev->kva;
+ /* Initialize the Verbs Request Queue */
+ c2_mq_req_init(&c2dev->req_vq, 0,
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
+ C2_MQ_ADAPTER_TARGET);
+
+ /* Initialize the Verbs Reply Queue */
+ qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
+ msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
+ q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+ &c2dev->rep_vq.host_dma, GFP_KERNEL);
+ if (!q1_pages) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+ dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
+ pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
+ (unsigned long long) c2dev->rep_vq.host_dma);
+ c2_mq_rep_init(&c2dev->rep_vq,
+ 1,
+ qsize,
+ msgsize,
+ q1_pages,
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
+ C2_MQ_HOST_TARGET);
+
+ /* Initialize the Asynchronus Event Queue */
+ qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
+ msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
+ q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+ &c2dev->aeq.host_dma, GFP_KERNEL);
+ if (!q2_pages) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+ dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
+ pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
+ (unsigned long long) c2dev->aeq.host_dma);
+ c2_mq_rep_init(&c2dev->aeq,
+ 2,
+ qsize,
+ msgsize,
+ q2_pages,
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
+ C2_MQ_HOST_TARGET);
+
+ /* Initialize the verbs request allocator */
+ err = vq_init(c2dev);
+ if (err)
+ goto bail3;
+
+ /* Enable interrupts on the adapter */
+ writel(0, c2dev->regs + C2_IDIS);
+
+ /* create the WR init message */
+ err = c2_adapter_init(c2dev);
+ if (err)
+ goto bail4;
+ c2dev->init++;
+
+ /* open an adapter instance */
+ err = c2_rnic_open(c2dev);
+ if (err)
+ goto bail4;
+
+ /* Initialize cached the adapter limits */
+ err = c2_rnic_query(c2dev, &c2dev->props);
+ if (err)
+ goto bail5;
+
+ /* Initialize the PD pool */
+ err = c2_init_pd_table(c2dev);
+ if (err)
+ goto bail5;
+
+ /* Initialize the QP pool */
+ c2_init_qp_table(c2dev);
+ return 0;
+
+ bail5:
+ c2_rnic_close(c2dev);
+ bail4:
+ vq_term(c2dev);
+ bail3:
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->aeq.q_size * c2dev->aeq.msg_size,
+ q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
+ bail2:
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+ q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
+ bail1:
+ c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+ bail0:
+ vfree(c2dev->qptr_array);
+
+ return err;
+}
+
+/*
+ * Called by c2_remove to cleanup the RNIC resources.
+ */
+void c2_rnic_term(struct c2_dev *c2dev)
+{
+
+ /* Close the open adapter instance */
+ c2_rnic_close(c2dev);
+
+ /* Send the TERM message to the adapter */
+ c2_adapter_term(c2dev);
+
+ /* Disable interrupts on the adapter */
+ writel(1, c2dev->regs + C2_IDIS);
+
+ /* Free the QP pool */
+ c2_cleanup_qp_table(c2dev);
+
+ /* Free the PD pool */
+ c2_cleanup_pd_table(c2dev);
+
+ /* Free the verbs request allocator */
+ vq_term(c2dev);
+
+ /* Free the asynchronus event queue */
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->aeq.q_size * c2dev->aeq.msg_size,
+ c2dev->aeq.msg_pool.host,
+ dma_unmap_addr(&c2dev->aeq, mapping));
+
+ /* Free the verbs reply queue */
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+ c2dev->rep_vq.msg_pool.host,
+ dma_unmap_addr(&c2dev->rep_vq, mapping));
+
+ /* Free the MQ shared pointer pool */
+ c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+
+ /* Free the qptr_array */
+ vfree(c2dev->qptr_array);
+
+ return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h
new file mode 100644
index 000000000000..6ee4aa92d875
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_status.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_STATUS_H_
+#define _C2_STATUS_H_
+
+/*
+ * Verbs Status Codes
+ */
+enum c2_status {
+ C2_OK = 0, /* This must be zero */
+ CCERR_INSUFFICIENT_RESOURCES = 1,
+ CCERR_INVALID_MODIFIER = 2,
+ CCERR_INVALID_MODE = 3,
+ CCERR_IN_USE = 4,
+ CCERR_INVALID_RNIC = 5,
+ CCERR_INTERRUPTED_OPERATION = 6,
+ CCERR_INVALID_EH = 7,
+ CCERR_INVALID_CQ = 8,
+ CCERR_CQ_EMPTY = 9,
+ CCERR_NOT_IMPLEMENTED = 10,
+ CCERR_CQ_DEPTH_TOO_SMALL = 11,
+ CCERR_PD_IN_USE = 12,
+ CCERR_INVALID_PD = 13,
+ CCERR_INVALID_SRQ = 14,
+ CCERR_INVALID_ADDRESS = 15,
+ CCERR_INVALID_NETMASK = 16,
+ CCERR_INVALID_QP = 17,
+ CCERR_INVALID_QP_STATE = 18,
+ CCERR_TOO_MANY_WRS_POSTED = 19,
+ CCERR_INVALID_WR_TYPE = 20,
+ CCERR_INVALID_SGL_LENGTH = 21,
+ CCERR_INVALID_SQ_DEPTH = 22,
+ CCERR_INVALID_RQ_DEPTH = 23,
+ CCERR_INVALID_ORD = 24,
+ CCERR_INVALID_IRD = 25,
+ CCERR_QP_ATTR_CANNOT_CHANGE = 26,
+ CCERR_INVALID_STAG = 27,
+ CCERR_QP_IN_USE = 28,
+ CCERR_OUTSTANDING_WRS = 29,
+ CCERR_STAG_IN_USE = 30,
+ CCERR_INVALID_STAG_INDEX = 31,
+ CCERR_INVALID_SGL_FORMAT = 32,
+ CCERR_ADAPTER_TIMEOUT = 33,
+ CCERR_INVALID_CQ_DEPTH = 34,
+ CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
+ CCERR_INVALID_EP = 36,
+ CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
+ CCERR_FLUSHED = 38,
+ CCERR_INVALID_WQE = 39,
+ CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
+ CCERR_REMOTE_TERMINATION_ERROR = 41,
+ CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
+ CCERR_ACCESS_VIOLATION = 43,
+ CCERR_INVALID_PD_ID = 44,
+ CCERR_WRAP_ERROR = 45,
+ CCERR_INV_STAG_ACCESS_ERROR = 46,
+ CCERR_ZERO_RDMA_READ_RESOURCES = 47,
+ CCERR_QP_NOT_PRIVILEGED = 48,
+ CCERR_STAG_STATE_NOT_INVALID = 49,
+ CCERR_INVALID_PAGE_SIZE = 50,
+ CCERR_INVALID_BUFFER_SIZE = 51,
+ CCERR_INVALID_PBE = 52,
+ CCERR_INVALID_FBO = 53,
+ CCERR_INVALID_LENGTH = 54,
+ CCERR_INVALID_ACCESS_RIGHTS = 55,
+ CCERR_PBL_TOO_BIG = 56,
+ CCERR_INVALID_VA = 57,
+ CCERR_INVALID_REGION = 58,
+ CCERR_INVALID_WINDOW = 59,
+ CCERR_TOTAL_LENGTH_TOO_BIG = 60,
+ CCERR_INVALID_QP_ID = 61,
+ CCERR_ADDR_IN_USE = 62,
+ CCERR_ADDR_NOT_AVAIL = 63,
+ CCERR_NET_DOWN = 64,
+ CCERR_NET_UNREACHABLE = 65,
+ CCERR_CONN_ABORTED = 66,
+ CCERR_CONN_RESET = 67,
+ CCERR_NO_BUFS = 68,
+ CCERR_CONN_TIMEDOUT = 69,
+ CCERR_CONN_REFUSED = 70,
+ CCERR_HOST_UNREACHABLE = 71,
+ CCERR_INVALID_SEND_SGL_DEPTH = 72,
+ CCERR_INVALID_RECV_SGL_DEPTH = 73,
+ CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
+ CCERR_INSUFFICIENT_PRIVILEGES = 75,
+ CCERR_STACK_ERROR = 76,
+ CCERR_INVALID_VERSION = 77,
+ CCERR_INVALID_MTU = 78,
+ CCERR_INVALID_IMAGE = 79,
+ CCERR_PENDING = 98, /* not an error; user internally by adapter */
+ CCERR_DEFER = 99, /* not an error; used internally by adapter */
+ CCERR_FAILED_WRITE = 100,
+ CCERR_FAILED_ERASE = 101,
+ CCERR_FAILED_VERIFICATION = 102,
+ CCERR_NOT_FOUND = 103,
+
+};
+
+/*
+ * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
+ */
+enum c2_connect_status {
+ C2_CONN_STATUS_SUCCESS = C2_OK,
+ C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
+ C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
+ C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
+ C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
+ C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
+ C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
+ C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
+ C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
+ C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
+ C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
+};
+
+/*
+ * Flash programming status codes.
+ */
+enum c2_flash_status {
+ C2_FLASH_STATUS_SUCCESS = 0x0000,
+ C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
+ C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
+ C2_FLASH_STATUS_ECLBS = 0x0400,
+ C2_FLASH_STATUS_PSLBS = 0x0800,
+ C2_FLASH_STATUS_VPENS = 0x1000,
+};
+
+#endif /* _C2_STATUS_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h
new file mode 100644
index 000000000000..7e9e7ad65467
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_user.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_USER_H
+#define C2_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct c2_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 uarc_size;
+};
+
+struct c2_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct c2_create_cq {
+ __u32 lkey;
+ __u32 pdn;
+ __u64 arm_db_page;
+ __u64 set_db_page;
+ __u32 arm_db_index;
+ __u32 set_db_index;
+};
+
+struct c2_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct c2_create_qp {
+ __u32 lkey;
+ __u32 reserved;
+ __u64 sq_db_page;
+ __u64 rq_db_page;
+ __u32 sq_db_index;
+ __u32 rq_db_index;
+};
+
+#endif /* C2_USER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c
new file mode 100644
index 000000000000..2ec716fb2edb
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_vq.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "c2_vq.h"
+#include "c2_provider.h"
+
+/*
+ * Verbs Request Objects:
+ *
+ * VQ Request Objects are allocated by the kernel verbs handlers.
+ * They contain a wait object, a refcnt, an atomic bool indicating that the
+ * adapter has replied, and a copy of the verb reply work request.
+ * A pointer to the VQ Request Object is passed down in the context
+ * field of the work request message, and reflected back by the adapter
+ * in the verbs reply message. The function handle_vq() in the interrupt
+ * path will use this pointer to:
+ * 1) append a copy of the verbs reply message
+ * 2) mark that the reply is ready
+ * 3) wake up the kernel verbs handler blocked awaiting the reply.
+ *
+ *
+ * The kernel verbs handlers do a "get" to put a 2nd reference on the
+ * VQ Request object. If the kernel verbs handler exits before the adapter
+ * can respond, this extra reference will keep the VQ Request object around
+ * until the adapter's reply can be processed. The reason we need this is
+ * because a pointer to this object is stuffed into the context field of
+ * the verbs work request message, and reflected back in the reply message.
+ * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
+ * kernel verb handler that is blocked awaiting the verb reply.
+ * So handle_vq() will do a "put" on the object when it's done accessing it.
+ * NOTE: If we guarantee that the kernel verb handler will never bail before
+ * getting the reply, then we don't need these refcnts.
+ *
+ *
+ * VQ Request objects are freed by the kernel verbs handlers only
+ * after the verb has been processed, or when the adapter fails and
+ * does not reply.
+ *
+ *
+ * Verbs Reply Buffers:
+ *
+ * VQ Reply bufs are local host memory copies of a
+ * outstanding Verb Request reply
+ * message. The are always allocated by the kernel verbs handlers, and _may_ be
+ * freed by either the kernel verbs handler -or- the interrupt handler. The
+ * kernel verbs handler _must_ free the repbuf, then free the vq request object
+ * in that order.
+ */
+
+int vq_init(struct c2_dev *c2dev)
+{
+ sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
+ (char) ('0' + c2dev->devnum));
+ c2dev->host_msg_cache =
+ kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (c2dev->host_msg_cache == NULL) {
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void vq_term(struct c2_dev *c2dev)
+{
+ kmem_cache_destroy(c2dev->host_msg_cache);
+}
+
+/* vq_req_alloc - allocate a VQ Request Object and initialize it.
+ * The refcnt is set to 1.
+ */
+struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *r;
+
+ r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
+ if (r) {
+ init_waitqueue_head(&r->wait_object);
+ r->reply_msg = 0;
+ r->event = 0;
+ r->cm_id = NULL;
+ r->qp = NULL;
+ atomic_set(&r->refcnt, 1);
+ atomic_set(&r->reply_ready, 0);
+ }
+ return r;
+}
+
+
+/* vq_req_free - free the VQ Request Object. It is assumed the verbs handler
+ * has already free the VQ Reply Buffer if it existed.
+ */
+void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ r->reply_msg = 0;
+ if (atomic_dec_and_test(&r->refcnt)) {
+ kfree(r);
+ }
+}
+
+/* vq_req_get - reference a VQ Request Object. Done
+ * only in the kernel verbs handlers.
+ */
+void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ atomic_inc(&r->refcnt);
+}
+
+
+/* vq_req_put - dereference and potentially free a VQ Request Object.
+ *
+ * This is only called by handle_vq() on the
+ * interrupt when it is done processing
+ * a verb reply message. If the associated
+ * kernel verbs handler has already bailed,
+ * then this put will actually free the VQ
+ * Request object _and_ the VQ Reply Buffer
+ * if it exists.
+ */
+void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ if (atomic_dec_and_test(&r->refcnt)) {
+ if (r->reply_msg != 0)
+ vq_repbuf_free(c2dev,
+ (void *) (unsigned long) r->reply_msg);
+ kfree(r);
+ }
+}
+
+
+/*
+ * vq_repbuf_alloc - allocate a VQ Reply Buffer.
+ */
+void *vq_repbuf_alloc(struct c2_dev *c2dev)
+{
+ return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
+}
+
+/*
+ * vq_send_wr - post a verbs request message to the Verbs Request Queue.
+ * If a message is not available in the MQ, then block until one is available.
+ * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
+ * When the adapter drains the Verbs Request Queue,
+ * it inserts MQ index 0 in to the
+ * adapter->host activity fifo and interrupts the host.
+ */
+int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
+{
+ void *msg;
+ wait_queue_t __wait;
+
+ /*
+ * grab adapter vq lock
+ */
+ spin_lock(&c2dev->vqlock);
+
+ /*
+ * allocate msg
+ */
+ msg = c2_mq_alloc(&c2dev->req_vq);
+
+ /*
+ * If we cannot get a msg, then we'll wait
+ * When a messages are available, the int handler will wake_up()
+ * any waiters.
+ */
+ while (msg == NULL) {
+ pr_debug("%s:%d no available msg in VQ, waiting...\n",
+ __func__, __LINE__);
+ init_waitqueue_entry(&__wait, current);
+ add_wait_queue(&c2dev->req_vq_wo, &__wait);
+ spin_unlock(&c2dev->vqlock);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!c2_mq_full(&c2dev->req_vq)) {
+ break;
+ }
+ if (!signal_pending(current)) {
+ schedule_timeout(1 * HZ); /* 1 second... */
+ continue;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+ return -EINTR;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+ spin_lock(&c2dev->vqlock);
+ msg = c2_mq_alloc(&c2dev->req_vq);
+ }
+
+ /*
+ * copy wr into adapter msg
+ */
+ memcpy(msg, wr, c2dev->req_vq.msg_size);
+
+ /*
+ * post msg
+ */
+ c2_mq_produce(&c2dev->req_vq);
+
+ /*
+ * release adapter vq lock
+ */
+ spin_unlock(&c2dev->vqlock);
+ return 0;
+}
+
+
+/*
+ * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
+ */
+int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
+{
+ if (!wait_event_timeout(req->wait_object,
+ atomic_read(&req->reply_ready),
+ 60*HZ))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+/*
+ * vq_repbuf_free - Free a Verbs Reply Buffer.
+ */
+void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
+{
+ kmem_cache_free(c2dev->host_msg_cache, reply);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h
new file mode 100644
index 000000000000..33805627a607
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_vq.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_VQ_H_
+#define _C2_VQ_H_
+#include <linux/sched.h>
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_provider.h"
+
+struct c2_vq_req {
+ u64 reply_msg; /* ptr to reply msg */
+ wait_queue_head_t wait_object; /* wait object for vq reqs */
+ atomic_t reply_ready; /* set when reply is ready */
+ atomic_t refcnt; /* used to cancel WRs... */
+ int event;
+ struct iw_cm_id *cm_id;
+ struct c2_qp *qp;
+};
+
+extern int vq_init(struct c2_dev *c2dev);
+extern void vq_term(struct c2_dev *c2dev);
+
+extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
+extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
+
+extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
+extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
+
+extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
+#endif /* _C2_VQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h
new file mode 100644
index 000000000000..8d4b4ca463ca
--- /dev/null
+++ b/drivers/staging/rdma/amso1100/c2_wr.h
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_WR_H_
+#define _C2_WR_H_
+
+#ifdef CCDEBUG
+#define CCWR_MAGIC 0xb07700b0
+#endif
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+/* Maximum allowed size in bytes of private_data exchange
+ * on connect.
+ */
+#define C2_MAX_PRIVATE_DATA_SIZE 200
+
+/*
+ * These types are shared among the adapter, host, and CCIL consumer.
+ */
+enum c2_cq_notification_type {
+ C2_CQ_NOTIFICATION_TYPE_NONE = 1,
+ C2_CQ_NOTIFICATION_TYPE_NEXT,
+ C2_CQ_NOTIFICATION_TYPE_NEXT_SE
+};
+
+enum c2_setconfig_cmd {
+ C2_CFG_ADD_ADDR = 1,
+ C2_CFG_DEL_ADDR = 2,
+ C2_CFG_ADD_ROUTE = 3,
+ C2_CFG_DEL_ROUTE = 4
+};
+
+enum c2_getconfig_cmd {
+ C2_GETCONFIG_ROUTES = 1,
+ C2_GETCONFIG_ADDRS
+};
+
+/*
+ * CCIL Work Request Identifiers
+ */
+enum c2wr_ids {
+ CCWR_RNIC_OPEN = 1,
+ CCWR_RNIC_QUERY,
+ CCWR_RNIC_SETCONFIG,
+ CCWR_RNIC_GETCONFIG,
+ CCWR_RNIC_CLOSE,
+ CCWR_CQ_CREATE,
+ CCWR_CQ_QUERY,
+ CCWR_CQ_MODIFY,
+ CCWR_CQ_DESTROY,
+ CCWR_QP_CONNECT,
+ CCWR_PD_ALLOC,
+ CCWR_PD_DEALLOC,
+ CCWR_SRQ_CREATE,
+ CCWR_SRQ_QUERY,
+ CCWR_SRQ_MODIFY,
+ CCWR_SRQ_DESTROY,
+ CCWR_QP_CREATE,
+ CCWR_QP_QUERY,
+ CCWR_QP_MODIFY,
+ CCWR_QP_DESTROY,
+ CCWR_NSMR_STAG_ALLOC,
+ CCWR_NSMR_REGISTER,
+ CCWR_NSMR_PBL,
+ CCWR_STAG_DEALLOC,
+ CCWR_NSMR_REREGISTER,
+ CCWR_SMR_REGISTER,
+ CCWR_MR_QUERY,
+ CCWR_MW_ALLOC,
+ CCWR_MW_QUERY,
+ CCWR_EP_CREATE,
+ CCWR_EP_GETOPT,
+ CCWR_EP_SETOPT,
+ CCWR_EP_DESTROY,
+ CCWR_EP_BIND,
+ CCWR_EP_CONNECT,
+ CCWR_EP_LISTEN,
+ CCWR_EP_SHUTDOWN,
+ CCWR_EP_LISTEN_CREATE,
+ CCWR_EP_LISTEN_DESTROY,
+ CCWR_EP_QUERY,
+ CCWR_CR_ACCEPT,
+ CCWR_CR_REJECT,
+ CCWR_CONSOLE,
+ CCWR_TERM,
+ CCWR_FLASH_INIT,
+ CCWR_FLASH,
+ CCWR_BUF_ALLOC,
+ CCWR_BUF_FREE,
+ CCWR_FLASH_WRITE,
+ CCWR_INIT, /* WARNING: Don't move this ever again! */
+
+
+
+ /* Add new IDs here */
+
+
+
+ /*
+ * WARNING: CCWR_LAST must always be the last verbs id defined!
+ * All the preceding IDs are fixed, and must not change.
+ * You can add new IDs, but must not remove or reorder
+ * any IDs. If you do, YOU will ruin any hope of
+ * compatibility between versions.
+ */
+ CCWR_LAST,
+
+ /*
+ * Start over at 1 so that arrays indexed by user wr id's
+ * begin at 1. This is OK since the verbs and user wr id's
+ * are always used on disjoint sets of queues.
+ */
+ /*
+ * The order of the CCWR_SEND_XX verbs must
+ * match the order of the RDMA_OPs
+ */
+ CCWR_SEND = 1,
+ CCWR_SEND_INV,
+ CCWR_SEND_SE,
+ CCWR_SEND_SE_INV,
+ CCWR_RDMA_WRITE,
+ CCWR_RDMA_READ,
+ CCWR_RDMA_READ_INV,
+ CCWR_MW_BIND,
+ CCWR_NSMR_FASTREG,
+ CCWR_STAG_INVALIDATE,
+ CCWR_RECV,
+ CCWR_NOP,
+ CCWR_UNIMPL,
+/* WARNING: This must always be the last user wr id defined! */
+};
+#define RDMA_SEND_OPCODE_FROM_WR_ID(x) (x+2)
+
+/*
+ * SQ/RQ Work Request Types
+ */
+enum c2_wr_type {
+ C2_WR_TYPE_SEND = CCWR_SEND,
+ C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
+ C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
+ C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
+ C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
+ C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
+ C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
+ C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
+ C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
+ C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
+ C2_WR_TYPE_RECV = CCWR_RECV,
+ C2_WR_TYPE_NOP = CCWR_NOP,
+};
+
+struct c2_netaddr {
+ __be32 ip_addr;
+ __be32 netmask;
+ u32 mtu;
+};
+
+struct c2_route {
+ u32 ip_addr; /* 0 indicates the default route */
+ u32 netmask; /* netmask associated with dst */
+ u32 flags;
+ union {
+ u32 ipaddr; /* address of the nexthop interface */
+ u8 enaddr[6];
+ } nexthop;
+};
+
+/*
+ * A Scatter Gather Entry.
+ */
+struct c2_data_addr {
+ __be32 stag;
+ __be32 length;
+ __be64 to;
+};
+
+/*
+ * MR and MW flags used by the consumer, RI, and RNIC.
+ */
+enum c2_mm_flags {
+ MEM_REMOTE = 0x0001, /* allow mw binds with remote access. */
+ MEM_VA_BASED = 0x0002, /* Not Zero-based */
+ MEM_PBL_COMPLETE = 0x0004, /* PBL array is complete in this msg */
+ MEM_LOCAL_READ = 0x0008, /* allow local reads */
+ MEM_LOCAL_WRITE = 0x0010, /* allow local writes */
+ MEM_REMOTE_READ = 0x0020, /* allow remote reads */
+ MEM_REMOTE_WRITE = 0x0040, /* allow remote writes */
+ MEM_WINDOW_BIND = 0x0080, /* binds allowed */
+ MEM_SHARED = 0x0100, /* set if MR is shared */
+ MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */
+};
+
+/*
+ * CCIL API ACF flags defined in terms of the low level mem flags.
+ * This minimizes translation needed in the user API
+ */
+enum c2_acf {
+ C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
+ C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
+ C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
+ C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
+ C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
+};
+
+/*
+ * Image types of objects written to flash
+ */
+#define C2_FLASH_IMG_BITFILE 1
+#define C2_FLASH_IMG_OPTION_ROM 2
+#define C2_FLASH_IMG_VPD 3
+
+/*
+ * to fix bug 1815 we define the max size allowable of the
+ * terminate message (per the IETF spec).Refer to the IETF
+ * protocol specification, section 12.1.6, page 64)
+ * The message is prefixed by 20 types of DDP info.
+ *
+ * Then the message has 6 bytes for the terminate control
+ * and DDP segment length info plus a DDP header (either
+ * 14 or 18 byts) plus 28 bytes for the RDMA header.
+ * Thus the max size in:
+ * 20 + (6 + 18 + 28) = 72
+ */
+#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
+
+/*
+ * Build String Length. It must be the same as C2_BUILD_STR_LEN in ccil_api.h
+ */
+#define WR_BUILD_STR_LEN 64
+
+/*
+ * WARNING: All of these structs need to align any 64bit types on
+ * 64 bit boundaries! 64bit types include u64 and u64.
+ */
+
+/*
+ * Clustercore Work Request Header. Be sensitive to field layout
+ * and alignment.
+ */
+struct c2wr_hdr {
+ /* wqe_count is part of the cqe. It is put here so the
+ * adapter can write to it while the wr is pending without
+ * clobbering part of the wr. This word need not be dma'd
+ * from the host to adapter by libccil, but we copy it anyway
+ * to make the memcpy to the adapter better aligned.
+ */
+ __be32 wqe_count;
+
+ /* Put these fields next so that later 32- and 64-bit
+ * quantities are naturally aligned.
+ */
+ u8 id;
+ u8 result; /* adapter -> host */
+ u8 sge_count; /* host -> adapter */
+ u8 flags; /* host -> adapter */
+
+ u64 context;
+#ifdef CCMSGMAGIC
+ u32 magic;
+ u32 pad;
+#endif
+} __attribute__((packed));
+
+/*
+ *------------------------ RNIC ------------------------
+ */
+
+/*
+ * WR_RNIC_OPEN
+ */
+
+/*
+ * Flags for the RNIC WRs
+ */
+enum c2_rnic_flags {
+ RNIC_IRD_STATIC = 0x0001,
+ RNIC_ORD_STATIC = 0x0002,
+ RNIC_QP_STATIC = 0x0004,
+ RNIC_SRQ_SUPPORTED = 0x0008,
+ RNIC_PBL_BLOCK_MODE = 0x0010,
+ RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
+ RNIC_CQ_OVF_DETECTED = 0x0040,
+ RNIC_PRIV_MODE = 0x0080
+};
+
+struct c2wr_rnic_open_req {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ __be16 flags; /* See enum c2_rnic_flags */
+ __be16 port_num;
+} __attribute__((packed));
+
+struct c2wr_rnic_open_rep {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+union c2wr_rnic_open {
+ struct c2wr_rnic_open_req req;
+ struct c2wr_rnic_open_rep rep;
+} __attribute__((packed));
+
+struct c2wr_rnic_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_QUERY
+ */
+struct c2wr_rnic_query_rep {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ __be32 vendor_id;
+ __be32 part_number;
+ __be32 hw_version;
+ __be32 fw_ver_major;
+ __be32 fw_ver_minor;
+ __be32 fw_ver_patch;
+ char fw_ver_build_str[WR_BUILD_STR_LEN];
+ __be32 max_qps;
+ __be32 max_qp_depth;
+ u32 max_srq_depth;
+ u32 max_send_sgl_depth;
+ u32 max_rdma_sgl_depth;
+ __be32 max_cqs;
+ __be32 max_cq_depth;
+ u32 max_cq_event_handlers;
+ __be32 max_mrs;
+ u32 max_pbl_depth;
+ __be32 max_pds;
+ __be32 max_global_ird;
+ u32 max_global_ord;
+ __be32 max_qp_ird;
+ __be32 max_qp_ord;
+ u32 flags;
+ __be32 max_mws;
+ u32 pbe_range_low;
+ u32 pbe_range_high;
+ u32 max_srqs;
+ u32 page_size;
+} __attribute__((packed));
+
+union c2wr_rnic_query {
+ struct c2wr_rnic_query_req req;
+ struct c2wr_rnic_query_rep rep;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_GETCONFIG
+ */
+
+struct c2wr_rnic_getconfig_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 option; /* see c2_getconfig_cmd_t */
+ u64 reply_buf;
+ u32 reply_buf_len;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_getconfig_rep {
+ struct c2wr_hdr hdr;
+ u32 option; /* see c2_getconfig_cmd_t */
+ u32 count_len; /* length of the number of addresses configured */
+} __attribute__((packed)) ;
+
+union c2wr_rnic_getconfig {
+ struct c2wr_rnic_getconfig_req req;
+ struct c2wr_rnic_getconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_SETCONFIG
+ */
+struct c2wr_rnic_setconfig_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 option; /* See c2_setconfig_cmd_t */
+ /* variable data and pad. See c2_netaddr and c2_route */
+ u8 data[0];
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_setconfig_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_setconfig {
+ struct c2wr_rnic_setconfig_req req;
+ struct c2wr_rnic_setconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_CLOSE
+ */
+struct c2wr_rnic_close_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_close_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_close {
+ struct c2wr_rnic_close_req req;
+ struct c2wr_rnic_close_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ CQ ------------------------
+ */
+struct c2wr_cq_create_req {
+ struct c2wr_hdr hdr;
+ __be64 shared_ht;
+ u64 user_context;
+ __be64 msg_pool;
+ u32 rnic_handle;
+ __be32 msg_size;
+ __be32 depth;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_create_rep {
+ struct c2wr_hdr hdr;
+ __be32 mq_index;
+ __be32 adapter_shared;
+ u32 cq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_cq_create {
+ struct c2wr_cq_create_req req;
+ struct c2wr_cq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 cq_handle;
+ u32 new_depth;
+ u64 new_msg_pool;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_modify {
+ struct c2wr_cq_modify_req req;
+ struct c2wr_cq_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 cq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_destroy {
+ struct c2wr_cq_destroy_req req;
+ struct c2wr_cq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ PD ------------------------
+ */
+struct c2wr_pd_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_alloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_alloc {
+ struct c2wr_pd_alloc_req req;
+ struct c2wr_pd_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_dealloc {
+ struct c2wr_pd_dealloc_req req;
+ struct c2wr_pd_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ SRQ ------------------------
+ */
+struct c2wr_srq_create_req {
+ struct c2wr_hdr hdr;
+ u64 shared_ht;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 srq_depth;
+ u32 srq_limit;
+ u32 sgl_depth;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_create_rep {
+ struct c2wr_hdr hdr;
+ u32 srq_depth;
+ u32 sgl_depth;
+ u32 msg_size;
+ u32 mq_index;
+ u32 mq_start;
+ u32 srq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_srq_create {
+ struct c2wr_srq_create_req req;
+ struct c2wr_srq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 srq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_srq_destroy {
+ struct c2wr_srq_destroy_req req;
+ struct c2wr_srq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ QP ------------------------
+ */
+enum c2wr_qp_flags {
+ QP_RDMA_READ = 0x00000001, /* RDMA read enabled? */
+ QP_RDMA_WRITE = 0x00000002, /* RDMA write enabled? */
+ QP_MW_BIND = 0x00000004, /* MWs enabled */
+ QP_ZERO_STAG = 0x00000008, /* enabled? */
+ QP_REMOTE_TERMINATION = 0x00000010, /* remote end terminated */
+ QP_RDMA_READ_RESPONSE = 0x00000020 /* Remote RDMA read */
+ /* enabled? */
+};
+
+struct c2wr_qp_create_req {
+ struct c2wr_hdr hdr;
+ __be64 shared_sq_ht;
+ __be64 shared_rq_ht;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 sq_cq_handle;
+ u32 rq_cq_handle;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 srq_handle;
+ u32 srq_limit;
+ __be32 flags; /* see enum c2wr_qp_flags */
+ __be32 send_sgl_depth;
+ __be32 recv_sgl_depth;
+ __be32 rdma_write_sgl_depth;
+ __be32 ord;
+ __be32 ird;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_create_rep {
+ struct c2wr_hdr hdr;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 send_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u32 ord;
+ u32 ird;
+ __be32 sq_msg_size;
+ __be32 sq_mq_index;
+ __be32 sq_mq_start;
+ __be32 rq_msg_size;
+ __be32 rq_mq_index;
+ __be32 rq_mq_start;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+union c2wr_qp_create {
+ struct c2wr_qp_create_req req;
+ struct c2wr_qp_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_rep {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 sq_depth;
+ u32 rq_depth;
+ u32 send_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 ord;
+ u32 ird;
+ u16 qp_state;
+ u16 flags; /* see c2wr_qp_flags_t */
+ u32 qp_id;
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+ u32 terminate_msg_length; /* 0 if not present */
+ u8 data[0];
+ /* Terminate Message in-line here. */
+} __attribute__((packed)) ;
+
+union c2wr_qp_query {
+ struct c2wr_qp_query_req req;
+ struct c2wr_qp_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_req {
+ struct c2wr_hdr hdr;
+ u64 stream_msg;
+ u32 stream_msg_length;
+ u32 rnic_handle;
+ u32 qp_handle;
+ __be32 next_qp_state;
+ __be32 ord;
+ __be32 ird;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 llp_ep_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_rep {
+ struct c2wr_hdr hdr;
+ u32 ord;
+ u32 ird;
+ u32 sq_depth;
+ u32 rq_depth;
+ u32 sq_msg_size;
+ u32 sq_mq_index;
+ u32 sq_mq_start;
+ u32 rq_msg_size;
+ u32 rq_mq_index;
+ u32 rq_mq_start;
+} __attribute__((packed)) ;
+
+union c2wr_qp_modify {
+ struct c2wr_qp_modify_req req;
+ struct c2wr_qp_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_qp_destroy {
+ struct c2wr_qp_destroy_req req;
+ struct c2wr_qp_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * The CCWR_QP_CONNECT msg is posted on the verbs request queue. It can
+ * only be posted when a QP is in IDLE state. After the connect request is
+ * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
+ * No synchronous reply from adapter to this WR. The results of
+ * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
+ * See c2wr_ae_active_connect_results_t
+ */
+struct c2wr_qp_connect_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+ __be32 remote_addr;
+ __be16 remote_port;
+ u16 pad;
+ __be32 private_data_length;
+ u8 private_data[0]; /* Private data in-line. */
+} __attribute__((packed)) ;
+
+struct c2wr_qp_connect {
+ struct c2wr_qp_connect_req req;
+ /* no synchronous reply. */
+} __attribute__((packed)) ;
+
+
+/*
+ *------------------------ MM ------------------------
+ */
+
+struct c2wr_nsmr_stag_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pbl_depth;
+ u32 pd_id;
+ u32 flags;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_stag_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_stag_alloc {
+ struct c2wr_nsmr_stag_alloc_req req;
+ struct c2wr_nsmr_stag_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_req {
+ struct c2wr_hdr hdr;
+ __be64 va;
+ u32 rnic_handle;
+ __be16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 pd_id;
+ __be32 pbl_depth;
+ __be32 pbe_size;
+ __be32 fbo;
+ __be32 length;
+ __be32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ __be32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_register {
+ struct c2wr_nsmr_register_req req;
+ struct c2wr_nsmr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 flags;
+ __be32 stag_index;
+ __be32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_pbl {
+ struct c2wr_nsmr_pbl_req req;
+ struct c2wr_nsmr_pbl_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_rep {
+ struct c2wr_hdr hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 pd_id;
+ u32 flags;
+ u32 pbl_depth;
+} __attribute__((packed)) ;
+
+union c2wr_mr_query {
+ struct c2wr_mr_query_req req;
+ struct c2wr_mr_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_rep {
+ struct c2wr_hdr hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 pd_id;
+ u32 flags;
+} __attribute__((packed)) ;
+
+union c2wr_mw_query {
+ struct c2wr_mw_query_req req;
+ struct c2wr_mw_query_rep rep;
+} __attribute__((packed)) ;
+
+
+struct c2wr_stag_dealloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_stag_dealloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_stag_dealloc {
+ struct c2wr_stag_dealloc_req req;
+ struct c2wr_stag_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_req {
+ struct c2wr_hdr hdr;
+ u64 va;
+ u32 rnic_handle;
+ u16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 stag_index;
+ u32 pd_id;
+ u32 pbl_depth;
+ u32 pbe_size;
+ u32 fbo;
+ u32 length;
+ u32 addrs_length;
+ u32 pad1;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ u64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_reregister {
+ struct c2wr_nsmr_reregister_req req;
+ struct c2wr_nsmr_reregister_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_req {
+ struct c2wr_hdr hdr;
+ u64 va;
+ u32 rnic_handle;
+ u16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 stag_index;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_rep {
+ struct c2wr_hdr hdr;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_smr_register {
+ struct c2wr_smr_register_req req;
+ struct c2wr_smr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_mw_alloc {
+ struct c2wr_mw_alloc_req req;
+ struct c2wr_mw_alloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ WRs -----------------------
+ */
+
+struct c2wr_user_hdr {
+ struct c2wr_hdr hdr; /* Has status and WR Type */
+} __attribute__((packed)) ;
+
+enum c2_qp_state {
+ C2_QP_STATE_IDLE = 0x01,
+ C2_QP_STATE_CONNECTING = 0x02,
+ C2_QP_STATE_RTS = 0x04,
+ C2_QP_STATE_CLOSING = 0x08,
+ C2_QP_STATE_TERMINATE = 0x10,
+ C2_QP_STATE_ERROR = 0x20,
+};
+
+/* Completion queue entry. */
+struct c2wr_ce {
+ struct c2wr_hdr hdr; /* Has status and WR Type */
+ u64 qp_user_context; /* c2_user_qp_t * */
+ u32 qp_state; /* Current QP State */
+ u32 handle; /* QPID or EP Handle */
+ __be32 bytes_rcvd; /* valid for RECV WCs */
+ u32 stag;
+} __attribute__((packed)) ;
+
+
+/*
+ * Flags used for all post-sq WRs. These must fit in the flags
+ * field of the struct c2wr_hdr (eight bits).
+ */
+enum {
+ SQ_SIGNALED = 0x01,
+ SQ_READ_FENCE = 0x02,
+ SQ_FENCE = 0x04,
+};
+
+/*
+ * Common fields for all post-sq WRs. Namely the standard header and a
+ * secondary header with fields common to all post-sq WRs.
+ */
+struct c2_sq_hdr {
+ struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * Same as above but for post-rq WRs.
+ */
+struct c2_rq_hdr {
+ struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * use the same struct for all sends.
+ */
+struct c2wr_send_req {
+ struct c2_sq_hdr sq_hdr;
+ __be32 sge_len;
+ __be32 remote_stag;
+ u8 data[0]; /* SGE array */
+} __attribute__((packed));
+
+union c2wr_send {
+ struct c2wr_send_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_write_req {
+ struct c2_sq_hdr sq_hdr;
+ __be64 remote_to;
+ __be32 remote_stag;
+ __be32 sge_len;
+ u8 data[0]; /* SGE array */
+} __attribute__((packed));
+
+union c2wr_rdma_write {
+ struct c2wr_rdma_write_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_read_req {
+ struct c2_sq_hdr sq_hdr;
+ __be64 local_to;
+ __be64 remote_to;
+ __be32 local_stag;
+ __be32 remote_stag;
+ __be32 length;
+} __attribute__((packed));
+
+union c2wr_rdma_read {
+ struct c2wr_rdma_read_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_mw_bind_req {
+ struct c2_sq_hdr sq_hdr;
+ u64 va;
+ u8 stag_key;
+ u8 pad[3];
+ u32 mw_stag_index;
+ u32 mr_stag_index;
+ u32 length;
+ u32 flags;
+} __attribute__((packed));
+
+union c2wr_mw_bind {
+ struct c2wr_mw_bind_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_nsmr_fastreg_req {
+ struct c2_sq_hdr sq_hdr;
+ u64 va;
+ u8 stag_key;
+ u8 pad[3];
+ u32 stag_index;
+ u32 pbe_size;
+ u32 fbo;
+ u32 length;
+ u32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ u64 paddrs[0];
+} __attribute__((packed));
+
+union c2wr_nsmr_fastreg {
+ struct c2wr_nsmr_fastreg_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_stag_invalidate_req {
+ struct c2_sq_hdr sq_hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 stag_index;
+} __attribute__((packed));
+
+union c2wr_stag_invalidate {
+ struct c2wr_stag_invalidate_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+union c2wr_sqwr {
+ struct c2_sq_hdr sq_hdr;
+ struct c2wr_send_req send;
+ struct c2wr_send_req send_se;
+ struct c2wr_send_req send_inv;
+ struct c2wr_send_req send_se_inv;
+ struct c2wr_rdma_write_req rdma_write;
+ struct c2wr_rdma_read_req rdma_read;
+ struct c2wr_mw_bind_req mw_bind;
+ struct c2wr_nsmr_fastreg_req nsmr_fastreg;
+ struct c2wr_stag_invalidate_req stag_inv;
+} __attribute__((packed));
+
+
+/*
+ * RQ WRs
+ */
+struct c2wr_rqwr {
+ struct c2_rq_hdr rq_hdr;
+ u8 data[0]; /* array of SGEs */
+} __attribute__((packed));
+
+union c2wr_recv {
+ struct c2wr_rqwr req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+/*
+ * All AEs start with this header. Most AEs only need to convey the
+ * information in the header. Some, like LLP connection events, need
+ * more info. The union typdef c2wr_ae_t has all the possible AEs.
+ *
+ * hdr.context is the user_context from the rnic_open WR. NULL If this
+ * is not affiliated with an rnic
+ *
+ * hdr.id is the AE identifier (eg; CCAE_REMOTE_SHUTDOWN,
+ * CCAE_LLP_CLOSE_COMPLETE)
+ *
+ * resource_type is one of: C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
+ *
+ * user_context is the context passed down when the host created the resource.
+ */
+struct c2wr_ae_hdr {
+ struct c2wr_hdr hdr;
+ u64 user_context; /* user context for this res. */
+ __be32 resource_type; /* see enum c2_resource_indicator */
+ __be32 resource; /* handle for resource */
+ __be32 qp_state; /* current QP State */
+} __attribute__((packed));
+
+/*
+ * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
+ * the adapter moves the QP into RTS state
+ */
+struct c2wr_ae_active_connect_results {
+ struct c2wr_ae_hdr ae_hdr;
+ __be32 laddr;
+ __be32 raddr;
+ __be16 lport;
+ __be16 rport;
+ __be32 private_data_length;
+ u8 private_data[0]; /* data is in-line in the msg. */
+} __attribute__((packed));
+
+/*
+ * When connections are established by the stack (and the private data
+ * MPA frame is received), the adapter will generate an event to the host.
+ * The details of the connection, any private data, and the new connection
+ * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
+ * AE queue:
+ */
+struct c2wr_ae_connection_request {
+ struct c2wr_ae_hdr ae_hdr;
+ u32 cr_handle; /* connreq handle (sock ptr) */
+ __be32 laddr;
+ __be32 raddr;
+ __be16 lport;
+ __be16 rport;
+ __be32 private_data_length;
+ u8 private_data[0]; /* data is in-line in the msg. */
+} __attribute__((packed));
+
+union c2wr_ae {
+ struct c2wr_ae_hdr ae_generic;
+ struct c2wr_ae_active_connect_results ae_active_connect_results;
+ struct c2wr_ae_connection_request ae_connection_request;
+} __attribute__((packed));
+
+struct c2wr_init_req {
+ struct c2wr_hdr hdr;
+ __be64 hint_count;
+ __be64 q0_host_shared;
+ __be64 q1_host_shared;
+ __be64 q1_host_msg_pool;
+ __be64 q2_host_shared;
+ __be64 q2_host_msg_pool;
+} __attribute__((packed));
+
+struct c2wr_init_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_init {
+ struct c2wr_init_req req;
+ struct c2wr_init_rep rep;
+} __attribute__((packed));
+
+/*
+ * For upgrading flash.
+ */
+
+struct c2wr_flash_init_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+struct c2wr_flash_init_rep {
+ struct c2wr_hdr hdr;
+ u32 adapter_flash_buf_offset;
+ u32 adapter_flash_len;
+} __attribute__((packed));
+
+union c2wr_flash_init {
+ struct c2wr_flash_init_req req;
+ struct c2wr_flash_init_rep rep;
+} __attribute__((packed));
+
+struct c2wr_flash_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 len;
+} __attribute__((packed));
+
+struct c2wr_flash_rep {
+ struct c2wr_hdr hdr;
+ u32 status;
+} __attribute__((packed));
+
+union c2wr_flash {
+ struct c2wr_flash_req req;
+ struct c2wr_flash_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 size;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 offset; /* 0 if mem not available */
+ u32 size; /* 0 if mem not available */
+} __attribute__((packed));
+
+union c2wr_buf_alloc {
+ struct c2wr_buf_alloc_req req;
+ struct c2wr_buf_alloc_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_free_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 offset; /* Must match value from alloc */
+ u32 size; /* Must match value from alloc */
+} __attribute__((packed));
+
+struct c2wr_buf_free_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_buf_free {
+ struct c2wr_buf_free_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_flash_write_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 offset;
+ u32 size;
+ u32 type;
+ u32 flags;
+} __attribute__((packed));
+
+struct c2wr_flash_write_rep {
+ struct c2wr_hdr hdr;
+ u32 status;
+} __attribute__((packed));
+
+union c2wr_flash_write {
+ struct c2wr_flash_write_req req;
+ struct c2wr_flash_write_rep rep;
+} __attribute__((packed));
+
+/*
+ * Messages for LLP connection setup.
+ */
+
+/*
+ * Listen Request. This allocates a listening endpoint to allow passive
+ * connection setup. Newly established LLP connections are passed up
+ * via an AE. See c2wr_ae_connection_request_t
+ */
+struct c2wr_ep_listen_create_req {
+ struct c2wr_hdr hdr;
+ u64 user_context; /* returned in AEs. */
+ u32 rnic_handle;
+ __be32 local_addr; /* local addr, or 0 */
+ __be16 local_port; /* 0 means "pick one" */
+ u16 pad;
+ __be32 backlog; /* tradional tcp listen bl */
+} __attribute__((packed));
+
+struct c2wr_ep_listen_create_rep {
+ struct c2wr_hdr hdr;
+ u32 ep_handle; /* handle to new listening ep */
+ u16 local_port; /* resulting port... */
+ u16 pad;
+} __attribute__((packed));
+
+union c2wr_ep_listen_create {
+ struct c2wr_ep_listen_create_req req;
+ struct c2wr_ep_listen_create_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_ep_listen_destroy {
+ struct c2wr_ep_listen_destroy_req req;
+ struct c2wr_ep_listen_destroy_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_query_rep {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+} __attribute__((packed));
+
+union c2wr_ep_query {
+ struct c2wr_ep_query_req req;
+ struct c2wr_ep_query_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * The host passes this down to indicate acceptance of a pending iWARP
+ * connection. The cr_handle was obtained from the CONNECTION_REQUEST
+ * AE passed up by the adapter. See c2wr_ae_connection_request_t.
+ */
+struct c2wr_cr_accept_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle; /* QP to bind to this LLP conn */
+ u32 ep_handle; /* LLP handle to accept */
+ __be32 private_data_length;
+ u8 private_data[0]; /* data in-line in msg. */
+} __attribute__((packed));
+
+/*
+ * adapter sends reply when private data is successfully submitted to
+ * the LLP.
+ */
+struct c2wr_cr_accept_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_accept {
+ struct c2wr_cr_accept_req req;
+ struct c2wr_cr_accept_rep rep;
+} __attribute__((packed));
+
+/*
+ * The host sends this down if a given iWARP connection request was
+ * rejected by the consumer. The cr_handle was obtained from a
+ * previous c2wr_ae_connection_request_t AE sent by the adapter.
+ */
+struct c2wr_cr_reject_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle; /* LLP handle to reject */
+} __attribute__((packed));
+
+/*
+ * Dunno if this is needed, but we'll add it for now. The adapter will
+ * send the reject_reply after the LLP endpoint has been destroyed.
+ */
+struct c2wr_cr_reject_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_reject {
+ struct c2wr_cr_reject_req req;
+ struct c2wr_cr_reject_rep rep;
+} __attribute__((packed));
+
+/*
+ * console command. Used to implement a debug console over the verbs
+ * request and reply queues.
+ */
+
+/*
+ * Console request message. It contains:
+ * - message hdr with id = CCWR_CONSOLE
+ * - the physaddr/len of host memory to be used for the reply.
+ * - the command string. eg: "netstat -s" or "zoneinfo"
+ */
+struct c2wr_console_req {
+ struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */
+ u64 reply_buf; /* pinned host buf for reply */
+ u32 reply_buf_len; /* length of reply buffer */
+ u8 command[0]; /* NUL terminated ascii string */
+ /* containing the command req */
+} __attribute__((packed));
+
+/*
+ * flags used in the console reply.
+ */
+enum c2_console_flags {
+ CONS_REPLY_TRUNCATED = 0x00000001 /* reply was truncated */
+} __attribute__((packed));
+
+/*
+ * Console reply message.
+ * hdr.result contains the c2_status_t error if the reply was _not_ generated,
+ * or C2_OK if the reply was generated.
+ */
+struct c2wr_console_rep {
+ struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */
+ u32 flags;
+} __attribute__((packed));
+
+union c2wr_console {
+ struct c2wr_console_req req;
+ struct c2wr_console_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * Giant union with all WRs. Makes life easier...
+ */
+union c2wr {
+ struct c2wr_hdr hdr;
+ struct c2wr_user_hdr user_hdr;
+ union c2wr_rnic_open rnic_open;
+ union c2wr_rnic_query rnic_query;
+ union c2wr_rnic_getconfig rnic_getconfig;
+ union c2wr_rnic_setconfig rnic_setconfig;
+ union c2wr_rnic_close rnic_close;
+ union c2wr_cq_create cq_create;
+ union c2wr_cq_modify cq_modify;
+ union c2wr_cq_destroy cq_destroy;
+ union c2wr_pd_alloc pd_alloc;
+ union c2wr_pd_dealloc pd_dealloc;
+ union c2wr_srq_create srq_create;
+ union c2wr_srq_destroy srq_destroy;
+ union c2wr_qp_create qp_create;
+ union c2wr_qp_query qp_query;
+ union c2wr_qp_modify qp_modify;
+ union c2wr_qp_destroy qp_destroy;
+ struct c2wr_qp_connect qp_connect;
+ union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
+ union c2wr_nsmr_register nsmr_register;
+ union c2wr_nsmr_pbl nsmr_pbl;
+ union c2wr_mr_query mr_query;
+ union c2wr_mw_query mw_query;
+ union c2wr_stag_dealloc stag_dealloc;
+ union c2wr_sqwr sqwr;
+ struct c2wr_rqwr rqwr;
+ struct c2wr_ce ce;
+ union c2wr_ae ae;
+ union c2wr_init init;
+ union c2wr_ep_listen_create ep_listen_create;
+ union c2wr_ep_listen_destroy ep_listen_destroy;
+ union c2wr_cr_accept cr_accept;
+ union c2wr_cr_reject cr_reject;
+ union c2wr_console console;
+ union c2wr_flash_init flash_init;
+ union c2wr_flash flash;
+ union c2wr_buf_alloc buf_alloc;
+ union c2wr_buf_free buf_free;
+ union c2wr_flash_write flash_write;
+} __attribute__((packed));
+
+
+/*
+ * Accessors for the wr fields that are packed together tightly to
+ * reduce the wr message size. The wr arguments are void* so that
+ * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
+ * in the struct c2wr union can be passed in.
+ */
+static __inline__ u8 c2_wr_get_id(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->id;
+}
+static __inline__ void c2_wr_set_id(void *wr, u8 id)
+{
+ ((struct c2wr_hdr *) wr)->id = id;
+}
+static __inline__ u8 c2_wr_get_result(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->result;
+}
+static __inline__ void c2_wr_set_result(void *wr, u8 result)
+{
+ ((struct c2wr_hdr *) wr)->result = result;
+}
+static __inline__ u8 c2_wr_get_flags(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->flags;
+}
+static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
+{
+ ((struct c2wr_hdr *) wr)->flags = flags;
+}
+static __inline__ u8 c2_wr_get_sge_count(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->sge_count;
+}
+static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
+{
+ ((struct c2wr_hdr *) wr)->sge_count = sge_count;
+}
+static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->wqe_count;
+}
+static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
+{
+ ((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
+}
+
+#endif /* _C2_WR_H_ */
diff --git a/drivers/staging/rdma/ehca/Kconfig b/drivers/staging/rdma/ehca/Kconfig
new file mode 100644
index 000000000000..3fadd2ad6426
--- /dev/null
+++ b/drivers/staging/rdma/ehca/Kconfig
@@ -0,0 +1,10 @@
+config INFINIBAND_EHCA
+ tristate "eHCA support"
+ depends on IBMEBUS
+ ---help---
+ This driver supports the deprecated IBM pSeries eHCA InfiniBand
+ adapter.
+
+ To compile the driver as a module, choose M here. The module
+ will be called ib_ehca.
+
diff --git a/drivers/staging/rdma/ehca/Makefile b/drivers/staging/rdma/ehca/Makefile
new file mode 100644
index 000000000000..74d284e46a40
--- /dev/null
+++ b/drivers/staging/rdma/ehca/Makefile
@@ -0,0 +1,16 @@
+# Authors: Heiko J Schick <schickhj@de.ibm.com>
+# Christoph Raisch <raisch@de.ibm.com>
+# Joachim Fenkes <fenkes@de.ibm.com>
+#
+# Copyright (c) 2005 IBM Corporation
+#
+# All rights reserved.
+#
+# This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
+
+obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
+
+ib_ehca-objs = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
+ ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
+ ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
+
diff --git a/drivers/staging/rdma/ehca/TODO b/drivers/staging/rdma/ehca/TODO
new file mode 100644
index 000000000000..199a4a600142
--- /dev/null
+++ b/drivers/staging/rdma/ehca/TODO
@@ -0,0 +1,4 @@
+9/2015
+
+The ehca driver has been deprecated and moved to drivers/staging/rdma.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/ehca/ehca_av.c b/drivers/staging/rdma/ehca/ehca_av.c
new file mode 100644
index 000000000000..465926319f3d
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_av.c
@@ -0,0 +1,277 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * address vector functions
+ *
+ * Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Khadija Souissi <souissik@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *av_cache;
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+ enum ib_rate path_rate, u32 *ipd)
+{
+ int path = ib_rate_to_mult(path_rate);
+ int link, ret;
+ struct ib_port_attr pa;
+
+ if (path_rate == IB_RATE_PORT_CURRENT) {
+ *ipd = 0;
+ return 0;
+ }
+
+ if (unlikely(path < 0)) {
+ ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
+ path_rate);
+ return -EINVAL;
+ }
+
+ ret = ehca_query_port(&shca->ib_device, port, &pa);
+ if (unlikely(ret < 0)) {
+ ehca_err(&shca->ib_device, "Failed to query port ret=%i", ret);
+ return ret;
+ }
+
+ link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
+
+ if (path >= link)
+ /* no need to throttle if path faster than link */
+ *ipd = 0;
+ else
+ /* IPD = round((link / path) - 1) */
+ *ipd = ((link + (path >> 1)) / path) - 1;
+
+ return 0;
+}
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ struct ehca_av *av;
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+
+ av = kmem_cache_alloc(av_cache, GFP_KERNEL);
+ if (!av) {
+ ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
+ pd, ah_attr);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ av->av.sl = ah_attr->sl;
+ av->av.dlid = ah_attr->dlid;
+ av->av.slid_path_bits = ah_attr->src_path_bits;
+
+ if (ehca_static_rate < 0) {
+ u32 ipd;
+ if (ehca_calc_ipd(shca, ah_attr->port_num,
+ ah_attr->static_rate, &ipd)) {
+ ret = -EINVAL;
+ goto create_ah_exit1;
+ }
+ av->av.ipd = ipd;
+ } else
+ av->av.ipd = ehca_static_rate;
+
+ av->av.lnh = ah_attr->ah_flags;
+ av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
+ ah_attr->grh.traffic_class);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+ ah_attr->grh.flow_label);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+ ah_attr->grh.hop_limit);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
+ /* set sgid in grh.word_1 */
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ int rc;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ memset(&port_attr, 0, sizeof(port_attr));
+ rc = ehca_query_port(pd->device, ah_attr->port_num,
+ &port_attr);
+ if (rc) { /* invalid port number */
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid port number "
+ "ehca_query_port() returned %x "
+ "pd=%p ah_attr=%p", rc, pd, ah_attr);
+ goto create_ah_exit1;
+ }
+ memset(&gid, 0, sizeof(gid));
+ rc = ehca_query_gid(pd->device,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index, &gid);
+ if (rc) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Failed to retrieve sgid "
+ "ehca_query_gid() returned %x "
+ "pd=%p ah_attr=%p", rc, pd, ah_attr);
+ goto create_ah_exit1;
+ }
+ memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
+ }
+ av->av.pmtu = shca->max_mtu;
+
+ /* dgid comes in grh.word_3 */
+ memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
+ sizeof(ah_attr->grh.dgid));
+
+ return &av->ib_ah;
+
+create_ah_exit1:
+ kmem_cache_free(av_cache, av);
+
+ return ERR_PTR(ret);
+}
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ struct ehca_av *av;
+ struct ehca_ud_av new_ehca_av;
+ struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
+ ib_device);
+
+ memset(&new_ehca_av, 0, sizeof(new_ehca_av));
+ new_ehca_av.sl = ah_attr->sl;
+ new_ehca_av.dlid = ah_attr->dlid;
+ new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
+ new_ehca_av.ipd = ah_attr->static_rate;
+ new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
+ (ah_attr->ah_flags & IB_AH_GRH) > 0);
+ new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
+ ah_attr->grh.traffic_class);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+ ah_attr->grh.flow_label);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+ ah_attr->grh.hop_limit);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
+
+ /* set sgid in grh.word_1 */
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ int rc;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ memset(&port_attr, 0, sizeof(port_attr));
+ rc = ehca_query_port(ah->device, ah_attr->port_num,
+ &port_attr);
+ if (rc) { /* invalid port number */
+ ehca_err(ah->device, "Invalid port number "
+ "ehca_query_port() returned %x "
+ "ah=%p ah_attr=%p port_num=%x",
+ rc, ah, ah_attr, ah_attr->port_num);
+ return -EINVAL;
+ }
+ memset(&gid, 0, sizeof(gid));
+ rc = ehca_query_gid(ah->device,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index, &gid);
+ if (rc) {
+ ehca_err(ah->device, "Failed to retrieve sgid "
+ "ehca_query_gid() returned %x "
+ "ah=%p ah_attr=%p port_num=%x "
+ "sgid_index=%x",
+ rc, ah, ah_attr, ah_attr->port_num,
+ ah_attr->grh.sgid_index);
+ return -EINVAL;
+ }
+ memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
+ }
+
+ new_ehca_av.pmtu = shca->max_mtu;
+
+ memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
+ sizeof(ah_attr->grh.dgid));
+
+ av = container_of(ah, struct ehca_av, ib_ah);
+ av->av = new_ehca_av;
+
+ return 0;
+}
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
+
+ memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
+ sizeof(ah_attr->grh.dgid));
+ ah_attr->sl = av->av.sl;
+
+ ah_attr->dlid = av->av.dlid;
+
+ ah_attr->src_path_bits = av->av.slid_path_bits;
+ ah_attr->static_rate = av->av.ipd;
+ ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
+ ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
+ av->av.grh.word_0);
+ ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
+ av->av.grh.word_0);
+ ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
+ av->av.grh.word_0);
+
+ return 0;
+}
+
+int ehca_destroy_ah(struct ib_ah *ah)
+{
+ kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
+
+ return 0;
+}
+
+int ehca_init_av_cache(void)
+{
+ av_cache = kmem_cache_create("ehca_cache_av",
+ sizeof(struct ehca_av), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!av_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_av_cache(void)
+{
+ if (av_cache)
+ kmem_cache_destroy(av_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h
new file mode 100644
index 000000000000..bd45e0f3923f
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_classes.h
@@ -0,0 +1,482 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Struct definition for eHCA internal structures
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_H__
+#define __EHCA_CLASSES_H__
+
+struct ehca_module;
+struct ehca_qp;
+struct ehca_cq;
+struct ehca_eq;
+struct ehca_mr;
+struct ehca_mw;
+struct ehca_pd;
+struct ehca_av;
+
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#ifdef CONFIG_PPC64
+#include "ehca_classes_pSeries.h"
+#endif
+#include "ipz_pt_fn.h"
+#include "ehca_qes.h"
+#include "ehca_irq.h"
+
+#define EHCA_EQE_CACHE_SIZE 20
+#define EHCA_MAX_NUM_QUEUES 0xffff
+
+struct ehca_eqe_cache_entry {
+ struct ehca_eqe *eqe;
+ struct ehca_cq *cq;
+};
+
+struct ehca_eq {
+ u32 length;
+ struct ipz_queue ipz_queue;
+ struct ipz_eq_handle ipz_eq_handle;
+ struct work_struct work;
+ struct h_galpas galpas;
+ int is_initialized;
+ struct ehca_pfeq pf;
+ spinlock_t spinlock;
+ struct tasklet_struct interrupt_task;
+ u32 ist;
+ spinlock_t irq_spinlock;
+ struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
+};
+
+struct ehca_sma_attr {
+ u16 lid, lmc, sm_sl, sm_lid;
+ u16 pkey_tbl_len, pkeys[16];
+};
+
+struct ehca_sport {
+ struct ib_cq *ibcq_aqp1;
+ struct ib_qp *ibqp_sqp[2];
+ /* lock to serialze modify_qp() calls for sqp in normal
+ * and irq path (when event PORT_ACTIVE is received first time)
+ */
+ spinlock_t mod_sqp_lock;
+ enum ib_port_state port_state;
+ struct ehca_sma_attr saved_attr;
+ u32 pma_qp_nr;
+};
+
+#define HCA_CAP_MR_PGSIZE_4K 0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M 0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
+
+struct ehca_shca {
+ struct ib_device ib_device;
+ struct platform_device *ofdev;
+ u8 num_ports;
+ int hw_level;
+ struct list_head shca_list;
+ struct ipz_adapter_handle ipz_hca_handle;
+ struct ehca_sport sport[2];
+ struct ehca_eq eq;
+ struct ehca_eq neq;
+ struct ehca_mr *maxmr;
+ struct ehca_pd *pd;
+ struct h_galpas galpas;
+ struct mutex modify_mutex;
+ u64 hca_cap;
+ /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
+ u32 hca_cap_mr_pgsize;
+ int max_mtu;
+ int max_num_qps;
+ int max_num_cqs;
+ atomic_t num_cqs;
+ atomic_t num_qps;
+};
+
+struct ehca_pd {
+ struct ib_pd ib_pd;
+ struct ipz_pd fw_pd;
+ /* small queue mgmt */
+ struct mutex lock;
+ struct list_head free[2];
+ struct list_head full[2];
+};
+
+enum ehca_ext_qp_type {
+ EQPT_NORMAL = 0,
+ EQPT_LLQP = 1,
+ EQPT_SRQBASE = 2,
+ EQPT_SRQ = 3,
+};
+
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+ int mask;
+ struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
+#define QMAP_IDX_MASK 0xFFFFULL
+
+/* struct for tracking if cqes have been reported to the application */
+struct ehca_qmap_entry {
+ u16 app_wr_id;
+ u8 reported;
+ u8 cqe_req;
+};
+
+struct ehca_queue_map {
+ struct ehca_qmap_entry *map;
+ unsigned int entries;
+ unsigned int tail;
+ unsigned int left_to_poll;
+ unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */
+};
+
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+ unsigned int temp = cur_index + 1;
+ return (temp == limit) ? 0 : temp;
+}
+
+struct ehca_qp {
+ union {
+ struct ib_qp ib_qp;
+ struct ib_srq ib_srq;
+ };
+ u32 qp_type;
+ enum ehca_ext_qp_type ext_type;
+ enum ib_qp_state state;
+ struct ipz_queue ipz_squeue;
+ struct ehca_queue_map sq_map;
+ struct ipz_queue ipz_rqueue;
+ struct ehca_queue_map rq_map;
+ struct h_galpas galpas;
+ u32 qkey;
+ u32 real_qp_num;
+ u32 token;
+ spinlock_t spinlock_s;
+ spinlock_t spinlock_r;
+ u32 sq_max_inline_data_size;
+ struct ipz_qp_handle ipz_qp_handle;
+ struct ehca_pfqp pf;
+ struct ib_qp_init_attr init_attr;
+ struct ehca_cq *send_cq;
+ struct ehca_cq *recv_cq;
+ unsigned int sqerr_purgeflag;
+ struct hlist_node list_entries;
+ /* array to cache modify_qp()'s parms for GSI/SMI qp */
+ struct ehca_mod_qp_parm *mod_qp_parm;
+ int mod_qp_parm_idx;
+ /* mmap counter for resources mapped into user space */
+ u32 mm_count_squeue;
+ u32 mm_count_rqueue;
+ u32 mm_count_galpa;
+ /* unsolicited ack circumvention */
+ int unsol_ack_circ;
+ int mtu_shift;
+ u32 message_count;
+ u32 packet_count;
+ atomic_t nr_events; /* events seen */
+ wait_queue_head_t wait_completion;
+ int mig_armed;
+ struct list_head sq_err_node;
+ struct list_head rq_err_node;
+};
+
+#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
+#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
+
+/* must be power of 2 */
+#define QP_HASHTAB_LEN 8
+
+struct ehca_cq {
+ struct ib_cq ib_cq;
+ struct ipz_queue ipz_queue;
+ struct h_galpas galpas;
+ spinlock_t spinlock;
+ u32 cq_number;
+ u32 token;
+ u32 nr_of_entries;
+ struct ipz_cq_handle ipz_cq_handle;
+ struct ehca_pfcq pf;
+ spinlock_t cb_lock;
+ struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
+ struct list_head entry;
+ u32 nr_callbacks; /* #events assigned to cpu by scaling code */
+ atomic_t nr_events; /* #events seen */
+ wait_queue_head_t wait_completion;
+ spinlock_t task_lock;
+ /* mmap counter for resources mapped into user space */
+ u32 mm_count_queue;
+ u32 mm_count_galpa;
+ struct list_head sqp_err_list;
+ struct list_head rqp_err_list;
+};
+
+enum ehca_mr_flag {
+ EHCA_MR_FLAG_FMR = 0x80000000, /* FMR, created with ehca_alloc_fmr */
+ EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR */
+};
+
+struct ehca_mr {
+ union {
+ struct ib_mr ib_mr; /* must always be first in ehca_mr */
+ struct ib_fmr ib_fmr; /* must always be first in ehca_mr */
+ } ib;
+ struct ib_umem *umem;
+ spinlock_t mrlock;
+
+ enum ehca_mr_flag flags;
+ u32 num_kpages; /* number of kernel pages */
+ u32 num_hwpages; /* number of hw pages to form MR */
+ u64 hwpage_size; /* hw page size used for this MR */
+ int acl; /* ACL (stored here for usage in reregister) */
+ u64 *start; /* virtual start address (stored here for */
+ /* usage in reregister) */
+ u64 size; /* size (stored here for usage in reregister) */
+ u32 fmr_page_size; /* page size for FMR */
+ u32 fmr_max_pages; /* max pages for FMR */
+ u32 fmr_max_maps; /* max outstanding maps for FMR */
+ u32 fmr_map_cnt; /* map counter for FMR */
+ /* fw specific data */
+ struct ipz_mrmw_handle ipz_mr_handle; /* MR handle for h-calls */
+ struct h_galpas galpas;
+};
+
+struct ehca_mw {
+ struct ib_mw ib_mw; /* gen2 mw, must always be first in ehca_mw */
+ spinlock_t mwlock;
+
+ u8 never_bound; /* indication MW was never bound */
+ struct ipz_mrmw_handle ipz_mw_handle; /* MW handle for h-calls */
+ struct h_galpas galpas;
+};
+
+enum ehca_mr_pgi_type {
+ EHCA_MR_PGI_PHYS = 1, /* type of ehca_reg_phys_mr,
+ * ehca_rereg_phys_mr,
+ * ehca_reg_internal_maxmr */
+ EHCA_MR_PGI_USER = 2, /* type of ehca_reg_user_mr */
+ EHCA_MR_PGI_FMR = 3 /* type of ehca_map_phys_fmr */
+};
+
+struct ehca_mr_pginfo {
+ enum ehca_mr_pgi_type type;
+ u64 num_kpages;
+ u64 kpage_cnt;
+ u64 hwpage_size; /* hw page size used for this MR */
+ u64 num_hwpages; /* number of hw pages */
+ u64 hwpage_cnt; /* counter for hw pages */
+ u64 next_hwpage; /* next hw page in buffer/chunk/listelem */
+
+ union {
+ struct { /* type EHCA_MR_PGI_PHYS section */
+ int num_phys_buf;
+ struct ib_phys_buf *phys_buf_array;
+ u64 next_buf;
+ } phy;
+ struct { /* type EHCA_MR_PGI_USER section */
+ struct ib_umem *region;
+ struct scatterlist *next_sg;
+ u64 next_nmap;
+ } usr;
+ struct { /* type EHCA_MR_PGI_FMR section */
+ u64 fmr_pgsize;
+ u64 *page_list;
+ u64 next_listelem;
+ } fmr;
+ } u;
+};
+
+/* output parameters for MR/FMR hipz calls */
+struct ehca_mr_hipzout_parms {
+ struct ipz_mrmw_handle handle;
+ u32 lkey;
+ u32 rkey;
+ u64 len;
+ u64 vaddr;
+ u32 acl;
+};
+
+/* output parameters for MW hipz calls */
+struct ehca_mw_hipzout_parms {
+ struct ipz_mrmw_handle handle;
+ u32 rkey;
+};
+
+struct ehca_av {
+ struct ib_ah ib_ah;
+ struct ehca_ud_av av;
+};
+
+struct ehca_ucontext {
+ struct ib_ucontext ib_ucontext;
+};
+
+int ehca_init_pd_cache(void);
+void ehca_cleanup_pd_cache(void);
+int ehca_init_cq_cache(void);
+void ehca_cleanup_cq_cache(void);
+int ehca_init_qp_cache(void);
+void ehca_cleanup_qp_cache(void);
+int ehca_init_av_cache(void);
+void ehca_cleanup_av_cache(void);
+int ehca_init_mrmw_cache(void);
+void ehca_cleanup_mrmw_cache(void);
+int ehca_init_small_qp_cache(void);
+void ehca_cleanup_small_qp_cache(void);
+
+extern rwlock_t ehca_qp_idr_lock;
+extern rwlock_t ehca_cq_idr_lock;
+extern struct idr ehca_qp_idr;
+extern struct idr ehca_cq_idr;
+extern spinlock_t shca_list_lock;
+
+extern int ehca_static_rate;
+extern int ehca_port_act_time;
+extern bool ehca_use_hp_mr;
+extern bool ehca_scaling_code;
+extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
+extern int ehca_max_cq;
+extern int ehca_max_qp;
+
+struct ipzu_queue_resp {
+ u32 qe_size; /* queue entry size */
+ u32 act_nr_of_sg;
+ u32 queue_length; /* queue length allocated in bytes */
+ u32 pagesize;
+ u32 toggle_state;
+ u32 offset; /* save offset within a page for small_qp */
+};
+
+struct ehca_create_cq_resp {
+ u32 cq_number;
+ u32 token;
+ struct ipzu_queue_resp ipz_queue;
+ u32 fw_handle_ofs;
+ u32 dummy;
+};
+
+struct ehca_create_qp_resp {
+ u32 qp_num;
+ u32 token;
+ u32 qp_type;
+ u32 ext_type;
+ u32 qkey;
+ /* qp_num assigned by ehca: sqp0/1 may have got different numbers */
+ u32 real_qp_num;
+ u32 fw_handle_ofs;
+ u32 dummy;
+ struct ipzu_queue_resp ipz_squeue;
+ struct ipzu_queue_resp ipz_rqueue;
+};
+
+struct ehca_alloc_cq_parms {
+ u32 nr_cqe;
+ u32 act_nr_of_entries;
+ u32 act_pages;
+ struct ipz_eq_handle eq_handle;
+};
+
+enum ehca_service_type {
+ ST_RC = 0,
+ ST_UC = 1,
+ ST_RD = 2,
+ ST_UD = 3,
+};
+
+enum ehca_ll_comp_flags {
+ LLQP_SEND_COMP = 0x20,
+ LLQP_RECV_COMP = 0x40,
+ LLQP_COMP_MASK = 0x60,
+};
+
+struct ehca_alloc_queue_parms {
+ /* input parameters */
+ int max_wr;
+ int max_sge;
+ int page_size;
+ int is_small;
+
+ /* output parameters */
+ u16 act_nr_wqes;
+ u8 act_nr_sges;
+ u32 queue_size; /* bytes for small queues, pages otherwise */
+};
+
+struct ehca_alloc_qp_parms {
+ struct ehca_alloc_queue_parms squeue;
+ struct ehca_alloc_queue_parms rqueue;
+
+ /* input parameters */
+ enum ehca_service_type servicetype;
+ int qp_storage;
+ int sigtype;
+ enum ehca_ext_qp_type ext_type;
+ enum ehca_ll_comp_flags ll_comp_flags;
+ int ud_av_l_key_ctl;
+
+ u32 token;
+ struct ipz_eq_handle eq_handle;
+ struct ipz_pd pd;
+ struct ipz_cq_handle send_cq_handle, recv_cq_handle;
+
+ u32 srq_qpn, srq_token, srq_limit;
+
+ /* output parameters */
+ u32 real_qp_num;
+ struct ipz_qp_handle qp_handle;
+ struct h_galpas galpas;
+};
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_classes_pSeries.h b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h
new file mode 100644
index 000000000000..689c35786dd2
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h
@@ -0,0 +1,208 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * pSeries interface definitions
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_PSERIES_H__
+#define __EHCA_CLASSES_PSERIES_H__
+
+#include "hcp_phyp.h"
+#include "ipz_pt_fn.h"
+
+
+struct ehca_pfqp {
+ struct ipz_qpt sqpt;
+ struct ipz_qpt rqpt;
+};
+
+struct ehca_pfcq {
+ struct ipz_qpt qpt;
+ u32 cqnr;
+};
+
+struct ehca_pfeq {
+ struct ipz_qpt qpt;
+ struct h_galpa galpa;
+ u32 eqnr;
+};
+
+struct ipz_adapter_handle {
+ u64 handle;
+};
+
+struct ipz_cq_handle {
+ u64 handle;
+};
+
+struct ipz_eq_handle {
+ u64 handle;
+};
+
+struct ipz_qp_handle {
+ u64 handle;
+};
+struct ipz_mrmw_handle {
+ u64 handle;
+};
+
+struct ipz_pd {
+ u32 value;
+};
+
+struct hcp_modify_qp_control_block {
+ u32 qkey; /* 00 */
+ u32 rdd; /* reliable datagram domain */
+ u32 send_psn; /* 02 */
+ u32 receive_psn; /* 03 */
+ u32 prim_phys_port; /* 04 */
+ u32 alt_phys_port; /* 05 */
+ u32 prim_p_key_idx; /* 06 */
+ u32 alt_p_key_idx; /* 07 */
+ u32 rdma_atomic_ctrl; /* 08 */
+ u32 qp_state; /* 09 */
+ u32 reserved_10; /* 10 */
+ u32 rdma_nr_atomic_resp_res; /* 11 */
+ u32 path_migration_state; /* 12 */
+ u32 rdma_atomic_outst_dest_qp; /* 13 */
+ u32 dest_qp_nr; /* 14 */
+ u32 min_rnr_nak_timer_field; /* 15 */
+ u32 service_level; /* 16 */
+ u32 send_grh_flag; /* 17 */
+ u32 retry_count; /* 18 */
+ u32 timeout; /* 19 */
+ u32 path_mtu; /* 20 */
+ u32 max_static_rate; /* 21 */
+ u32 dlid; /* 22 */
+ u32 rnr_retry_count; /* 23 */
+ u32 source_path_bits; /* 24 */
+ u32 traffic_class; /* 25 */
+ u32 hop_limit; /* 26 */
+ u32 source_gid_idx; /* 27 */
+ u32 flow_label; /* 28 */
+ u32 reserved_29; /* 29 */
+ union { /* 30 */
+ u64 dw[2];
+ u8 byte[16];
+ } dest_gid;
+ u32 service_level_al; /* 34 */
+ u32 send_grh_flag_al; /* 35 */
+ u32 retry_count_al; /* 36 */
+ u32 timeout_al; /* 37 */
+ u32 max_static_rate_al; /* 38 */
+ u32 dlid_al; /* 39 */
+ u32 rnr_retry_count_al; /* 40 */
+ u32 source_path_bits_al; /* 41 */
+ u32 traffic_class_al; /* 42 */
+ u32 hop_limit_al; /* 43 */
+ u32 source_gid_idx_al; /* 44 */
+ u32 flow_label_al; /* 45 */
+ u32 reserved_46; /* 46 */
+ u32 reserved_47; /* 47 */
+ union { /* 48 */
+ u64 dw[2];
+ u8 byte[16];
+ } dest_gid_al;
+ u32 max_nr_outst_send_wr; /* 52 */
+ u32 max_nr_outst_recv_wr; /* 53 */
+ u32 disable_ete_credit_check; /* 54 */
+ u32 qp_number; /* 55 */
+ u64 send_queue_handle; /* 56 */
+ u64 recv_queue_handle; /* 58 */
+ u32 actual_nr_sges_in_sq_wqe; /* 60 */
+ u32 actual_nr_sges_in_rq_wqe; /* 61 */
+ u32 qp_enable; /* 62 */
+ u32 curr_srq_limit; /* 63 */
+ u64 qp_aff_asyn_ev_log_reg; /* 64 */
+ u64 shared_rq_hndl; /* 66 */
+ u64 trigg_doorbell_qp_hndl; /* 68 */
+ u32 reserved_70_127[58]; /* 70 */
+};
+
+#define MQPCB_MASK_QKEY EHCA_BMASK_IBM( 0, 0)
+#define MQPCB_MASK_SEND_PSN EHCA_BMASK_IBM( 2, 2)
+#define MQPCB_MASK_RECEIVE_PSN EHCA_BMASK_IBM( 3, 3)
+#define MQPCB_MASK_PRIM_PHYS_PORT EHCA_BMASK_IBM( 4, 4)
+#define MQPCB_PRIM_PHYS_PORT EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_PHYS_PORT EHCA_BMASK_IBM( 5, 5)
+#define MQPCB_MASK_PRIM_P_KEY_IDX EHCA_BMASK_IBM( 6, 6)
+#define MQPCB_PRIM_P_KEY_IDX EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7)
+#define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8)
+#define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9)
+#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11)
+#define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12)
+#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13)
+#define MQPCB_MASK_DEST_QP_NR EHCA_BMASK_IBM(14, 14)
+#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD EHCA_BMASK_IBM(15, 15)
+#define MQPCB_MASK_SERVICE_LEVEL EHCA_BMASK_IBM(16, 16)
+#define MQPCB_MASK_SEND_GRH_FLAG EHCA_BMASK_IBM(17, 17)
+#define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18)
+#define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19)
+#define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20)
+#define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21)
+#define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22)
+#define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23)
+#define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24)
+#define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25)
+#define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26)
+#define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27)
+#define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28)
+#define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30)
+#define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31)
+#define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32)
+#define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33)
+#define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34)
+#define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35)
+#define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36)
+#define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37)
+#define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38)
+#define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39)
+#define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40)
+#define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41)
+#define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42)
+#define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44)
+#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45)
+#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46)
+#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47)
+#define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48)
+#define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49)
+#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50)
+#define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51)
+
+#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/staging/rdma/ehca/ehca_cq.c b/drivers/staging/rdma/ehca/ehca_cq.c
new file mode 100644
index 000000000000..9b68b175069b
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_cq.c
@@ -0,0 +1,397 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Completion queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *cq_cache;
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
+{
+ unsigned int qp_num = qp->real_qp_num;
+ unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+ hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
+ cq->cq_number, qp_num);
+
+ return 0;
+}
+
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
+{
+ int ret = -EINVAL;
+ unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+ struct hlist_node *iter;
+ struct ehca_qp *qp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+ hlist_for_each(iter, &cq->qp_hashtab[key]) {
+ qp = hlist_entry(iter, struct ehca_qp, list_entries);
+ if (qp->real_qp_num == real_qp_num) {
+ hlist_del(iter);
+ ehca_dbg(cq->ib_cq.device,
+ "removed qp from cq .cq_num=%x real_qp_num=%x",
+ cq->cq_number, real_qp_num);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+ if (ret)
+ ehca_err(cq->ib_cq.device,
+ "qp not found cq_num=%x real_qp_num=%x",
+ cq->cq_number, real_qp_num);
+
+ return ret;
+}
+
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
+{
+ struct ehca_qp *ret = NULL;
+ unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+ struct hlist_node *iter;
+ struct ehca_qp *qp;
+ hlist_for_each(iter, &cq->qp_hashtab[key]) {
+ qp = hlist_entry(iter, struct ehca_qp, list_entries);
+ if (qp->real_qp_num == real_qp_num) {
+ ret = qp;
+ break;
+ }
+ }
+ return ret;
+}
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ int cqe = attr->cqe;
+ static const u32 additional_cqe = 20;
+ struct ib_cq *cq;
+ struct ehca_cq *my_cq;
+ struct ehca_shca *shca =
+ container_of(device, struct ehca_shca, ib_device);
+ struct ipz_adapter_handle adapter_handle;
+ struct ehca_alloc_cq_parms param; /* h_call's out parameters */
+ struct h_galpa gal;
+ void *vpage;
+ u32 counter;
+ u64 rpage, cqx_fec, h_ret;
+ int ipz_rc, i;
+ unsigned long flags;
+
+ if (attr->flags)
+ return ERR_PTR(-EINVAL);
+
+ if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
+ return ERR_PTR(-EINVAL);
+
+ if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
+ ehca_err(device, "Unable to create CQ, max number of %i "
+ "CQs reached.", shca->max_num_cqs);
+ ehca_err(device, "To increase the maximum number of CQs "
+ "use the number_of_cqs module parameter.\n");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
+ if (!my_cq) {
+ ehca_err(device, "Out of memory for ehca_cq struct device=%p",
+ device);
+ atomic_dec(&shca->num_cqs);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
+
+ spin_lock_init(&my_cq->spinlock);
+ spin_lock_init(&my_cq->cb_lock);
+ spin_lock_init(&my_cq->task_lock);
+ atomic_set(&my_cq->nr_events, 0);
+ init_waitqueue_head(&my_cq->wait_completion);
+
+ cq = &my_cq->ib_cq;
+
+ adapter_handle = shca->ipz_hca_handle;
+ param.eq_handle = shca->eq.ipz_eq_handle;
+
+ idr_preload(GFP_KERNEL);
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+ idr_preload_end();
+
+ if (my_cq->token < 0) {
+ cq = ERR_PTR(-ENOMEM);
+ ehca_err(device, "Can't allocate new idr entry. device=%p",
+ device);
+ goto create_cq_exit1;
+ }
+
+ /*
+ * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
+ * for receiving errors CQEs.
+ */
+ param.nr_cqe = cqe + additional_cqe;
+ h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(device, "hipz_h_alloc_resource_cq() failed "
+ "h_ret=%lli device=%p", h_ret, device);
+ cq = ERR_PTR(ehca2ib_return_code(h_ret));
+ goto create_cq_exit2;
+ }
+
+ ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
+ EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
+ if (!ipz_rc) {
+ ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
+ ipz_rc, device);
+ cq = ERR_PTR(-EINVAL);
+ goto create_cq_exit3;
+ }
+
+ for (counter = 0; counter < param.act_pages; counter++) {
+ vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+ if (!vpage) {
+ ehca_err(device, "ipz_qpageit_get_inc() "
+ "returns NULL device=%p", device);
+ cq = ERR_PTR(-EAGAIN);
+ goto create_cq_exit4;
+ }
+ rpage = __pa(vpage);
+
+ h_ret = hipz_h_register_rpage_cq(adapter_handle,
+ my_cq->ipz_cq_handle,
+ &my_cq->pf,
+ 0,
+ 0,
+ rpage,
+ 1,
+ my_cq->galpas.
+ kernel);
+
+ if (h_ret < H_SUCCESS) {
+ ehca_err(device, "hipz_h_register_rpage_cq() failed "
+ "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
+ "act_pages=%i", my_cq, my_cq->cq_number,
+ h_ret, counter, param.act_pages);
+ cq = ERR_PTR(-EINVAL);
+ goto create_cq_exit4;
+ }
+
+ if (counter == (param.act_pages - 1)) {
+ vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+ if ((h_ret != H_SUCCESS) || vpage) {
+ ehca_err(device, "Registration of pages not "
+ "complete ehca_cq=%p cq_num=%x "
+ "h_ret=%lli", my_cq, my_cq->cq_number,
+ h_ret);
+ cq = ERR_PTR(-EAGAIN);
+ goto create_cq_exit4;
+ }
+ } else {
+ if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(device, "Registration of page failed "
+ "ehca_cq=%p cq_num=%x h_ret=%lli "
+ "counter=%i act_pages=%i",
+ my_cq, my_cq->cq_number,
+ h_ret, counter, param.act_pages);
+ cq = ERR_PTR(-ENOMEM);
+ goto create_cq_exit4;
+ }
+ }
+ }
+
+ ipz_qeit_reset(&my_cq->ipz_queue);
+
+ gal = my_cq->galpas.kernel;
+ cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
+ ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
+ my_cq, my_cq->cq_number, cqx_fec);
+
+ my_cq->ib_cq.cqe = my_cq->nr_of_entries =
+ param.act_nr_of_entries - additional_cqe;
+ my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
+
+ for (i = 0; i < QP_HASHTAB_LEN; i++)
+ INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+
+ INIT_LIST_HEAD(&my_cq->sqp_err_list);
+ INIT_LIST_HEAD(&my_cq->rqp_err_list);
+
+ if (context) {
+ struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+ struct ehca_create_cq_resp resp;
+ memset(&resp, 0, sizeof(resp));
+ resp.cq_number = my_cq->cq_number;
+ resp.token = my_cq->token;
+ resp.ipz_queue.qe_size = ipz_queue->qe_size;
+ resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
+ resp.ipz_queue.queue_length = ipz_queue->queue_length;
+ resp.ipz_queue.pagesize = ipz_queue->pagesize;
+ resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+ resp.fw_handle_ofs = (u32)
+ (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
+ if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+ ehca_err(device, "Copy to udata failed.");
+ cq = ERR_PTR(-EFAULT);
+ goto create_cq_exit4;
+ }
+ }
+
+ return cq;
+
+create_cq_exit4:
+ ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+
+create_cq_exit3:
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+ if (h_ret != H_SUCCESS)
+ ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
+ "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
+
+create_cq_exit2:
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ idr_remove(&ehca_cq_idr, my_cq->token);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+create_cq_exit1:
+ kmem_cache_free(cq_cache, my_cq);
+
+ atomic_dec(&shca->num_cqs);
+ return cq;
+}
+
+int ehca_destroy_cq(struct ib_cq *cq)
+{
+ u64 h_ret;
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int cq_num = my_cq->cq_number;
+ struct ib_device *device = cq->device;
+ struct ehca_shca *shca = container_of(device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ unsigned long flags;
+
+ if (cq->uobject) {
+ if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
+ ehca_err(device, "Resources still referenced in "
+ "user space cq_num=%x", my_cq->cq_number);
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * remove the CQ from the idr first to make sure
+ * no more interrupt tasklets will touch this CQ
+ */
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ idr_remove(&ehca_cq_idr, my_cq->token);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+ /* now wait until all pending events have completed */
+ wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
+
+ /* nobody's using our CQ any longer -- we can destroy it */
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
+ if (h_ret == H_R_STATE) {
+ /* cq in err: read err data and destroy it forcibly */
+ ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
+ "state. Try to delete it forcibly.",
+ my_cq, cq_num, my_cq->ipz_cq_handle.handle);
+ ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+ if (h_ret == H_SUCCESS)
+ ehca_dbg(device, "cq_num=%x deleted successfully.",
+ cq_num);
+ }
+ if (h_ret != H_SUCCESS) {
+ ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
+ "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
+ return ehca2ib_return_code(h_ret);
+ }
+ ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+ kmem_cache_free(cq_cache, my_cq);
+
+ atomic_dec(&shca->num_cqs);
+ return 0;
+}
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+ /* TODO: proper resize needs to be done */
+ ehca_err(cq->device, "not implemented yet");
+
+ return -EFAULT;
+}
+
+int ehca_init_cq_cache(void)
+{
+ cq_cache = kmem_cache_create("ehca_cache_cq",
+ sizeof(struct ehca_cq), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!cq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_cq_cache(void)
+{
+ if (cq_cache)
+ kmem_cache_destroy(cq_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_eq.c b/drivers/staging/rdma/ehca/ehca_eq.c
new file mode 100644
index 000000000000..90da6747d395
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_eq.c
@@ -0,0 +1,189 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Event queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_qes.h"
+#include "hcp_if.h"
+#include "ipz_pt_fn.h"
+
+int ehca_create_eq(struct ehca_shca *shca,
+ struct ehca_eq *eq,
+ const enum ehca_eq_type type, const u32 length)
+{
+ int ret;
+ u64 h_ret;
+ u32 nr_pages;
+ u32 i;
+ void *vpage;
+ struct ib_device *ib_dev = &shca->ib_device;
+
+ spin_lock_init(&eq->spinlock);
+ spin_lock_init(&eq->irq_spinlock);
+ eq->is_initialized = 0;
+
+ if (type != EHCA_EQ && type != EHCA_NEQ) {
+ ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
+ return -EINVAL;
+ }
+ if (!length) {
+ ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
+ return -EINVAL;
+ }
+
+ h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
+ &eq->pf,
+ type,
+ length,
+ &eq->ipz_eq_handle,
+ &eq->length,
+ &nr_pages, &eq->ist);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
+ return -EINVAL;
+ }
+
+ ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
+ EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
+ if (!ret) {
+ ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
+ goto create_eq_exit1;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ u64 rpage;
+
+ vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+ if (!vpage)
+ goto create_eq_exit2;
+
+ rpage = __pa(vpage);
+ h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
+ eq->ipz_eq_handle,
+ &eq->pf,
+ 0, 0, rpage, 1);
+
+ if (i == (nr_pages - 1)) {
+ /* last page */
+ vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+ if (h_ret != H_SUCCESS || vpage)
+ goto create_eq_exit2;
+ } else {
+ if (h_ret != H_PAGE_REGISTERED)
+ goto create_eq_exit2;
+ }
+ }
+
+ ipz_qeit_reset(&eq->ipz_queue);
+
+ /* register interrupt handlers and initialize work queues */
+ if (type == EHCA_EQ) {
+ tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
+
+ ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
+ 0, "ehca_eq",
+ (void *)shca);
+ if (ret < 0)
+ ehca_err(ib_dev, "Can't map interrupt handler.");
+ } else if (type == EHCA_NEQ) {
+ tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
+
+ ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
+ 0, "ehca_neq",
+ (void *)shca);
+ if (ret < 0)
+ ehca_err(ib_dev, "Can't map interrupt handler.");
+ }
+
+ eq->is_initialized = 1;
+
+ return 0;
+
+create_eq_exit2:
+ ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+create_eq_exit1:
+ hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+ return -EINVAL;
+}
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+ unsigned long flags;
+ void *eqe;
+
+ spin_lock_irqsave(&eq->spinlock, flags);
+ eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+
+ return eqe;
+}
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+ unsigned long flags;
+ u64 h_ret;
+
+ ibmebus_free_irq(eq->ist, (void *)shca);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ eq->is_initialized = 0;
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ tasklet_kill(&eq->interrupt_task);
+
+ h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't free EQ resources.");
+ return -EINVAL;
+ }
+ ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+ return 0;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_hca.c b/drivers/staging/rdma/ehca/ehca_hca.c
new file mode 100644
index 000000000000..e8b1bb65797a
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_hca.c
@@ -0,0 +1,414 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HCA query functions
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/gfp.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static unsigned int limit_uint(unsigned int value)
+{
+ return min_t(unsigned int, value, INT_MAX);
+}
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ int i, ret = 0;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_hca *rblock;
+
+ static const u32 cap_mapping[] = {
+ IB_DEVICE_RESIZE_MAX_WR, HCA_CAP_WQE_RESIZE,
+ IB_DEVICE_BAD_PKEY_CNTR, HCA_CAP_BAD_P_KEY_CTR,
+ IB_DEVICE_BAD_QKEY_CNTR, HCA_CAP_Q_KEY_VIOL_CTR,
+ IB_DEVICE_RAW_MULTI, HCA_CAP_RAW_PACKET_MCAST,
+ IB_DEVICE_AUTO_PATH_MIG, HCA_CAP_AUTO_PATH_MIG,
+ IB_DEVICE_CHANGE_PHY_PORT, HCA_CAP_SQD_RTS_PORT_CHANGE,
+ IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
+ IB_DEVICE_CURR_QP_STATE_MOD, HCA_CAP_CUR_QP_STATE_MOD,
+ IB_DEVICE_SHUTDOWN_PORT, HCA_CAP_SHUTDOWN_PORT,
+ IB_DEVICE_INIT_TYPE, HCA_CAP_INIT_TYPE,
+ IB_DEVICE_PORT_ACTIVE_EVENT, HCA_CAP_PORT_ACTIVE_EVENT,
+ };
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query device properties");
+ ret = -EINVAL;
+ goto query_device1;
+ }
+
+ memset(props, 0, sizeof(struct ib_device_attr));
+ props->page_size_cap = shca->hca_cap_mr_pgsize;
+ props->fw_ver = rblock->hw_ver;
+ props->max_mr_size = rblock->max_mr_size;
+ props->vendor_id = rblock->vendor_id >> 8;
+ props->vendor_part_id = rblock->vendor_part_id >> 16;
+ props->hw_ver = rblock->hw_ver;
+ props->max_qp = limit_uint(rblock->max_qp);
+ props->max_qp_wr = limit_uint(rblock->max_wqes_wq);
+ props->max_sge = limit_uint(rblock->max_sge);
+ props->max_sge_rd = limit_uint(rblock->max_sge_rd);
+ props->max_cq = limit_uint(rblock->max_cq);
+ props->max_cqe = limit_uint(rblock->max_cqe);
+ props->max_mr = limit_uint(rblock->max_mr);
+ props->max_mw = limit_uint(rblock->max_mw);
+ props->max_pd = limit_uint(rblock->max_pd);
+ props->max_ah = limit_uint(rblock->max_ah);
+ props->max_ee = limit_uint(rblock->max_rd_ee_context);
+ props->max_rdd = limit_uint(rblock->max_rd_domain);
+ props->max_fmr = limit_uint(rblock->max_mr);
+ props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp);
+ props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context);
+ props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+ props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
+ props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
+
+ if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ props->max_srq = limit_uint(props->max_qp);
+ props->max_srq_wr = limit_uint(props->max_qp_wr);
+ props->max_srq_sge = 3;
+ }
+
+ props->max_pkeys = 16;
+ /* Some FW versions say 0 here; insert sensible value in that case */
+ props->local_ca_ack_delay = rblock->local_ca_ack_delay ?
+ min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
+ props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp);
+ props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp);
+ props->max_mcast_grp = limit_uint(rblock->max_mcast_grp);
+ props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
+ props->max_total_mcast_qp_attach
+ = limit_uint(rblock->max_total_mcast_qp_attach);
+
+ /* translate device capabilities */
+ props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
+ for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
+ if (rblock->hca_cap_indicators & cap_mapping[i + 1])
+ props->device_cap_flags |= cap_mapping[i];
+
+query_device1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+{
+ switch (fw_mtu) {
+ case 0x1:
+ return IB_MTU_256;
+ case 0x2:
+ return IB_MTU_512;
+ case 0x3:
+ return IB_MTU_1024;
+ case 0x4:
+ return IB_MTU_2048;
+ case 0x5:
+ return IB_MTU_4096;
+ default:
+ ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
+ fw_mtu);
+ return 0;
+ }
+}
+
+static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+{
+ switch (vl_cap) {
+ case 0x1:
+ return 1;
+ case 0x2:
+ return 2;
+ case 0x3:
+ return 4;
+ case 0x4:
+ return 8;
+ case 0x5:
+ return 15;
+ default:
+ ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
+ vl_cap);
+ return 0;
+ }
+}
+
+int ehca_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_port *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_port1;
+ }
+
+ memset(props, 0, sizeof(struct ib_port_attr));
+
+ props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
+ props->port_cap_flags = rblock->capability_mask;
+ props->gid_tbl_len = rblock->gid_tbl_len;
+ if (rblock->max_msg_sz)
+ props->max_msg_sz = rblock->max_msg_sz;
+ else
+ props->max_msg_sz = 0x1 << 31;
+ props->bad_pkey_cntr = rblock->bad_pkey_cntr;
+ props->qkey_viol_cntr = rblock->qkey_viol_cntr;
+ props->pkey_tbl_len = rblock->pkey_tbl_len;
+ props->lid = rblock->lid;
+ props->sm_lid = rblock->sm_lid;
+ props->lmc = rblock->lmc;
+ props->sm_sl = rblock->sm_sl;
+ props->subnet_timeout = rblock->subnet_timeout;
+ props->init_type_reply = rblock->init_type_reply;
+ props->max_vl_num = map_number_of_vls(shca, rblock->vl_cap);
+
+ if (rblock->state && rblock->phys_width) {
+ props->phys_state = rblock->phys_pstate;
+ props->state = rblock->phys_state;
+ props->active_width = rblock->phys_width;
+ props->active_speed = rblock->phys_speed;
+ } else {
+ /* old firmware releases don't report physical
+ * port info, so use default values
+ */
+ props->phys_state = 5;
+ props->state = rblock->state;
+ props->active_width = IB_WIDTH_12X;
+ props->active_speed = IB_SPEED_SDR;
+ }
+
+query_port1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_sma_attr(struct ehca_shca *shca,
+ u8 port, struct ehca_sma_attr *attr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct hipz_query_port *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_sma_attr1;
+ }
+
+ memset(attr, 0, sizeof(struct ehca_sma_attr));
+
+ attr->lid = rblock->lid;
+ attr->lmc = rblock->lmc;
+ attr->sm_sl = rblock->sm_sl;
+ attr->sm_lid = rblock->sm_lid;
+
+ attr->pkey_tbl_len = rblock->pkey_tbl_len;
+ memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
+
+query_sma_attr1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca;
+ struct hipz_query_port *rblock;
+
+ shca = container_of(ibdev, struct ehca_shca, ib_device);
+ if (index > 16) {
+ ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+ return -EINVAL;
+ }
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_pkey1;
+ }
+
+ memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
+
+query_pkey1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_port *rblock;
+
+ if (index < 0 || index > 255) {
+ ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+ return -EINVAL;
+ }
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_gid1;
+ }
+
+ memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
+ memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
+
+query_gid1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+static const u32 allowed_port_caps = (
+ IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
+ IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP);
+
+int ehca_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ int ret = 0;
+ struct ehca_shca *shca;
+ struct hipz_query_port *rblock;
+ u32 cap;
+ u64 hret;
+
+ shca = container_of(ibdev, struct ehca_shca, ib_device);
+ if ((props->set_port_cap_mask | props->clr_port_cap_mask)
+ & ~allowed_port_caps) {
+ ehca_err(&shca->ib_device, "Non-changeable bits set in masks "
+ "set=%x clr=%x allowed=%x", props->set_port_cap_mask,
+ props->clr_port_cap_mask, allowed_port_caps);
+ return -EINVAL;
+ }
+
+ if (mutex_lock_interruptible(&shca->modify_mutex))
+ return -ERESTARTSYS;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ ret = -ENOMEM;
+ goto modify_port1;
+ }
+
+ hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (hret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto modify_port2;
+ }
+
+ cap = (rblock->capability_mask | props->set_port_cap_mask)
+ & ~props->clr_port_cap_mask;
+
+ hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
+ cap, props->init_type, port_modify_mask);
+ if (hret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Modify port failed h_ret=%lli",
+ hret);
+ ret = -EINVAL;
+ }
+
+modify_port2:
+ ehca_free_fw_ctrlblock(rblock);
+
+modify_port1:
+ mutex_unlock(&shca->modify_mutex);
+
+ return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.c b/drivers/staging/rdma/ehca/ehca_irq.c
new file mode 100644
index 000000000000..8615d7cf7e01
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_irq.c
@@ -0,0 +1,870 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Functions for EQs, NEQs and interrupts
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define EQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1)
+#define EQE_CQ_QP_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_EE_IDENTIFIER EHCA_BMASK_IBM( 2, 7)
+#define EQE_CQ_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_TOKEN EHCA_BMASK_IBM(32, 63)
+#define EQE_CQ_TOKEN EHCA_BMASK_IBM(32, 63)
+
+#define NEQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1)
+#define NEQE_EVENT_CODE EHCA_BMASK_IBM( 2, 7)
+#define NEQE_PORT_NUMBER EHCA_BMASK_IBM( 8, 15)
+#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
+#define NEQE_DISRUPTIVE EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT EHCA_BMASK_IBM(16, 23)
+
+#define ERROR_DATA_LENGTH EHCA_BMASK_IBM(52, 63)
+#define ERROR_DATA_TYPE EHCA_BMASK_IBM( 0, 7)
+
+static void queue_comp_task(struct ehca_cq *__cq);
+
+static struct ehca_comp_pool *pool;
+
+static inline void comp_event_callback(struct ehca_cq *cq)
+{
+ if (!cq->ib_cq.comp_handler)
+ return;
+
+ spin_lock(&cq->cb_lock);
+ cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
+ spin_unlock(&cq->cb_lock);
+
+ return;
+}
+
+static void print_error_data(struct ehca_shca *shca, void *data,
+ u64 *rblock, int length)
+{
+ u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+ u64 resource = rblock[1];
+
+ switch (type) {
+ case 0x1: /* Queue Pair */
+ {
+ struct ehca_qp *qp = (struct ehca_qp *)data;
+
+ /* only print error data if AER is set */
+ if (rblock[6] == 0)
+ return;
+
+ ehca_err(&shca->ib_device,
+ "QP 0x%x (resource=%llx) has errors.",
+ qp->ib_qp.qp_num, resource);
+ break;
+ }
+ case 0x4: /* Completion Queue */
+ {
+ struct ehca_cq *cq = (struct ehca_cq *)data;
+
+ ehca_err(&shca->ib_device,
+ "CQ 0x%x (resource=%llx) has errors.",
+ cq->cq_number, resource);
+ break;
+ }
+ default:
+ ehca_err(&shca->ib_device,
+ "Unknown error type: %llx on %s.",
+ type, shca->ib_device.name);
+ break;
+ }
+
+ ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
+ ehca_err(&shca->ib_device, "EHCA ----- error data begin "
+ "---------------------------------------------------");
+ ehca_dmp(rblock, length, "resource=%llx", resource);
+ ehca_err(&shca->ib_device, "EHCA ----- error data end "
+ "----------------------------------------------------");
+
+ return;
+}
+
+int ehca_error_data(struct ehca_shca *shca, void *data,
+ u64 resource)
+{
+
+ unsigned long ret;
+ u64 *rblock;
+ unsigned long block_count;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
+ ret = -ENOMEM;
+ goto error_data1;
+ }
+
+ /* rblock must be 4K aligned and should be 4K large */
+ ret = hipz_h_error_data(shca->ipz_hca_handle,
+ resource,
+ rblock,
+ &block_count);
+
+ if (ret == H_R_STATE)
+ ehca_err(&shca->ib_device,
+ "No error data is available: %llx.", resource);
+ else if (ret == H_SUCCESS) {
+ int length;
+
+ length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
+
+ if (length > EHCA_PAGESIZE)
+ length = EHCA_PAGESIZE;
+
+ print_error_data(shca, data, rblock, length);
+ } else
+ ehca_err(&shca->ib_device,
+ "Error data could not be fetched: %llx", resource);
+
+ ehca_free_fw_ctrlblock(rblock);
+
+error_data1:
+ return ret;
+
+}
+
+static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
+ enum ib_event_type event_type)
+{
+ struct ib_event event;
+
+ /* PATH_MIG without the QP ever having been armed is false alarm */
+ if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
+ return;
+
+ event.device = &shca->ib_device;
+ event.event = event_type;
+
+ if (qp->ext_type == EQPT_SRQ) {
+ if (!qp->ib_srq.event_handler)
+ return;
+
+ event.element.srq = &qp->ib_srq;
+ qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
+ } else {
+ if (!qp->ib_qp.event_handler)
+ return;
+
+ event.element.qp = &qp->ib_qp;
+ qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+ }
+}
+
+static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
+ enum ib_event_type event_type, int fatal)
+{
+ struct ehca_qp *qp;
+ u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
+
+ read_lock(&ehca_qp_idr_lock);
+ qp = idr_find(&ehca_qp_idr, token);
+ if (qp)
+ atomic_inc(&qp->nr_events);
+ read_unlock(&ehca_qp_idr_lock);
+
+ if (!qp)
+ return;
+
+ if (fatal)
+ ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
+
+ dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
+ IB_EVENT_SRQ_ERR : event_type);
+
+ /*
+ * eHCA only processes one WQE at a time for SRQ base QPs,
+ * so the last WQE has been processed as soon as the QP enters
+ * error state.
+ */
+ if (fatal && qp->ext_type == EQPT_SRQBASE)
+ dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
+
+ if (atomic_dec_and_test(&qp->nr_events))
+ wake_up(&qp->wait_completion);
+ return;
+}
+
+static void cq_event_callback(struct ehca_shca *shca,
+ u64 eqe)
+{
+ struct ehca_cq *cq;
+ u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
+
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, token);
+ if (cq)
+ atomic_inc(&cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+
+ if (!cq)
+ return;
+
+ ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
+
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+
+ return;
+}
+
+static void parse_identifier(struct ehca_shca *shca, u64 eqe)
+{
+ u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
+
+ switch (identifier) {
+ case 0x02: /* path migrated */
+ qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
+ break;
+ case 0x03: /* communication established */
+ qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
+ break;
+ case 0x04: /* send queue drained */
+ qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
+ break;
+ case 0x05: /* QP error */
+ case 0x06: /* QP error */
+ qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
+ break;
+ case 0x07: /* CQ error */
+ case 0x08: /* CQ error */
+ cq_event_callback(shca, eqe);
+ break;
+ case 0x09: /* MRMWPTE error */
+ ehca_err(&shca->ib_device, "MRMWPTE error.");
+ break;
+ case 0x0A: /* port event */
+ ehca_err(&shca->ib_device, "Port event.");
+ break;
+ case 0x0B: /* MR access error */
+ ehca_err(&shca->ib_device, "MR access error.");
+ break;
+ case 0x0C: /* EQ error */
+ ehca_err(&shca->ib_device, "EQ error.");
+ break;
+ case 0x0D: /* P/Q_Key mismatch */
+ ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
+ break;
+ case 0x10: /* sampling complete */
+ ehca_err(&shca->ib_device, "Sampling complete.");
+ break;
+ case 0x11: /* unaffiliated access error */
+ ehca_err(&shca->ib_device, "Unaffiliated access error.");
+ break;
+ case 0x12: /* path migrating */
+ ehca_err(&shca->ib_device, "Path migrating.");
+ break;
+ case 0x13: /* interface trace stopped */
+ ehca_err(&shca->ib_device, "Interface trace stopped.");
+ break;
+ case 0x14: /* first error capture info available */
+ ehca_info(&shca->ib_device, "First error capture available");
+ break;
+ case 0x15: /* SRQ limit reached */
+ qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
+ identifier, shca->ib_device.name);
+ break;
+ }
+
+ return;
+}
+
+static void dispatch_port_event(struct ehca_shca *shca, int port_num,
+ enum ib_event_type type, const char *msg)
+{
+ struct ib_event event;
+
+ ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
+ event.device = &shca->ib_device;
+ event.event = type;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+}
+
+static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
+{
+ struct ehca_sma_attr new_attr;
+ struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
+
+ ehca_query_sma_attr(shca, port_num, &new_attr);
+
+ if (new_attr.sm_sl != old_attr->sm_sl ||
+ new_attr.sm_lid != old_attr->sm_lid)
+ dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
+ "SM changed");
+
+ if (new_attr.lid != old_attr->lid ||
+ new_attr.lmc != old_attr->lmc)
+ dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
+ "LID changed");
+
+ if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
+ memcmp(new_attr.pkeys, old_attr->pkeys,
+ sizeof(u16) * new_attr.pkey_tbl_len))
+ dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
+ "P_Key changed");
+
+ *old_attr = new_attr;
+}
+
+/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
+static int replay_modify_qp(struct ehca_sport *sport)
+{
+ int aqp1_destroyed;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+
+ aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
+
+ if (sport->ibqp_sqp[IB_QPT_SMI])
+ ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+ if (!aqp1_destroyed)
+ ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+ return aqp1_destroyed;
+}
+
+static void parse_ec(struct ehca_shca *shca, u64 eqe)
+{
+ u8 ec = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+ u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+ u8 spec_event;
+ struct ehca_sport *sport = &shca->sport[port - 1];
+
+ switch (ec) {
+ case 0x30: /* port availability change */
+ if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
+ /* only replay modify_qp calls in autodetect mode;
+ * if AQP1 was destroyed, the port is already down
+ * again and we can drop the event.
+ */
+ if (ehca_nr_ports < 0)
+ if (replay_modify_qp(sport))
+ break;
+
+ sport->port_state = IB_PORT_ACTIVE;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+ "is active");
+ ehca_query_sma_attr(shca, port, &sport->saved_attr);
+ } else {
+ sport->port_state = IB_PORT_DOWN;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+ "is inactive");
+ }
+ break;
+ case 0x31:
+ /* port configuration change
+ * disruptive change is caused by
+ * LID, PKEY or SM change
+ */
+ if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
+ ehca_warn(&shca->ib_device, "disruptive port "
+ "%d configuration change", port);
+
+ sport->port_state = IB_PORT_DOWN;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+ "is inactive");
+
+ sport->port_state = IB_PORT_ACTIVE;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+ "is active");
+ ehca_query_sma_attr(shca, port,
+ &sport->saved_attr);
+ } else
+ notify_port_conf_change(shca, port);
+ break;
+ case 0x32: /* adapter malfunction */
+ ehca_err(&shca->ib_device, "Adapter malfunction.");
+ break;
+ case 0x33: /* trace stopped */
+ ehca_err(&shca->ib_device, "Traced stopped.");
+ break;
+ case 0x34: /* util async event */
+ spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+ if (spec_event == 0x80) /* client reregister required */
+ dispatch_port_event(shca, port,
+ IB_EVENT_CLIENT_REREGISTER,
+ "client reregister req.");
+ else
+ ehca_warn(&shca->ib_device, "Unknown util async "
+ "event %x on port %x", spec_event, port);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
+ ec, shca->ib_device.name);
+ break;
+ }
+
+ return;
+}
+
+static inline void reset_eq_pending(struct ehca_cq *cq)
+{
+ u64 CQx_EP;
+ struct h_galpa gal = cq->galpas.kernel;
+
+ hipz_galpa_store_cq(gal, cqx_ep, 0x0);
+ CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
+
+ return;
+}
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+ tasklet_hi_schedule(&shca->neq.interrupt_task);
+
+ return IRQ_HANDLED;
+}
+
+void ehca_tasklet_neq(unsigned long data)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)data;
+ struct ehca_eqe *eqe;
+ u64 ret;
+
+ eqe = ehca_poll_eq(shca, &shca->neq);
+
+ while (eqe) {
+ if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
+ parse_ec(shca, eqe->entry);
+
+ eqe = ehca_poll_eq(shca, &shca->neq);
+ }
+
+ ret = hipz_h_reset_event(shca->ipz_hca_handle,
+ shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
+
+ if (ret != H_SUCCESS)
+ ehca_err(&shca->ib_device, "Can't clear notification events.");
+
+ return;
+}
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+ tasklet_hi_schedule(&shca->eq.interrupt_task);
+
+ return IRQ_HANDLED;
+}
+
+
+static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
+{
+ u64 eqe_value;
+ u32 token;
+ struct ehca_cq *cq;
+
+ eqe_value = eqe->entry;
+ ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
+ if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+ ehca_dbg(&shca->ib_device, "Got completion event");
+ token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, token);
+ if (cq)
+ atomic_inc(&cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+ if (cq == NULL) {
+ ehca_err(&shca->ib_device,
+ "Invalid eqe for non-existing cq token=%x",
+ token);
+ return;
+ }
+ reset_eq_pending(cq);
+ if (ehca_scaling_code)
+ queue_comp_task(cq);
+ else {
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+ }
+ } else {
+ ehca_dbg(&shca->ib_device, "Got non completion event");
+ parse_identifier(shca, eqe_value);
+ }
+}
+
+void ehca_process_eq(struct ehca_shca *shca, int is_irq)
+{
+ struct ehca_eq *eq = &shca->eq;
+ struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
+ u64 eqe_value, ret;
+ int eqe_cnt, i;
+ int eq_empty = 0;
+
+ spin_lock(&eq->irq_spinlock);
+ if (is_irq) {
+ const int max_query_cnt = 100;
+ int query_cnt = 0;
+ int int_state = 1;
+ do {
+ int_state = hipz_h_query_int_state(
+ shca->ipz_hca_handle, eq->ist);
+ query_cnt++;
+ iosync();
+ } while (int_state && query_cnt < max_query_cnt);
+ if (unlikely((query_cnt == max_query_cnt)))
+ ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
+ int_state, query_cnt);
+ }
+
+ /* read out all eqes */
+ eqe_cnt = 0;
+ do {
+ u32 token;
+ eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
+ if (!eqe_cache[eqe_cnt].eqe)
+ break;
+ eqe_value = eqe_cache[eqe_cnt].eqe->entry;
+ if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+ token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+ read_lock(&ehca_cq_idr_lock);
+ eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
+ if (eqe_cache[eqe_cnt].cq)
+ atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+ if (!eqe_cache[eqe_cnt].cq) {
+ ehca_err(&shca->ib_device,
+ "Invalid eqe for non-existing cq "
+ "token=%x", token);
+ continue;
+ }
+ } else
+ eqe_cache[eqe_cnt].cq = NULL;
+ eqe_cnt++;
+ } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
+ if (!eqe_cnt) {
+ if (is_irq)
+ ehca_dbg(&shca->ib_device,
+ "No eqe found for irq event");
+ goto unlock_irq_spinlock;
+ } else if (!is_irq) {
+ ret = hipz_h_eoi(eq->ist);
+ if (ret != H_SUCCESS)
+ ehca_err(&shca->ib_device,
+ "bad return code EOI -rc = %lld\n", ret);
+ ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+ }
+ if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
+ ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
+ /* enable irq for new packets */
+ for (i = 0; i < eqe_cnt; i++) {
+ if (eq->eqe_cache[i].cq)
+ reset_eq_pending(eq->eqe_cache[i].cq);
+ }
+ /* check eq */
+ spin_lock(&eq->spinlock);
+ eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
+ spin_unlock(&eq->spinlock);
+ /* call completion handler for cached eqes */
+ for (i = 0; i < eqe_cnt; i++)
+ if (eq->eqe_cache[i].cq) {
+ if (ehca_scaling_code)
+ queue_comp_task(eq->eqe_cache[i].cq);
+ else {
+ struct ehca_cq *cq = eq->eqe_cache[i].cq;
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+ }
+ } else {
+ ehca_dbg(&shca->ib_device, "Got non completion event");
+ parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
+ }
+ /* poll eq if not empty */
+ if (eq_empty)
+ goto unlock_irq_spinlock;
+ do {
+ struct ehca_eqe *eqe;
+ eqe = ehca_poll_eq(shca, &shca->eq);
+ if (!eqe)
+ break;
+ process_eqe(shca, eqe);
+ } while (1);
+
+unlock_irq_spinlock:
+ spin_unlock(&eq->irq_spinlock);
+}
+
+void ehca_tasklet_eq(unsigned long data)
+{
+ ehca_process_eq((struct ehca_shca*)data, 1);
+}
+
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
+{
+ int cpu;
+ unsigned long flags;
+
+ WARN_ON_ONCE(!in_interrupt());
+ if (ehca_debug_level >= 3)
+ ehca_dmp(cpu_online_mask, cpumask_size(), "");
+
+ spin_lock_irqsave(&pool->last_cpu_lock, flags);
+ do {
+ cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_online_mask);
+ pool->last_cpu = cpu;
+ } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
+ spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
+
+ return cpu;
+}
+
+static void __queue_comp_task(struct ehca_cq *__cq,
+ struct ehca_cpu_comp_task *cct,
+ struct task_struct *thread)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cct->task_lock, flags);
+ spin_lock(&__cq->task_lock);
+
+ if (__cq->nr_callbacks == 0) {
+ __cq->nr_callbacks++;
+ list_add_tail(&__cq->entry, &cct->cq_list);
+ cct->cq_jobs++;
+ wake_up_process(thread);
+ } else
+ __cq->nr_callbacks++;
+
+ spin_unlock(&__cq->task_lock);
+ spin_unlock_irqrestore(&cct->task_lock, flags);
+}
+
+static void queue_comp_task(struct ehca_cq *__cq)
+{
+ int cpu_id;
+ struct ehca_cpu_comp_task *cct;
+ struct task_struct *thread;
+ int cq_jobs;
+ unsigned long flags;
+
+ cpu_id = find_next_online_cpu(pool);
+ BUG_ON(!cpu_online(cpu_id));
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
+
+ spin_lock_irqsave(&cct->task_lock, flags);
+ cq_jobs = cct->cq_jobs;
+ spin_unlock_irqrestore(&cct->task_lock, flags);
+ if (cq_jobs > 0) {
+ cpu_id = find_next_online_cpu(pool);
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
+ }
+ __queue_comp_task(__cq, cct, thread);
+}
+
+static void run_comp_task(struct ehca_cpu_comp_task *cct)
+{
+ struct ehca_cq *cq;
+
+ while (!list_empty(&cct->cq_list)) {
+ cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
+ spin_unlock_irq(&cct->task_lock);
+
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+
+ spin_lock_irq(&cct->task_lock);
+ spin_lock(&cq->task_lock);
+ cq->nr_callbacks--;
+ if (!cq->nr_callbacks) {
+ list_del_init(cct->cq_list.next);
+ cct->cq_jobs--;
+ }
+ spin_unlock(&cq->task_lock);
+ }
+}
+
+static void comp_task_park(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ struct ehca_cpu_comp_task *target;
+ struct task_struct *thread;
+ struct ehca_cq *cq, *tmp;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ list_splice_init(&cct->cq_list, &list);
+ spin_unlock_irq(&cct->task_lock);
+
+ cpu = find_next_online_cpu(pool);
+ target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+ spin_lock_irq(&target->task_lock);
+ list_for_each_entry_safe(cq, tmp, &list, entry) {
+ list_del(&cq->entry);
+ __queue_comp_task(cq, target, thread);
+ }
+ spin_unlock_irq(&target->task_lock);
+}
+
+static void comp_task_stop(unsigned int cpu, bool online)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ WARN_ON(!list_empty(&cct->cq_list));
+ spin_unlock_irq(&cct->task_lock);
+}
+
+static int comp_task_should_run(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+ return cct->cq_jobs;
+}
+
+static void comp_task(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+ int cql_empty;
+
+ spin_lock_irq(&cct->task_lock);
+ cql_empty = list_empty(&cct->cq_list);
+ if (!cql_empty) {
+ __set_current_state(TASK_RUNNING);
+ run_comp_task(cct);
+ }
+ spin_unlock_irq(&cct->task_lock);
+}
+
+static struct smp_hotplug_thread comp_pool_threads = {
+ .thread_should_run = comp_task_should_run,
+ .thread_fn = comp_task,
+ .thread_comm = "ehca_comp/%u",
+ .cleanup = comp_task_stop,
+ .park = comp_task_park,
+};
+
+int ehca_create_comp_pool(void)
+{
+ int cpu, ret = -ENOMEM;
+
+ if (!ehca_scaling_code)
+ return 0;
+
+ pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
+ if (pool == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&pool->last_cpu_lock);
+ pool->last_cpu = cpumask_any(cpu_online_mask);
+
+ pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
+ if (!pool->cpu_comp_tasks)
+ goto out_pool;
+
+ pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+ if (!pool->cpu_comp_threads)
+ goto out_tasks;
+
+ for_each_present_cpu(cpu) {
+ struct ehca_cpu_comp_task *cct;
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ spin_lock_init(&cct->task_lock);
+ INIT_LIST_HEAD(&cct->cq_list);
+ }
+
+ comp_pool_threads.store = pool->cpu_comp_threads;
+ ret = smpboot_register_percpu_thread(&comp_pool_threads);
+ if (ret)
+ goto out_threads;
+
+ pr_info("eHCA scaling code enabled\n");
+ return ret;
+
+out_threads:
+ free_percpu(pool->cpu_comp_threads);
+out_tasks:
+ free_percpu(pool->cpu_comp_tasks);
+out_pool:
+ kfree(pool);
+ return ret;
+}
+
+void ehca_destroy_comp_pool(void)
+{
+ if (!ehca_scaling_code)
+ return;
+
+ smpboot_unregister_percpu_thread(&comp_pool_threads);
+
+ free_percpu(pool->cpu_comp_threads);
+ free_percpu(pool->cpu_comp_tasks);
+ kfree(pool);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.h b/drivers/staging/rdma/ehca/ehca_irq.h
new file mode 100644
index 000000000000..5370199f08c7
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_irq.h
@@ -0,0 +1,77 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Function definitions and structs for EQs, NEQs and interrupts
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IRQ_H
+#define __EHCA_IRQ_H
+
+
+struct ehca_shca;
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+
+int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
+void ehca_tasklet_neq(unsigned long data);
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
+void ehca_tasklet_eq(unsigned long data);
+void ehca_process_eq(struct ehca_shca *shca, int is_irq);
+
+struct ehca_cpu_comp_task {
+ struct list_head cq_list;
+ spinlock_t task_lock;
+ int cq_jobs;
+ int active;
+};
+
+struct ehca_comp_pool {
+ struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+ struct task_struct * __percpu *cpu_comp_threads;
+ int last_cpu;
+ spinlock_t last_cpu_lock;
+};
+
+int ehca_create_comp_pool(void);
+void ehca_destroy_comp_pool(void);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h
new file mode 100644
index 000000000000..80e6a3d5df3e
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_iverbs.h
@@ -0,0 +1,218 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Function definitions for internal functions
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Dietmar Decker <ddecker@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IVERBS_H__
+#define __EHCA_IVERBS_H__
+
+#include "ehca_classes.h"
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+ struct ib_udata *uhw);
+
+int ehca_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props);
+
+enum rdma_protocol_type
+ehca_query_protocol(struct ib_device *device, u8 port_num);
+
+int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
+ struct ehca_sma_attr *attr);
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid);
+
+int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
+ struct ib_port_modify *props);
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ehca_dealloc_pd(struct ib_pd *pd);
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_destroy_ah(struct ib_ah *ah);
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags, u64 *iova_start);
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int mr_access_flags,
+ struct ib_udata *udata);
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+int ehca_dereg_mr(struct ib_mr *mr);
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+
+int ehca_dealloc_mw(struct ib_mw *mw);
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list, int list_len, u64 iova);
+
+int ehca_unmap_fmr(struct list_head *fmr_list);
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr);
+
+enum ehca_eq_type {
+ EHCA_EQ = 0, /* Event Queue */
+ EHCA_NEQ /* Notification Event Queue */
+};
+
+int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
+ enum ehca_eq_type type, const u32 length);
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ehca_destroy_cq(struct ib_cq *cq);
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ehca_destroy_qp(struct ib_qp *qp);
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata);
+
+int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+
+int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+
+int ehca_destroy_srq(struct ib_srq *srq);
+
+u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
+ struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+ struct ib_udata *udata);
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context);
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
+
+void ehca_poll_eqs(unsigned long data);
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+ enum ib_rate path_rate, u32 *ipd);
+
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
+
+#ifdef CONFIG_PPC_64K_PAGES
+void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+void ehca_free_fw_ctrlblock(void *ptr);
+#else
+#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
+#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
+#endif
+
+void ehca_recover_sqp(struct ib_qp *sqp);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c
new file mode 100644
index 000000000000..8246418cd4e0
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_main.c
@@ -0,0 +1,1123 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * module start stop, hca detection
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <linux/slab.h>
+#endif
+
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <rdma/ib_mad.h>
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+#define HCAD_VERSION "0029"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
+MODULE_VERSION(HCAD_VERSION);
+
+static bool ehca_open_aqp1 = 0;
+static int ehca_hw_level = 0;
+static bool ehca_poll_all_eqs = 1;
+
+int ehca_debug_level = 0;
+int ehca_nr_ports = -1;
+bool ehca_use_hp_mr = 0;
+int ehca_port_act_time = 30;
+int ehca_static_rate = -1;
+bool ehca_scaling_code = 0;
+int ehca_lock_hcalls = -1;
+int ehca_max_cq = -1;
+int ehca_max_qp = -1;
+
+module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO);
+module_param_named(debug_level, ehca_debug_level, int, S_IRUGO);
+module_param_named(hw_level, ehca_hw_level, int, S_IRUGO);
+module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO);
+module_param_named(use_hp_mr, ehca_use_hp_mr, bool, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO);
+module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO);
+module_param_named(static_rate, ehca_static_rate, int, S_IRUGO);
+module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO);
+module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO);
+module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO);
+module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO);
+
+MODULE_PARM_DESC(open_aqp1,
+ "Open AQP1 on startup (default: no)");
+MODULE_PARM_DESC(debug_level,
+ "Amount of debug output (0: none (default), 1: traces, "
+ "2: some dumps, 3: lots)");
+MODULE_PARM_DESC(hw_level,
+ "Hardware level (0: autosensing (default), "
+ "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+MODULE_PARM_DESC(nr_ports,
+ "number of connected ports (-1: autodetect (default), "
+ "1: port one only, 2: two ports)");
+MODULE_PARM_DESC(use_hp_mr,
+ "Use high performance MRs (default: no)");
+MODULE_PARM_DESC(port_act_time,
+ "Time to wait for port activation (default: 30 sec)");
+MODULE_PARM_DESC(poll_all_eqs,
+ "Poll all event queues periodically (default: yes)");
+MODULE_PARM_DESC(static_rate,
+ "Set permanent static rate (default: no static rate)");
+MODULE_PARM_DESC(scaling_code,
+ "Enable scaling code (default: no)");
+MODULE_PARM_DESC(lock_hcalls,
+ "Serialize all hCalls made by the driver "
+ "(default: autodetect)");
+MODULE_PARM_DESC(number_of_cqs,
+ "Max number of CQs which can be allocated "
+ "(default: autodetect)");
+MODULE_PARM_DESC(number_of_qps,
+ "Max number of QPs which can be allocated "
+ "(default: autodetect)");
+
+DEFINE_RWLOCK(ehca_qp_idr_lock);
+DEFINE_RWLOCK(ehca_cq_idr_lock);
+DEFINE_IDR(ehca_qp_idr);
+DEFINE_IDR(ehca_cq_idr);
+
+static LIST_HEAD(shca_list); /* list of all registered ehcas */
+DEFINE_SPINLOCK(shca_list_lock);
+
+static struct timer_list poll_eqs_timer;
+
+#ifdef CONFIG_PPC_64K_PAGES
+static struct kmem_cache *ctblk_cache;
+
+void *ehca_alloc_fw_ctrlblock(gfp_t flags)
+{
+ void *ret = kmem_cache_zalloc(ctblk_cache, flags);
+ if (!ret)
+ ehca_gen_err("Out of memory for ctblk");
+ return ret;
+}
+
+void ehca_free_fw_ctrlblock(void *ptr)
+{
+ if (ptr)
+ kmem_cache_free(ctblk_cache, ptr);
+
+}
+#endif
+
+int ehca2ib_return_code(u64 ehca_rc)
+{
+ switch (ehca_rc) {
+ case H_SUCCESS:
+ return 0;
+ case H_RESOURCE: /* Resource in use */
+ case H_BUSY:
+ return -EBUSY;
+ case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
+ case H_CONSTRAINED: /* resource constraint */
+ case H_NO_MEM:
+ return -ENOMEM;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int ehca_create_slab_caches(void)
+{
+ int ret;
+
+ ret = ehca_init_pd_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create PD SLAB cache.");
+ return ret;
+ }
+
+ ret = ehca_init_cq_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create CQ SLAB cache.");
+ goto create_slab_caches2;
+ }
+
+ ret = ehca_init_qp_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create QP SLAB cache.");
+ goto create_slab_caches3;
+ }
+
+ ret = ehca_init_av_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create AV SLAB cache.");
+ goto create_slab_caches4;
+ }
+
+ ret = ehca_init_mrmw_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create MR&MW SLAB cache.");
+ goto create_slab_caches5;
+ }
+
+ ret = ehca_init_small_qp_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create small queue SLAB cache.");
+ goto create_slab_caches6;
+ }
+
+#ifdef CONFIG_PPC_64K_PAGES
+ ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
+ EHCA_PAGESIZE, H_CB_ALIGNMENT,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!ctblk_cache) {
+ ehca_gen_err("Cannot create ctblk SLAB cache.");
+ ehca_cleanup_small_qp_cache();
+ ret = -ENOMEM;
+ goto create_slab_caches6;
+ }
+#endif
+ return 0;
+
+create_slab_caches6:
+ ehca_cleanup_mrmw_cache();
+
+create_slab_caches5:
+ ehca_cleanup_av_cache();
+
+create_slab_caches4:
+ ehca_cleanup_qp_cache();
+
+create_slab_caches3:
+ ehca_cleanup_cq_cache();
+
+create_slab_caches2:
+ ehca_cleanup_pd_cache();
+
+ return ret;
+}
+
+static void ehca_destroy_slab_caches(void)
+{
+ ehca_cleanup_small_qp_cache();
+ ehca_cleanup_mrmw_cache();
+ ehca_cleanup_av_cache();
+ ehca_cleanup_qp_cache();
+ ehca_cleanup_cq_cache();
+ ehca_cleanup_pd_cache();
+#ifdef CONFIG_PPC_64K_PAGES
+ if (ctblk_cache)
+ kmem_cache_destroy(ctblk_cache);
+#endif
+}
+
+#define EHCA_HCAAVER EHCA_BMASK_IBM(32, 39)
+#define EHCA_REVID EHCA_BMASK_IBM(40, 63)
+
+static struct cap_descr {
+ u64 mask;
+ char *descr;
+} hca_cap_descr[] = {
+ { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
+ { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
+ { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
+ { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
+ { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
+ { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
+ { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
+ { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
+ { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
+ { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
+ { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
+ { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
+ { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
+ { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
+ { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
+ { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
+ { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
+ { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
+};
+
+static int ehca_sense_attributes(struct ehca_shca *shca)
+{
+ int i, ret = 0;
+ u64 h_ret;
+ struct hipz_query_hca *rblock;
+ struct hipz_query_port *port;
+ const char *loc_code;
+
+ static const u32 pgsize_map[] = {
+ HCA_CAP_MR_PGSIZE_4K, 0x1000,
+ HCA_CAP_MR_PGSIZE_64K, 0x10000,
+ HCA_CAP_MR_PGSIZE_1M, 0x100000,
+ HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+ };
+
+ ehca_gen_dbg("Probing adapter %s...",
+ shca->ofdev->dev.of_node->full_name);
+ loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
+ NULL);
+ if (loc_code)
+ ehca_gen_dbg(" ... location lode=%s", loc_code);
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_gen_err("Cannot allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_gen_err("Cannot query device properties. h_ret=%lli",
+ h_ret);
+ ret = -EPERM;
+ goto sense_attributes1;
+ }
+
+ if (ehca_nr_ports == 1)
+ shca->num_ports = 1;
+ else
+ shca->num_ports = (u8)rblock->num_ports;
+
+ ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
+
+ if (ehca_hw_level == 0) {
+ u32 hcaaver;
+ u32 revid;
+
+ hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
+ revid = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
+
+ ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
+
+ if (hcaaver == 1) {
+ if (revid <= 3)
+ shca->hw_level = 0x10 | (revid + 1);
+ else
+ shca->hw_level = 0x14;
+ } else if (hcaaver == 2) {
+ if (revid == 0)
+ shca->hw_level = 0x21;
+ else if (revid == 0x10)
+ shca->hw_level = 0x22;
+ else if (revid == 0x20 || revid == 0x21)
+ shca->hw_level = 0x23;
+ }
+
+ if (!shca->hw_level) {
+ ehca_gen_warn("unknown hardware version"
+ " - assuming default level");
+ shca->hw_level = 0x22;
+ }
+ } else
+ shca->hw_level = ehca_hw_level;
+ ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
+
+ shca->hca_cap = rblock->hca_cap_indicators;
+ ehca_gen_dbg(" ... HCA capabilities:");
+ for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
+ if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
+ ehca_gen_dbg(" %s", hca_cap_descr[i].descr);
+
+ /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
+ * a firmware property, so it's valid across all adapters
+ */
+ if (ehca_lock_hcalls == -1)
+ ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
+ shca->hca_cap);
+
+ /* translate supported MR page sizes; always support 4K */
+ shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+ for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+ if (rblock->memory_page_size_supported & pgsize_map[i])
+ shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+
+ /* Set maximum number of CQs and QPs to calculate EQ size */
+ if (shca->max_num_qps == -1)
+ shca->max_num_qps = min_t(int, rblock->max_qp,
+ EHCA_MAX_NUM_QUEUES);
+ else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+ ehca_gen_warn("The requested number of QPs is out of range "
+ "(1 - %i) specified by HW. Value is set to %i",
+ rblock->max_qp, rblock->max_qp);
+ shca->max_num_qps = rblock->max_qp;
+ }
+
+ if (shca->max_num_cqs == -1)
+ shca->max_num_cqs = min_t(int, rblock->max_cq,
+ EHCA_MAX_NUM_QUEUES);
+ else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+ ehca_gen_warn("The requested number of CQs is out of range "
+ "(1 - %i) specified by HW. Value is set to %i",
+ rblock->max_cq, rblock->max_cq);
+ }
+
+ /* query max MTU from first port -- it's the same for all ports */
+ port = (struct hipz_query_port *)rblock;
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
+ if (h_ret != H_SUCCESS) {
+ ehca_gen_err("Cannot query port properties. h_ret=%lli",
+ h_ret);
+ ret = -EPERM;
+ goto sense_attributes1;
+ }
+
+ shca->max_mtu = port->max_mtu;
+
+sense_attributes1:
+ ehca_free_fw_ctrlblock(rblock);
+ return ret;
+}
+
+static int init_node_guid(struct ehca_shca *shca)
+{
+ int ret = 0;
+ struct hipz_query_hca *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query device properties");
+ ret = -EINVAL;
+ goto init_node_guid1;
+ }
+
+ memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
+
+init_node_guid1:
+ ehca_free_fw_ctrlblock(rblock);
+ return ret;
+}
+
+static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ int err;
+
+ err = ehca_query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+ return 0;
+}
+
+static int ehca_init_device(struct ehca_shca *shca)
+{
+ int ret;
+
+ ret = init_node_guid(shca);
+ if (ret)
+ return ret;
+
+ strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
+ shca->ib_device.owner = THIS_MODULE;
+
+ shca->ib_device.uverbs_abi_ver = 8;
+ shca->ib_device.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+
+ shca->ib_device.node_type = RDMA_NODE_IB_CA;
+ shca->ib_device.phys_port_cnt = shca->num_ports;
+ shca->ib_device.num_comp_vectors = 1;
+ shca->ib_device.dma_device = &shca->ofdev->dev;
+ shca->ib_device.query_device = ehca_query_device;
+ shca->ib_device.query_port = ehca_query_port;
+ shca->ib_device.query_gid = ehca_query_gid;
+ shca->ib_device.query_pkey = ehca_query_pkey;
+ /* shca->in_device.modify_device = ehca_modify_device */
+ shca->ib_device.modify_port = ehca_modify_port;
+ shca->ib_device.alloc_ucontext = ehca_alloc_ucontext;
+ shca->ib_device.dealloc_ucontext = ehca_dealloc_ucontext;
+ shca->ib_device.alloc_pd = ehca_alloc_pd;
+ shca->ib_device.dealloc_pd = ehca_dealloc_pd;
+ shca->ib_device.create_ah = ehca_create_ah;
+ /* shca->ib_device.modify_ah = ehca_modify_ah; */
+ shca->ib_device.query_ah = ehca_query_ah;
+ shca->ib_device.destroy_ah = ehca_destroy_ah;
+ shca->ib_device.create_qp = ehca_create_qp;
+ shca->ib_device.modify_qp = ehca_modify_qp;
+ shca->ib_device.query_qp = ehca_query_qp;
+ shca->ib_device.destroy_qp = ehca_destroy_qp;
+ shca->ib_device.post_send = ehca_post_send;
+ shca->ib_device.post_recv = ehca_post_recv;
+ shca->ib_device.create_cq = ehca_create_cq;
+ shca->ib_device.destroy_cq = ehca_destroy_cq;
+ shca->ib_device.resize_cq = ehca_resize_cq;
+ shca->ib_device.poll_cq = ehca_poll_cq;
+ /* shca->ib_device.peek_cq = ehca_peek_cq; */
+ shca->ib_device.req_notify_cq = ehca_req_notify_cq;
+ /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */
+ shca->ib_device.get_dma_mr = ehca_get_dma_mr;
+ shca->ib_device.reg_phys_mr = ehca_reg_phys_mr;
+ shca->ib_device.reg_user_mr = ehca_reg_user_mr;
+ shca->ib_device.query_mr = ehca_query_mr;
+ shca->ib_device.dereg_mr = ehca_dereg_mr;
+ shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr;
+ shca->ib_device.alloc_mw = ehca_alloc_mw;
+ shca->ib_device.bind_mw = ehca_bind_mw;
+ shca->ib_device.dealloc_mw = ehca_dealloc_mw;
+ shca->ib_device.alloc_fmr = ehca_alloc_fmr;
+ shca->ib_device.map_phys_fmr = ehca_map_phys_fmr;
+ shca->ib_device.unmap_fmr = ehca_unmap_fmr;
+ shca->ib_device.dealloc_fmr = ehca_dealloc_fmr;
+ shca->ib_device.attach_mcast = ehca_attach_mcast;
+ shca->ib_device.detach_mcast = ehca_detach_mcast;
+ shca->ib_device.process_mad = ehca_process_mad;
+ shca->ib_device.mmap = ehca_mmap;
+ shca->ib_device.dma_ops = &ehca_dma_mapping_ops;
+ shca->ib_device.get_port_immutable = ehca_port_immutable;
+
+ if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ shca->ib_device.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ shca->ib_device.create_srq = ehca_create_srq;
+ shca->ib_device.modify_srq = ehca_modify_srq;
+ shca->ib_device.query_srq = ehca_query_srq;
+ shca->ib_device.destroy_srq = ehca_destroy_srq;
+ shca->ib_device.post_srq_recv = ehca_post_srq_recv;
+ }
+
+ return ret;
+}
+
+static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
+{
+ struct ehca_sport *sport = &shca->sport[port - 1];
+ struct ib_cq *ibcq;
+ struct ib_qp *ibqp;
+ struct ib_qp_init_attr qp_init_attr;
+ struct ib_cq_init_attr cq_attr = {};
+ int ret;
+
+ if (sport->ibcq_aqp1) {
+ ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
+ return -EPERM;
+ }
+
+ cq_attr.cqe = 10;
+ ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+ &cq_attr);
+ if (IS_ERR(ibcq)) {
+ ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
+ return PTR_ERR(ibcq);
+ }
+ sport->ibcq_aqp1 = ibcq;
+
+ if (sport->ibqp_sqp[IB_QPT_GSI]) {
+ ehca_err(&shca->ib_device, "AQP1 QP is already created.");
+ ret = -EPERM;
+ goto create_aqp1;
+ }
+
+ memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+ qp_init_attr.send_cq = ibcq;
+ qp_init_attr.recv_cq = ibcq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = 100;
+ qp_init_attr.cap.max_recv_wr = 100;
+ qp_init_attr.cap.max_send_sge = 2;
+ qp_init_attr.cap.max_recv_sge = 1;
+ qp_init_attr.qp_type = IB_QPT_GSI;
+ qp_init_attr.port_num = port;
+ qp_init_attr.qp_context = NULL;
+ qp_init_attr.event_handler = NULL;
+ qp_init_attr.srq = NULL;
+
+ ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
+ if (IS_ERR(ibqp)) {
+ ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
+ ret = PTR_ERR(ibqp);
+ goto create_aqp1;
+ }
+ sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
+
+ return 0;
+
+create_aqp1:
+ ib_destroy_cq(sport->ibcq_aqp1);
+ return ret;
+}
+
+static int ehca_destroy_aqp1(struct ehca_sport *sport)
+{
+ int ret;
+
+ ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
+ if (ret) {
+ ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
+ return ret;
+ }
+
+ ret = ib_destroy_cq(sport->ibcq_aqp1);
+ if (ret)
+ ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
+
+ return ret;
+}
+
+static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
+}
+
+static ssize_t ehca_store_debug_level(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ int value = (*buf) - '0';
+ if (value >= 0 && value <= 9)
+ ehca_debug_level = value;
+ return 1;
+}
+
+static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
+ ehca_show_debug_level, ehca_store_debug_level);
+
+static struct attribute *ehca_drv_attrs[] = {
+ &driver_attr_debug_level.attr,
+ NULL
+};
+
+static struct attribute_group ehca_drv_attr_grp = {
+ .attrs = ehca_drv_attrs
+};
+
+static const struct attribute_group *ehca_drv_attr_groups[] = {
+ &ehca_drv_attr_grp,
+ NULL,
+};
+
+#define EHCA_RESOURCE_ATTR(name) \
+static ssize_t ehca_show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct ehca_shca *shca; \
+ struct hipz_query_hca *rblock; \
+ int data; \
+ \
+ shca = dev_get_drvdata(dev); \
+ \
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); \
+ if (!rblock) { \
+ dev_err(dev, "Can't allocate rblock memory.\n"); \
+ return 0; \
+ } \
+ \
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
+ dev_err(dev, "Can't query device properties\n"); \
+ ehca_free_fw_ctrlblock(rblock); \
+ return 0; \
+ } \
+ \
+ data = rblock->name; \
+ ehca_free_fw_ctrlblock(rblock); \
+ \
+ if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1)) \
+ return snprintf(buf, 256, "1\n"); \
+ else \
+ return snprintf(buf, 256, "%d\n", data); \
+ \
+} \
+static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
+
+EHCA_RESOURCE_ATTR(num_ports);
+EHCA_RESOURCE_ATTR(hw_ver);
+EHCA_RESOURCE_ATTR(max_eq);
+EHCA_RESOURCE_ATTR(cur_eq);
+EHCA_RESOURCE_ATTR(max_cq);
+EHCA_RESOURCE_ATTR(cur_cq);
+EHCA_RESOURCE_ATTR(max_qp);
+EHCA_RESOURCE_ATTR(cur_qp);
+EHCA_RESOURCE_ATTR(max_mr);
+EHCA_RESOURCE_ATTR(cur_mr);
+EHCA_RESOURCE_ATTR(max_mw);
+EHCA_RESOURCE_ATTR(cur_mw);
+EHCA_RESOURCE_ATTR(max_pd);
+EHCA_RESOURCE_ATTR(max_ah);
+
+static ssize_t ehca_show_adapter_handle(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ehca_shca *shca = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
+
+}
+static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+
+static struct attribute *ehca_dev_attrs[] = {
+ &dev_attr_adapter_handle.attr,
+ &dev_attr_num_ports.attr,
+ &dev_attr_hw_ver.attr,
+ &dev_attr_max_eq.attr,
+ &dev_attr_cur_eq.attr,
+ &dev_attr_max_cq.attr,
+ &dev_attr_cur_cq.attr,
+ &dev_attr_max_qp.attr,
+ &dev_attr_cur_qp.attr,
+ &dev_attr_max_mr.attr,
+ &dev_attr_cur_mr.attr,
+ &dev_attr_max_mw.attr,
+ &dev_attr_cur_mw.attr,
+ &dev_attr_max_pd.attr,
+ &dev_attr_max_ah.attr,
+ NULL
+};
+
+static struct attribute_group ehca_dev_attr_grp = {
+ .attrs = ehca_dev_attrs
+};
+
+static int ehca_probe(struct platform_device *dev)
+{
+ struct ehca_shca *shca;
+ const u64 *handle;
+ struct ib_pd *ibpd;
+ int ret, i, eq_size;
+ unsigned long flags;
+
+ handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
+ if (!handle) {
+ ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
+ dev->dev.of_node->full_name);
+ return -ENODEV;
+ }
+
+ if (!(*handle)) {
+ ehca_gen_err("Wrong eHCA handle for adapter: %s.",
+ dev->dev.of_node->full_name);
+ return -ENODEV;
+ }
+
+ shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
+ if (!shca) {
+ ehca_gen_err("Cannot allocate shca memory.");
+ return -ENOMEM;
+ }
+
+ mutex_init(&shca->modify_mutex);
+ atomic_set(&shca->num_cqs, 0);
+ atomic_set(&shca->num_qps, 0);
+ shca->max_num_qps = ehca_max_qp;
+ shca->max_num_cqs = ehca_max_cq;
+
+ for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+ spin_lock_init(&shca->sport[i].mod_sqp_lock);
+
+ shca->ofdev = dev;
+ shca->ipz_hca_handle.handle = *handle;
+ dev_set_drvdata(&dev->dev, shca);
+
+ ret = ehca_sense_attributes(shca);
+ if (ret < 0) {
+ ehca_gen_err("Cannot sense eHCA attributes.");
+ goto probe1;
+ }
+
+ ret = ehca_init_device(shca);
+ if (ret) {
+ ehca_gen_err("Cannot init ehca device struct");
+ goto probe1;
+ }
+
+ eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
+ /* create event queues */
+ ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create EQ.");
+ goto probe1;
+ }
+
+ ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create NEQ.");
+ goto probe3;
+ }
+
+ /* create internal protection domain */
+ ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
+ if (IS_ERR(ibpd)) {
+ ehca_err(&shca->ib_device, "Cannot create internal PD.");
+ ret = PTR_ERR(ibpd);
+ goto probe4;
+ }
+
+ shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
+ shca->pd->ib_pd.device = &shca->ib_device;
+
+ /* create internal max MR */
+ ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
+
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
+ ret);
+ goto probe5;
+ }
+
+ ret = ib_register_device(&shca->ib_device, NULL);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "ib_register_device() failed ret=%i", ret);
+ goto probe6;
+ }
+
+ /* create AQP1 for port 1 */
+ if (ehca_open_aqp1 == 1) {
+ shca->sport[0].port_state = IB_PORT_DOWN;
+ ret = ehca_create_aqp1(shca, 1);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "Cannot create AQP1 for port 1.");
+ goto probe7;
+ }
+ }
+
+ /* create AQP1 for port 2 */
+ if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
+ shca->sport[1].port_state = IB_PORT_DOWN;
+ ret = ehca_create_aqp1(shca, 2);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "Cannot create AQP1 for port 2.");
+ goto probe8;
+ }
+ }
+
+ ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+ if (ret) /* only complain; we can live without attributes */
+ ehca_err(&shca->ib_device,
+ "Cannot create device attributes ret=%d", ret);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ list_add(&shca->shca_list, &shca_list);
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ return 0;
+
+probe8:
+ ret = ehca_destroy_aqp1(&shca->sport[0]);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy AQP1 for port 1. ret=%i", ret);
+
+probe7:
+ ib_unregister_device(&shca->ib_device);
+
+probe6:
+ ret = ehca_dereg_internal_maxmr(shca);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal MR. ret=%x", ret);
+
+probe5:
+ ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal PD. ret=%x", ret);
+
+probe4:
+ ret = ehca_destroy_eq(shca, &shca->neq);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy NEQ. ret=%x", ret);
+
+probe3:
+ ret = ehca_destroy_eq(shca, &shca->eq);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy EQ. ret=%x", ret);
+
+probe1:
+ ib_dealloc_device(&shca->ib_device);
+
+ return -EINVAL;
+}
+
+static int ehca_remove(struct platform_device *dev)
+{
+ struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
+ unsigned long flags;
+ int ret;
+
+ sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+
+ if (ehca_open_aqp1 == 1) {
+ int i;
+ for (i = 0; i < shca->num_ports; i++) {
+ ret = ehca_destroy_aqp1(&shca->sport[i]);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy AQP1 for port %x "
+ "ret=%i", ret, i);
+ }
+ }
+
+ ib_unregister_device(&shca->ib_device);
+
+ ret = ehca_dereg_internal_maxmr(shca);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal MR. ret=%i", ret);
+
+ ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal PD. ret=%i", ret);
+
+ ret = ehca_destroy_eq(shca, &shca->eq);
+ if (ret)
+ ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
+
+ ret = ehca_destroy_eq(shca, &shca->neq);
+ if (ret)
+ ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
+
+ ib_dealloc_device(&shca->ib_device);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ list_del(&shca->shca_list);
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ return ret;
+}
+
+static struct of_device_id ehca_device_table[] =
+{
+ {
+ .name = "lhca",
+ .compatible = "IBM,lhca",
+ },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
+
+static struct platform_driver ehca_driver = {
+ .probe = ehca_probe,
+ .remove = ehca_remove,
+ .driver = {
+ .name = "ehca",
+ .owner = THIS_MODULE,
+ .groups = ehca_drv_attr_groups,
+ .of_match_table = ehca_device_table,
+ },
+};
+
+void ehca_poll_eqs(unsigned long data)
+{
+ struct ehca_shca *shca;
+
+ spin_lock(&shca_list_lock);
+ list_for_each_entry(shca, &shca_list, shca_list) {
+ if (shca->eq.is_initialized) {
+ /* call deadman proc only if eq ptr does not change */
+ struct ehca_eq *eq = &shca->eq;
+ int max = 3;
+ volatile u64 q_ofs, q_ofs2;
+ unsigned long flags;
+ spin_lock_irqsave(&eq->spinlock, flags);
+ q_ofs = eq->ipz_queue.current_q_offset;
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+ do {
+ spin_lock_irqsave(&eq->spinlock, flags);
+ q_ofs2 = eq->ipz_queue.current_q_offset;
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+ max--;
+ } while (q_ofs == q_ofs2 && max > 0);
+ if (q_ofs == q_ofs2)
+ ehca_process_eq(shca, 0);
+ }
+ }
+ mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
+ spin_unlock(&shca_list_lock);
+}
+
+static int ehca_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ static unsigned long ehca_dmem_warn_time;
+ unsigned long flags;
+
+ switch (action) {
+ case MEM_CANCEL_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ return NOTIFY_OK;
+ case MEM_GOING_ONLINE:
+ case MEM_GOING_OFFLINE:
+ /* only ok if no hca is attached to the lpar */
+ spin_lock_irqsave(&shca_list_lock, flags);
+ if (list_empty(&shca_list)) {
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+ return NOTIFY_OK;
+ } else {
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+ if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+ 30 * 1000))
+ ehca_gen_err("DMEM operations are not allowed"
+ "in conjunction with eHCA");
+ return NOTIFY_BAD;
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+ .notifier_call = ehca_mem_notifier,
+};
+
+static int __init ehca_module_init(void)
+{
+ int ret;
+
+ printk(KERN_INFO "eHCA Infiniband Device Driver "
+ "(Version " HCAD_VERSION ")\n");
+
+ ret = ehca_create_comp_pool();
+ if (ret) {
+ ehca_gen_err("Cannot create comp pool.");
+ return ret;
+ }
+
+ ret = ehca_create_slab_caches();
+ if (ret) {
+ ehca_gen_err("Cannot create SLAB caches");
+ ret = -ENOMEM;
+ goto module_init1;
+ }
+
+ ret = ehca_create_busmap();
+ if (ret) {
+ ehca_gen_err("Cannot create busmap.");
+ goto module_init2;
+ }
+
+ ret = ibmebus_register_driver(&ehca_driver);
+ if (ret) {
+ ehca_gen_err("Cannot register eHCA device driver");
+ ret = -EINVAL;
+ goto module_init3;
+ }
+
+ ret = register_memory_notifier(&ehca_mem_nb);
+ if (ret) {
+ ehca_gen_err("Failed registering memory add/remove notifier");
+ goto module_init4;
+ }
+
+ if (ehca_poll_all_eqs != 1) {
+ ehca_gen_err("WARNING!!!");
+ ehca_gen_err("It is possible to lose interrupts.");
+ } else {
+ init_timer(&poll_eqs_timer);
+ poll_eqs_timer.function = ehca_poll_eqs;
+ poll_eqs_timer.expires = jiffies + HZ;
+ add_timer(&poll_eqs_timer);
+ }
+
+ return 0;
+
+module_init4:
+ ibmebus_unregister_driver(&ehca_driver);
+
+module_init3:
+ ehca_destroy_busmap();
+
+module_init2:
+ ehca_destroy_slab_caches();
+
+module_init1:
+ ehca_destroy_comp_pool();
+ return ret;
+};
+
+static void __exit ehca_module_exit(void)
+{
+ if (ehca_poll_all_eqs == 1)
+ del_timer_sync(&poll_eqs_timer);
+
+ ibmebus_unregister_driver(&ehca_driver);
+
+ unregister_memory_notifier(&ehca_mem_nb);
+
+ ehca_destroy_busmap();
+
+ ehca_destroy_slab_caches();
+
+ ehca_destroy_comp_pool();
+
+ idr_destroy(&ehca_cq_idr);
+ idr_destroy(&ehca_qp_idr);
+};
+
+module_init(ehca_module_init);
+module_exit(ehca_module_exit);
diff --git a/drivers/staging/rdma/ehca/ehca_mcast.c b/drivers/staging/rdma/ehca/ehca_mcast.c
new file mode 100644
index 000000000000..cec181532924
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_mcast.c
@@ -0,0 +1,131 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * mcast functions
+ *
+ * Authors: Khadija Souissi <souissik@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define MAX_MC_LID 0xFFFE
+#define MIN_MC_LID 0xC000 /* Multicast limits */
+#define EHCA_VALID_MULTICAST_GID(gid) ((gid)[0] == 0xFF)
+#define EHCA_VALID_MULTICAST_LID(lid) \
+ (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
+
+int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+ ib_device);
+ union ib_gid my_gid;
+ u64 subnet_prefix, interface_id, h_ret;
+
+ if (ibqp->qp_type != IB_QPT_UD) {
+ ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
+ return -EINVAL;
+ }
+
+ if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+ ehca_err(ibqp->device, "invalid mulitcast gid");
+ return -EINVAL;
+ } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+ ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+ return -EINVAL;
+ }
+
+ memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+ subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+ interface_id = be64_to_cpu(my_gid.global.interface_id);
+ h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ my_qp->galpas.kernel,
+ lid, subnet_prefix, interface_id);
+ if (h_ret != H_SUCCESS)
+ ehca_err(ibqp->device,
+ "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
+ "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+ return ehca2ib_return_code(h_ret);
+}
+
+int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(ibqp->pd->device,
+ struct ehca_shca, ib_device);
+ union ib_gid my_gid;
+ u64 subnet_prefix, interface_id, h_ret;
+
+ if (ibqp->qp_type != IB_QPT_UD) {
+ ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
+ return -EINVAL;
+ }
+
+ if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+ ehca_err(ibqp->device, "invalid mulitcast gid");
+ return -EINVAL;
+ } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+ ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+ return -EINVAL;
+ }
+
+ memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+ subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+ interface_id = be64_to_cpu(my_gid.global.interface_id);
+ h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ my_qp->galpas.kernel,
+ lid, subnet_prefix, interface_id);
+ if (h_ret != H_SUCCESS)
+ ehca_err(ibqp->device,
+ "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
+ "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+ return ehca2ib_return_code(h_ret);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c
new file mode 100644
index 000000000000..f914b30999f8
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_mrmw.c
@@ -0,0 +1,2593 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * MR/MW functions
+ *
+ * Authors: Dietmar Decker <ddecker@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "hcp_if.h"
+#include "hipz_hw.h"
+
+#define NUM_CHUNKS(length, chunk_size) \
+ (((length) + (chunk_size - 1)) / (chunk_size))
+
+/* max number of rpages (per hcall register_rpages) */
+#define MAX_RPAGES 512
+
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT SECTION_SIZE_BITS
+#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT 34
+#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+ u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+ struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+ struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
+static struct kmem_cache *mr_cache;
+static struct kmem_cache *mw_cache;
+
+enum ehca_mr_pgsize {
+ EHCA_MR_PGSIZE4K = 0x1000L,
+ EHCA_MR_PGSIZE64K = 0x10000L,
+ EHCA_MR_PGSIZE1M = 0x100000L,
+ EHCA_MR_PGSIZE16M = 0x1000000L
+};
+
+#define EHCA_MR_PGSHIFT4K 12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M 20
+#define EHCA_MR_PGSHIFT16M 24
+
+static u64 ehca_map_vaddr(void *caddr);
+
+static u32 ehca_encode_hwpage_size(u32 pgsize)
+{
+ int log = ilog2(pgsize);
+ WARN_ON(log < 12 || log > 24 || log & 3);
+ return (log - 12) / 4;
+}
+
+static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
+{
+ return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
+}
+
+static struct ehca_mr *ehca_mr_new(void)
+{
+ struct ehca_mr *me;
+
+ me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
+ if (me)
+ spin_lock_init(&me->mrlock);
+ else
+ ehca_gen_err("alloc failed");
+
+ return me;
+}
+
+static void ehca_mr_delete(struct ehca_mr *me)
+{
+ kmem_cache_free(mr_cache, me);
+}
+
+static struct ehca_mw *ehca_mw_new(void)
+{
+ struct ehca_mw *me;
+
+ me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
+ if (me)
+ spin_lock_init(&me->mwlock);
+ else
+ ehca_gen_err("alloc failed");
+
+ return me;
+}
+
+static void ehca_mw_delete(struct ehca_mw *me)
+{
+ kmem_cache_free(mw_cache, me);
+}
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *ib_mr;
+ int ret;
+ struct ehca_mr *e_maxmr;
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+
+ if (shca->maxmr) {
+ e_maxmr = ehca_mr_new();
+ if (!e_maxmr) {
+ ehca_err(&shca->ib_device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto get_dma_mr_exit0;
+ }
+
+ ret = ehca_reg_maxmr(shca, e_maxmr,
+ (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
+ mr_access_flags, e_pd,
+ &e_maxmr->ib.ib_mr.lkey,
+ &e_maxmr->ib.ib_mr.rkey);
+ if (ret) {
+ ehca_mr_delete(e_maxmr);
+ ib_mr = ERR_PTR(ret);
+ goto get_dma_mr_exit0;
+ }
+ ib_mr = &e_maxmr->ib.ib_mr;
+ } else {
+ ehca_err(&shca->ib_device, "no internal max-MR exist!");
+ ib_mr = ERR_PTR(-EINVAL);
+ goto get_dma_mr_exit0;
+ }
+
+get_dma_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
+ PTR_ERR(ib_mr), pd, mr_access_flags);
+ return ib_mr;
+} /* end ehca_get_dma_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *ib_mr;
+ int ret;
+ struct ehca_mr *e_mr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+ u64 size;
+
+ if ((num_phys_buf <= 0) || !phys_buf_array) {
+ ehca_err(pd->device, "bad input values: num_phys_buf=%x "
+ "phys_buf_array=%p", num_phys_buf, phys_buf_array);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+
+ /* check physical buffer list and calculate size */
+ ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
+ iova_start, &size);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit0;
+ }
+ if ((size == 0) ||
+ (((u64)iova_start + size) < (u64)iova_start)) {
+ ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
+ size, iova_start);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(pd->device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto reg_phys_mr_exit0;
+ }
+
+ /* register MR on HCA */
+ if (ehca_mr_is_maxmr(size, iova_start)) {
+ e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+ ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
+ e_pd, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit1;
+ }
+ } else {
+ struct ehca_mr_pginfo pginfo;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hw_pgsize;
+
+ num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
+ PAGE_SIZE);
+ /* for kernel space we try most possible pgsize */
+ hw_pgsize = ehca_get_max_hwpage_size(shca);
+ num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
+ hw_pgsize);
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.phy.num_phys_buf = num_phys_buf;
+ pginfo.u.phy.phys_buf_array = phys_buf_array;
+ pginfo.next_hwpage =
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit1;
+ }
+ }
+
+ /* successful registration of all pages */
+ return &e_mr->ib.ib_mr;
+
+reg_phys_mr_exit1:
+ ehca_mr_delete(e_mr);
+reg_phys_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
+ "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
+ PTR_ERR(ib_mr), pd, phys_buf_array,
+ num_phys_buf, mr_access_flags, iova_start);
+ return ib_mr;
+} /* end ehca_reg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct ib_mr *ib_mr;
+ struct ehca_mr *e_mr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_mr_pginfo pginfo;
+ int ret, page_shift;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hwpage_size;
+
+ if (!pd) {
+ ehca_gen_err("bad pd=%p", pd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit0;
+ }
+
+ if (length == 0 || virt + length < virt) {
+ ehca_err(pd->device, "bad input values: length=%llx "
+ "virt_base=%llx", length, virt);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(pd->device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto reg_user_mr_exit0;
+ }
+
+ e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(e_mr->umem)) {
+ ib_mr = (void *)e_mr->umem;
+ goto reg_user_mr_exit1;
+ }
+
+ if (e_mr->umem->page_size != PAGE_SIZE) {
+ ehca_err(pd->device, "page size not supported, "
+ "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit2;
+ }
+
+ /* determine number of MR pages */
+ num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
+ /* select proper hw_pgsize */
+ page_shift = PAGE_SHIFT;
+ if (e_mr->umem->hugetlb) {
+ /* determine page_shift, clamp between 4K and 16M */
+ page_shift = (fls64(length - 1) + 3) & ~3;
+ page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+ EHCA_MR_PGSHIFT16M);
+ }
+ hwpage_size = 1UL << page_shift;
+
+ /* now that we have the desired page size, shift until it's
+ * supported, too. 4K is always supported, so this terminates.
+ */
+ while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+ hwpage_size >>= 4;
+
+reg_user_mr_fallback:
+ num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
+ /* register MR on HCA */
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_USER;
+ pginfo.hwpage_size = hwpage_size;
+ pginfo.num_kpages = num_kpages;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.usr.region = e_mr->umem;
+ pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
+ pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
+ ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+ ehca_warn(pd->device, "failed to register mr "
+ "with hwpage_size=%llx", hwpage_size);
+ ehca_info(pd->device, "try to register mr with "
+ "kpage_size=%lx", PAGE_SIZE);
+ /*
+ * this means kpages are not contiguous for a hw page
+ * try kernel page size as fallback solution
+ */
+ hwpage_size = PAGE_SIZE;
+ goto reg_user_mr_fallback;
+ }
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_user_mr_exit2;
+ }
+
+ /* successful registration of all pages */
+ return &e_mr->ib.ib_mr;
+
+reg_user_mr_exit2:
+ ib_umem_release(e_mr->umem);
+reg_user_mr_exit1:
+ ehca_mr_delete(e_mr);
+reg_user_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
+ PTR_ERR(ib_mr), pd, mr_access_flags, udata);
+ return ib_mr;
+} /* end ehca_reg_user_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ int ret;
+
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+ u64 new_size;
+ u64 *new_start;
+ u32 new_acl;
+ struct ehca_pd *new_pd;
+ u32 tmp_lkey, tmp_rkey;
+ unsigned long sl_flags;
+ u32 num_kpages = 0;
+ u32 num_hwpages = 0;
+ struct ehca_mr_pginfo pginfo;
+
+ if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
+ /* TODO not supported, because PHYP rereg hCall needs pages */
+ ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
+ "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ if (mr_rereg_mask & IB_MR_REREG_PD) {
+ if (!pd) {
+ ehca_err(mr->device, "rereg with bad pd, pd=%p "
+ "mr_rereg_mask=%x", pd, mr_rereg_mask);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ }
+
+ if ((mr_rereg_mask &
+ ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
+ (mr_rereg_mask == 0)) {
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ /* check other parameters */
+ if (e_mr == shca->maxmr) {
+ /* should be impossible, however reject to be sure */
+ ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
+ "shca->maxmr=%p mr->lkey=%x",
+ mr, shca->maxmr, mr->lkey);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
+ if (e_mr->flags & EHCA_MR_FLAG_FMR) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p "
+ "flags=%x", mr, e_mr->flags);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ if (!phys_buf_array || num_phys_buf <= 0) {
+ ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
+ " phys_buf_array=%p num_phys_buf=%x",
+ mr_rereg_mask, phys_buf_array, num_phys_buf);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ }
+ if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */
+ (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
+ "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ /* set requested values dependent on rereg request */
+ spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+ new_start = e_mr->start;
+ new_size = e_mr->size;
+ new_acl = e_mr->acl;
+ new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
+
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
+
+ new_start = iova_start; /* change address */
+ /* check physical buffer list and calculate size */
+ ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
+ num_phys_buf, iova_start,
+ &new_size);
+ if (ret)
+ goto rereg_phys_mr_exit1;
+ if ((new_size == 0) ||
+ (((u64)iova_start + new_size) < (u64)iova_start)) {
+ ehca_err(mr->device, "bad input values: new_size=%llx "
+ "iova_start=%p", new_size, iova_start);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit1;
+ }
+ num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
+ new_size, PAGE_SIZE);
+ num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
+ new_size, hw_pgsize);
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.phy.num_phys_buf = num_phys_buf;
+ pginfo.u.phy.phys_buf_array = phys_buf_array;
+ pginfo.next_hwpage =
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+ new_acl = mr_access_flags;
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ new_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+ ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
+ new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+ if (ret)
+ goto rereg_phys_mr_exit1;
+
+ /* successful reregistration */
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ mr->pd = pd;
+ mr->lkey = tmp_lkey;
+ mr->rkey = tmp_rkey;
+
+rereg_phys_mr_exit1:
+ spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+rereg_phys_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
+ "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
+ "iova_start=%p",
+ ret, mr, mr_rereg_mask, pd, phys_buf_array,
+ num_phys_buf, mr_access_flags, iova_start);
+ return ret;
+} /* end ehca_rereg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+ unsigned long sl_flags;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+ "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+ ret = -EINVAL;
+ goto query_mr_exit0;
+ }
+
+ memset(mr_attr, 0, sizeof(struct ib_mr_attr));
+ spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+
+ h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
+ "hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle, mr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto query_mr_exit1;
+ }
+ mr_attr->pd = mr->pd;
+ mr_attr->device_virt_addr = hipzout.vaddr;
+ mr_attr->size = hipzout.len;
+ mr_attr->lkey = hipzout.lkey;
+ mr_attr->rkey = hipzout.rkey;
+ ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
+
+query_mr_exit1:
+ spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+query_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
+ ret, mr, mr_attr);
+ return ret;
+} /* end ehca_query_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_mr(struct ib_mr *mr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+
+ if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+ "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+ ret = -EINVAL;
+ goto dereg_mr_exit0;
+ } else if (e_mr == shca->maxmr) {
+ /* should be impossible, however reject to be sure */
+ ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
+ "shca->maxmr=%p mr->lkey=%x",
+ mr, shca->maxmr, mr->lkey);
+ ret = -EINVAL;
+ goto dereg_mr_exit0;
+ }
+
+ /* TODO: BUSY: MR still has bound window(s) */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
+ "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
+ h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle, mr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto dereg_mr_exit0;
+ }
+
+ if (e_mr->umem)
+ ib_umem_release(e_mr->umem);
+
+ /* successful deregistration */
+ ehca_mr_delete(e_mr);
+
+dereg_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
+ return ret;
+} /* end ehca_dereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct ib_mw *ib_mw;
+ u64 h_ret;
+ struct ehca_mw *e_mw;
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_mw_hipzout_parms hipzout;
+
+ if (type != IB_MW_TYPE_1)
+ return ERR_PTR(-EINVAL);
+
+ e_mw = ehca_mw_new();
+ if (!e_mw) {
+ ib_mw = ERR_PTR(-ENOMEM);
+ goto alloc_mw_exit0;
+ }
+
+ h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
+ e_pd->fw_pd, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
+ "shca=%p hca_hndl=%llx mw=%p",
+ h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
+ ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
+ goto alloc_mw_exit1;
+ }
+ /* successful MW allocation */
+ e_mw->ipz_mw_handle = hipzout.handle;
+ e_mw->ib_mw.rkey = hipzout.rkey;
+ return &e_mw->ib_mw;
+
+alloc_mw_exit1:
+ ehca_mw_delete(e_mw);
+alloc_mw_exit0:
+ if (IS_ERR(ib_mw))
+ ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
+ return ib_mw;
+} /* end ehca_alloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ /* TODO: not supported up to now */
+ ehca_gen_err("bind MW currently not supported by HCAD");
+
+ return -EPERM;
+} /* end ehca_bind_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_mw(struct ib_mw *mw)
+{
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mw->device, struct ehca_shca, ib_device);
+ struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
+
+ h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
+ "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
+ h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
+ e_mw->ipz_mw_handle.handle);
+ return ehca2ib_return_code(h_ret);
+ }
+ /* successful deallocation */
+ ehca_mw_delete(e_mw);
+ return 0;
+} /* end ehca_dealloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *ib_fmr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_mr *e_fmr;
+ int ret;
+ u32 tmp_lkey, tmp_rkey;
+ struct ehca_mr_pginfo pginfo;
+ u64 hw_pgsize;
+
+ /* check other parameters */
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+ if (mr_access_flags & IB_ACCESS_MW_BIND) {
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+ if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
+ ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
+ "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
+ fmr_attr->max_pages, fmr_attr->max_maps,
+ fmr_attr->page_shift);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+
+ hw_pgsize = 1 << fmr_attr->page_shift;
+ if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
+ ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
+ fmr_attr->page_shift);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+
+ e_fmr = ehca_mr_new();
+ if (!e_fmr) {
+ ib_fmr = ERR_PTR(-ENOMEM);
+ goto alloc_fmr_exit0;
+ }
+ e_fmr->flags |= EHCA_MR_FLAG_FMR;
+
+ /* register MR on HCA */
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.hwpage_size = hw_pgsize;
+ /*
+ * pginfo.num_hwpages==0, ie register_rpages() will not be called
+ * but deferred to map_phys_fmr()
+ */
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+ mr_access_flags, e_pd, &pginfo,
+ &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_fmr = ERR_PTR(ret);
+ goto alloc_fmr_exit1;
+ }
+
+ /* successful */
+ e_fmr->hwpage_size = hw_pgsize;
+ e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
+ e_fmr->fmr_max_pages = fmr_attr->max_pages;
+ e_fmr->fmr_max_maps = fmr_attr->max_maps;
+ e_fmr->fmr_map_cnt = 0;
+ return &e_fmr->ib.ib_fmr;
+
+alloc_fmr_exit1:
+ ehca_mr_delete(e_fmr);
+alloc_fmr_exit0:
+ return ib_fmr;
+} /* end ehca_alloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list,
+ int list_len,
+ u64 iova)
+{
+ int ret;
+ struct ehca_shca *shca =
+ container_of(fmr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+ struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
+ struct ehca_mr_pginfo pginfo;
+ u32 tmp_lkey, tmp_rkey;
+
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+ e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto map_phys_fmr_exit0;
+ }
+ ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
+ if (ret)
+ goto map_phys_fmr_exit0;
+ if (iova % e_fmr->fmr_page_size) {
+ /* only whole-numbered pages */
+ ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
+ iova, e_fmr->fmr_page_size);
+ ret = -EINVAL;
+ goto map_phys_fmr_exit0;
+ }
+ if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
+ /* HCAD does not limit the maps, however trace this anyway */
+ ehca_info(fmr->device, "map limit exceeded, fmr=%p "
+ "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
+ fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
+ }
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_FMR;
+ pginfo.num_kpages = list_len;
+ pginfo.hwpage_size = e_fmr->hwpage_size;
+ pginfo.num_hwpages =
+ list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
+ pginfo.u.fmr.page_list = page_list;
+ pginfo.next_hwpage =
+ (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
+ pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
+
+ ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
+ list_len * e_fmr->fmr_page_size,
+ e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+ if (ret)
+ goto map_phys_fmr_exit0;
+
+ /* successful reregistration */
+ e_fmr->fmr_map_cnt++;
+ e_fmr->ib.ib_fmr.lkey = tmp_lkey;
+ e_fmr->ib.ib_fmr.rkey = tmp_rkey;
+ return 0;
+
+map_phys_fmr_exit0:
+ if (ret)
+ ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
+ "iova=%llx", ret, fmr, page_list, list_len, iova);
+ return ret;
+} /* end ehca_map_phys_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_fmr(struct list_head *fmr_list)
+{
+ int ret = 0;
+ struct ib_fmr *ib_fmr;
+ struct ehca_shca *shca = NULL;
+ struct ehca_shca *prev_shca;
+ struct ehca_mr *e_fmr;
+ u32 num_fmr = 0;
+ u32 unmap_fmr_cnt = 0;
+
+ /* check all FMR belong to same SHCA, and check internal flag */
+ list_for_each_entry(ib_fmr, fmr_list, list) {
+ prev_shca = shca;
+ shca = container_of(ib_fmr->device, struct ehca_shca,
+ ib_device);
+ e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+ if ((shca != prev_shca) && prev_shca) {
+ ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
+ "prev_shca=%p e_fmr=%p",
+ shca, prev_shca, e_fmr);
+ ret = -EINVAL;
+ goto unmap_fmr_exit0;
+ }
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
+ "e_fmr->flags=%x", e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto unmap_fmr_exit0;
+ }
+ num_fmr++;
+ }
+
+ /* loop over all FMRs to unmap */
+ list_for_each_entry(ib_fmr, fmr_list, list) {
+ unmap_fmr_cnt++;
+ e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+ shca = container_of(ib_fmr->device, struct ehca_shca,
+ ib_device);
+ ret = ehca_unmap_one_fmr(shca, e_fmr);
+ if (ret) {
+ /* unmap failed, stop unmapping of rest of FMRs */
+ ehca_err(&shca->ib_device, "unmap of one FMR failed, "
+ "stop rest, e_fmr=%p num_fmr=%x "
+ "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
+ unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
+ goto unmap_fmr_exit0;
+ }
+ }
+
+unmap_fmr_exit0:
+ if (ret)
+ ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+ ret, fmr_list, num_fmr, unmap_fmr_cnt);
+ return ret;
+} /* end ehca_unmap_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr)
+{
+ int ret;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(fmr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+ e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto free_fmr_exit0;
+ }
+
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
+ "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle, fmr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto free_fmr_exit0;
+ }
+ /* successful deregistration */
+ ehca_mr_delete(e_fmr);
+ return 0;
+
+free_fmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
+ return ret;
+} /* end ehca_dealloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo);
+
+int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey, /*OUT*/
+ u32 *rkey, /*OUT*/
+ enum ehca_reg_type reg_type)
+{
+ int ret;
+ u64 h_ret;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+ if (ehca_use_hp_mr == 1)
+ hipz_acl |= 0x00000001;
+
+ h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
+ (u64)iova_start, size, hipz_acl,
+ e_pd->fw_pd, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
+ "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_reg_mr_exit0;
+ }
+
+ e_mr->ipz_mr_handle = hipzout.handle;
+
+ if (reg_type == EHCA_REG_BUSMAP_MR)
+ ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+ else if (reg_type == EHCA_REG_MR)
+ ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+ else
+ ret = -EINVAL;
+
+ if (ret)
+ goto ehca_reg_mr_exit1;
+
+ /* successful registration */
+ e_mr->num_kpages = pginfo->num_kpages;
+ e_mr->num_hwpages = pginfo->num_hwpages;
+ e_mr->hwpage_size = pginfo->hwpage_size;
+ e_mr->start = iova_start;
+ e_mr->size = size;
+ e_mr->acl = acl;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+
+ehca_reg_mr_exit1:
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
+ "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
+ h_ret, shca, e_mr, iova_start, size, acl, e_pd,
+ hipzout.lkey, pginfo, pginfo->num_kpages,
+ pginfo->num_hwpages, ret);
+ ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
+ "not recoverable");
+ }
+ehca_reg_mr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+ "num_kpages=%llx num_hwpages=%llx",
+ ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
+ pginfo->num_kpages, pginfo->num_hwpages);
+ return ret;
+} /* end ehca_reg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ int ret = 0;
+ u64 h_ret;
+ u32 rnum;
+ u64 rpage;
+ u32 i;
+ u64 *kpage;
+
+ if (!pginfo->num_hwpages) /* in case of fmr */
+ return 0;
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ ret = -ENOMEM;
+ goto ehca_reg_mr_rpages_exit0;
+ }
+
+ /* max MAX_RPAGES ehca mr pages per register call */
+ for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
+
+ if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+ rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
+ if (rnum == 0)
+ rnum = MAX_RPAGES; /* last shot is full */
+ } else
+ rnum = MAX_RPAGES;
+
+ ret = ehca_set_pagebuf(pginfo, rnum, kpage);
+ if (ret) {
+ ehca_err(&shca->ib_device, "ehca_set_pagebuf "
+ "bad rc, ret=%i rnum=%x kpage=%p",
+ ret, rnum, kpage);
+ goto ehca_reg_mr_rpages_exit1;
+ }
+
+ if (rnum > 1) {
+ rpage = __pa(kpage);
+ if (!rpage) {
+ ehca_err(&shca->ib_device, "kpage=%p i=%x",
+ kpage, i);
+ ret = -EFAULT;
+ goto ehca_reg_mr_rpages_exit1;
+ }
+ } else
+ rpage = *kpage;
+
+ h_ret = hipz_h_register_rpage_mr(
+ shca->ipz_hca_handle, e_mr,
+ ehca_encode_hwpage_size(pginfo->hwpage_size),
+ 0, rpage, rnum);
+
+ if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+ /*
+ * check for 'registration complete'==H_SUCCESS
+ * and for 'page registered'==H_PAGE_REGISTERED
+ */
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "last "
+ "hipz_reg_rpage_mr failed, h_ret=%lli "
+ "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
+ " lkey=%x", h_ret, e_mr, i,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ break;
+ } else
+ ret = 0;
+ } else if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
+ "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
+ "mr_hndl=%llx", h_ret, e_mr, i,
+ e_mr->ib.ib_mr.lkey,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle);
+ ret = ehca2ib_return_code(h_ret);
+ break;
+ } else
+ ret = 0;
+ } /* end for(i) */
+
+
+ehca_reg_mr_rpages_exit1:
+ ehca_free_fw_ctrlblock(kpage);
+ehca_reg_mr_rpages_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
+ "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
+ pginfo, pginfo->num_kpages, pginfo->num_hwpages);
+ return ret;
+} /* end ehca_reg_mr_rpages() */
+
+/*----------------------------------------------------------------------*/
+
+inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ u32 acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey, /*OUT*/
+ u32 *rkey) /*OUT*/
+{
+ int ret;
+ u64 h_ret;
+ u32 hipz_acl;
+ u64 *kpage;
+ u64 rpage;
+ struct ehca_mr_pginfo pginfo_save;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ ret = -ENOMEM;
+ goto ehca_rereg_mr_rereg1_exit0;
+ }
+
+ pginfo_save = *pginfo;
+ ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
+ if (ret) {
+ ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
+ "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
+ "kpage=%p", e_mr, pginfo, pginfo->type,
+ pginfo->num_kpages, pginfo->num_hwpages, kpage);
+ goto ehca_rereg_mr_rereg1_exit1;
+ }
+ rpage = __pa(kpage);
+ if (!rpage) {
+ ehca_err(&shca->ib_device, "kpage=%p", kpage);
+ ret = -EFAULT;
+ goto ehca_rereg_mr_rereg1_exit1;
+ }
+ h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
+ (u64)iova_start, size, hipz_acl,
+ e_pd->fw_pd, rpage, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ /*
+ * reregistration unsuccessful, try it again with the 3 hCalls,
+ * e.g. this is required in case H_MR_CONDITION
+ * (MW bound or MR is shared)
+ */
+ ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
+ "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
+ *pginfo = pginfo_save;
+ ret = -EAGAIN;
+ } else if ((u64 *)hipzout.vaddr != iova_start) {
+ ehca_err(&shca->ib_device, "PHYP changed iova_start in "
+ "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
+ "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
+ hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey, hipzout.lkey);
+ ret = -EFAULT;
+ } else {
+ /*
+ * successful reregistration
+ * note: start and start_out are identical for eServer HCAs
+ */
+ e_mr->num_kpages = pginfo->num_kpages;
+ e_mr->num_hwpages = pginfo->num_hwpages;
+ e_mr->hwpage_size = pginfo->hwpage_size;
+ e_mr->start = iova_start;
+ e_mr->size = size;
+ e_mr->acl = acl;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ }
+
+ehca_rereg_mr_rereg1_exit1:
+ ehca_free_fw_ctrlblock(kpage);
+ehca_rereg_mr_rereg1_exit0:
+ if ( ret && (ret != -EAGAIN) )
+ ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
+ "pginfo=%p num_kpages=%llx num_hwpages=%llx",
+ ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
+ pginfo->num_hwpages);
+ return ret;
+} /* end ehca_rereg_mr_rereg1() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey)
+{
+ int ret = 0;
+ u64 h_ret;
+ int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
+ int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
+
+ /* first determine reregistration hCall(s) */
+ if ((pginfo->num_hwpages > MAX_RPAGES) ||
+ (e_mr->num_hwpages > MAX_RPAGES) ||
+ (pginfo->num_hwpages > e_mr->num_hwpages)) {
+ ehca_dbg(&shca->ib_device, "Rereg3 case, "
+ "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
+ pginfo->num_hwpages, e_mr->num_hwpages);
+ rereg_1_hcall = 0;
+ rereg_3_hcall = 1;
+ }
+
+ if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
+ rereg_1_hcall = 0;
+ rereg_3_hcall = 1;
+ e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
+ ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
+ e_mr);
+ }
+
+ if (rereg_1_hcall) {
+ ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
+ acl, e_pd, pginfo, lkey, rkey);
+ if (ret) {
+ if (ret == -EAGAIN)
+ rereg_3_hcall = 1;
+ else
+ goto ehca_rereg_mr_exit0;
+ }
+ }
+
+ if (rereg_3_hcall) {
+ struct ehca_mr save_mr;
+
+ /* first deregister old MR */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+ "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
+ "mr->lkey=%x",
+ h_ret, e_mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_rereg_mr_exit0;
+ }
+ /* clean ehca_mr_t, without changing struct ib_mr and lock */
+ save_mr = *e_mr;
+ ehca_mr_deletenew(e_mr);
+
+ /* set some MR values */
+ e_mr->flags = save_mr.flags;
+ e_mr->hwpage_size = save_mr.hwpage_size;
+ e_mr->fmr_page_size = save_mr.fmr_page_size;
+ e_mr->fmr_max_pages = save_mr.fmr_max_pages;
+ e_mr->fmr_max_maps = save_mr.fmr_max_maps;
+ e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+ e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+ memcpy(&e_mr->flags, &(save_mr.flags),
+ sizeof(struct ehca_mr) - offset);
+ goto ehca_rereg_mr_exit0;
+ }
+ }
+
+ehca_rereg_mr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+ "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
+ "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
+ acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
+ rereg_1_hcall, rereg_3_hcall);
+ return ret;
+} /* end ehca_rereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ struct ehca_mr *e_fmr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_pd *e_pd =
+ container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
+ struct ehca_mr save_fmr;
+ u32 tmp_lkey, tmp_rkey;
+ struct ehca_mr_pginfo pginfo;
+ struct ehca_mr_hipzout_parms hipzout;
+ struct ehca_mr save_mr;
+
+ if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
+ /*
+ * note: after using rereg hcall with len=0,
+ * rereg hcall must be used again for registering pages
+ */
+ h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
+ 0, 0, e_pd->fw_pd, 0, &hipzout);
+ if (h_ret == H_SUCCESS) {
+ /* successful reregistration */
+ e_fmr->start = NULL;
+ e_fmr->size = 0;
+ tmp_lkey = hipzout.lkey;
+ tmp_rkey = hipzout.rkey;
+ return 0;
+ }
+ /*
+ * should not happen, because length checked above,
+ * FMRs are not shared and no MW bound to FMRs
+ */
+ ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
+ "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
+ "mr_hndl=%llx lkey=%x lkey_out=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle,
+ e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
+ /* try free and rereg */
+ }
+
+ /* first free old FMR */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+ "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
+ "lkey=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle,
+ e_fmr->ib.ib_fmr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_unmap_one_fmr_exit0;
+ }
+ /* clean ehca_mr_t, without changing lock */
+ save_fmr = *e_fmr;
+ ehca_mr_deletenew(e_fmr);
+
+ /* set some MR values */
+ e_fmr->flags = save_fmr.flags;
+ e_fmr->hwpage_size = save_fmr.hwpage_size;
+ e_fmr->fmr_page_size = save_fmr.fmr_page_size;
+ e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
+ e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
+ e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
+ e_fmr->acl = save_fmr.acl;
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_FMR;
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+ e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+ &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+ memcpy(&e_fmr->flags, &(save_mr.flags),
+ sizeof(struct ehca_mr) - offset);
+ }
+
+ehca_unmap_one_fmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
+ "fmr_max_pages=%x",
+ ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
+ return ret;
+} /* end ehca_unmap_one_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_smr(struct ehca_shca *shca,
+ struct ehca_mr *e_origmr,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey, /*OUT*/
+ u32 *rkey) /*OUT*/
+{
+ int ret = 0;
+ u64 h_ret;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+ h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+ (u64)iova_start, hipz_acl, e_pd->fw_pd,
+ &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+ "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
+ "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
+ shca->ipz_hca_handle.handle,
+ e_origmr->ipz_mr_handle.handle,
+ e_origmr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_reg_smr_exit0;
+ }
+ /* successful registration */
+ e_newmr->num_kpages = e_origmr->num_kpages;
+ e_newmr->num_hwpages = e_origmr->num_hwpages;
+ e_newmr->hwpage_size = e_origmr->hwpage_size;
+ e_newmr->start = iova_start;
+ e_newmr->size = e_origmr->size;
+ e_newmr->acl = acl;
+ e_newmr->ipz_mr_handle = hipzout.handle;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+
+ehca_reg_smr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
+ "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
+ ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
+ return ret;
+} /* end ehca_reg_smr() */
+
+/*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+ unsigned long ret = idx;
+ ret |= dir << EHCA_DIR_INDEX_SHIFT;
+ ret |= top << EHCA_TOP_INDEX_SHIFT;
+ return __va(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+ ((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+ struct ehca_shca *shca, struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 h_ret = 0;
+ unsigned long page = 0;
+ u64 rpage = __pa(kpage);
+ int page_count;
+
+ void *sectbase = ehca_calc_sectbase(top, dir, idx);
+ if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+ ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
+ "hwpage_size does not fit to "
+ "section start address");
+ }
+ page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+ while (page < page_count) {
+ u64 rnum;
+ for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+ rnum++) {
+ void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+ kpage[rnum] = __pa(pg);
+ }
+
+ h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+ ehca_encode_hwpage_size(pginfo->hwpage_size),
+ 0, rpage, rnum);
+
+ if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+ ehca_err(&shca->ib_device, "register_rpage_mr failed");
+ return h_ret;
+ }
+ }
+ return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+ struct ehca_shca *shca, struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 hret = H_SUCCESS;
+ int idx;
+
+ for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+ continue;
+
+ hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+ pginfo);
+ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+ return hret;
+ }
+ return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+ struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 hret = H_SUCCESS;
+ int dir;
+
+ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ continue;
+
+ hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+ return hret;
+ }
+ return hret;
+}
+
+/* register internal max-MR to internal SHCA */
+int ehca_reg_internal_maxmr(
+ struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr **e_maxmr) /*OUT*/
+{
+ int ret;
+ struct ehca_mr *e_mr;
+ u64 *iova_start;
+ u64 size_maxmr;
+ struct ehca_mr_pginfo pginfo;
+ struct ib_phys_buf ib_pbuf;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hw_pgsize;
+
+ if (!ehca_bmap) {
+ ret = -EFAULT;
+ goto ehca_reg_internal_maxmr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(&shca->ib_device, "out of memory");
+ ret = -ENOMEM;
+ goto ehca_reg_internal_maxmr_exit0;
+ }
+ e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+ /* register internal max-MR on HCA */
+ size_maxmr = ehca_mr_len;
+ iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
+ ib_pbuf.addr = 0;
+ ib_pbuf.size = size_maxmr;
+ num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+ PAGE_SIZE);
+ hw_pgsize = ehca_get_max_hwpage_size(shca);
+ num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
+ hw_pgsize);
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.u.phy.num_phys_buf = 1;
+ pginfo.u.phy.phys_buf_array = &ib_pbuf;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+ &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+ if (ret) {
+ ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+ "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+ "num_hwpages=%x", e_mr, iova_start, size_maxmr,
+ num_kpages, num_hwpages);
+ goto ehca_reg_internal_maxmr_exit1;
+ }
+
+ /* successful registration of all pages */
+ e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
+ e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
+ e_mr->ib.ib_mr.uobject = NULL;
+ atomic_inc(&(e_pd->ib_pd.usecnt));
+ atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
+ *e_maxmr = e_mr;
+ return 0;
+
+ehca_reg_internal_maxmr_exit1:
+ ehca_mr_delete(e_mr);
+ehca_reg_internal_maxmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
+ ret, shca, e_pd, e_maxmr);
+ return ret;
+} /* end ehca_reg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey)
+{
+ u64 h_ret;
+ struct ehca_mr *e_origmr = shca->maxmr;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+ h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+ (u64)iova_start, hipz_acl, e_pd->fw_pd,
+ &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+ "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, e_origmr, shca->ipz_hca_handle.handle,
+ e_origmr->ipz_mr_handle.handle,
+ e_origmr->ib.ib_mr.lkey);
+ return ehca2ib_return_code(h_ret);
+ }
+ /* successful registration */
+ e_newmr->num_kpages = e_origmr->num_kpages;
+ e_newmr->num_hwpages = e_origmr->num_hwpages;
+ e_newmr->hwpage_size = e_origmr->hwpage_size;
+ e_newmr->start = iova_start;
+ e_newmr->size = e_origmr->size;
+ e_newmr->acl = acl;
+ e_newmr->ipz_mr_handle = hipzout.handle;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+} /* end ehca_reg_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
+{
+ int ret;
+ struct ehca_mr *e_maxmr;
+ struct ib_pd *ib_pd;
+
+ if (!shca->maxmr) {
+ ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
+ ret = -EINVAL;
+ goto ehca_dereg_internal_maxmr_exit0;
+ }
+
+ e_maxmr = shca->maxmr;
+ ib_pd = e_maxmr->ib.ib_mr.pd;
+ shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
+
+ ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
+ if (ret) {
+ ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
+ "ret=%i e_maxmr=%p shca=%p lkey=%x",
+ ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
+ shca->maxmr = e_maxmr;
+ goto ehca_dereg_internal_maxmr_exit0;
+ }
+
+ atomic_dec(&ib_pd->usecnt);
+
+ehca_dereg_internal_maxmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
+ ret, shca, shca->maxmr);
+ return ret;
+} /* end ehca_dereg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check physical buffer array of MR verbs for validness and
+ * calculates MR size
+ */
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *size)
+{
+ struct ib_phys_buf *pbuf = phys_buf_array;
+ u64 size_count = 0;
+ u32 i;
+
+ if (num_phys_buf == 0) {
+ ehca_gen_err("bad phys buf array len, num_phys_buf=0");
+ return -EINVAL;
+ }
+ /* check first buffer */
+ if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
+ ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
+ "pbuf->addr=%llx pbuf->size=%llx",
+ iova_start, pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+ if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
+ (num_phys_buf > 1)) {
+ ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
+ "pbuf->size=%llx", pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < num_phys_buf; i++) {
+ if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
+ ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
+ "pbuf->size=%llx",
+ i, pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+ if (((i > 0) && /* not 1st */
+ (i < (num_phys_buf - 1)) && /* not last */
+ (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
+ ehca_gen_err("bad size, i=%x pbuf->size=%llx",
+ i, pbuf->size);
+ return -EINVAL;
+ }
+ size_count += pbuf->size;
+ pbuf++;
+ }
+
+ *size = size_count;
+ return 0;
+} /* end ehca_mr_chk_buf_and_calc_size() */
+
+/*----------------------------------------------------------------------*/
+
+/* check page list of map FMR verb for validness */
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+ u64 *page_list,
+ int list_len)
+{
+ u32 i;
+ u64 *page;
+
+ if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
+ ehca_gen_err("bad list_len, list_len=%x "
+ "e_fmr->fmr_max_pages=%x fmr=%p",
+ list_len, e_fmr->fmr_max_pages, e_fmr);
+ return -EINVAL;
+ }
+
+ /* each page must be aligned */
+ page = page_list;
+ for (i = 0; i < list_len; i++) {
+ if (*page % e_fmr->fmr_page_size) {
+ ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
+ "fmr_page_size=%x", i, *page, page, e_fmr,
+ e_fmr->fmr_page_size);
+ return -EINVAL;
+ }
+ page++;
+ }
+
+ return 0;
+} /* end ehca_fmr_check_page_list() */
+
+/*----------------------------------------------------------------------*/
+
+/* PAGE_SIZE >= pginfo->hwpage_size */
+static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret = 0;
+ u64 pgaddr;
+ u32 j = 0;
+ int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
+ struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+ while (*sg != NULL) {
+ pgaddr = page_to_pfn(sg_page(*sg))
+ << PAGE_SHIFT;
+ *kpage = pgaddr + (pginfo->next_hwpage *
+ pginfo->hwpage_size);
+ if (!(*kpage)) {
+ ehca_gen_err("pgaddr=%llx "
+ "sg_dma_address=%llx "
+ "entry=%llx next_hwpage=%llx",
+ pgaddr, (u64)sg_dma_address(*sg),
+ pginfo->u.usr.next_nmap,
+ pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ (pginfo->next_hwpage)++;
+ kpage++;
+ if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
+ (pginfo->kpage_cnt)++;
+ (pginfo->u.usr.next_nmap)++;
+ pginfo->next_hwpage = 0;
+ *sg = sg_next(*sg);
+ }
+ j++;
+ if (j >= number)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * check given pages for contiguous layout
+ * last page addr is returned in prev_pgaddr for further check
+ */
+static int ehca_check_kpages_per_ate(struct scatterlist **sg,
+ int num_pages,
+ u64 *prev_pgaddr)
+{
+ for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
+ u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
+ if (ehca_debug_level >= 3)
+ ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
+ *(u64 *)__va(pgaddr));
+ if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
+ ehca_gen_err("uncontiguous page found pgaddr=%llx "
+ "prev_pgaddr=%llx entries_left_in_hwpage=%x",
+ pgaddr, *prev_pgaddr, num_pages);
+ return -EINVAL;
+ }
+ *prev_pgaddr = pgaddr;
+ }
+ return 0;
+}
+
+/* PAGE_SIZE < pginfo->hwpage_size */
+static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret = 0;
+ u64 pgaddr, prev_pgaddr;
+ u32 j = 0;
+ int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
+ int nr_kpages = kpages_per_hwpage;
+ struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+ while (*sg != NULL) {
+
+ if (nr_kpages == kpages_per_hwpage) {
+ pgaddr = (page_to_pfn(sg_page(*sg))
+ << PAGE_SHIFT);
+ *kpage = pgaddr;
+ if (!(*kpage)) {
+ ehca_gen_err("pgaddr=%llx entry=%llx",
+ pgaddr, pginfo->u.usr.next_nmap);
+ ret = -EFAULT;
+ return ret;
+ }
+ /*
+ * The first page in a hwpage must be aligned;
+ * the first MR page is exempt from this rule.
+ */
+ if (pgaddr & (pginfo->hwpage_size - 1)) {
+ if (pginfo->hwpage_cnt) {
+ ehca_gen_err(
+ "invalid alignment "
+ "pgaddr=%llx entry=%llx "
+ "mr_pgsize=%llx",
+ pgaddr, pginfo->u.usr.next_nmap,
+ pginfo->hwpage_size);
+ ret = -EFAULT;
+ return ret;
+ }
+ /* first MR page */
+ pginfo->kpage_cnt =
+ (pgaddr &
+ (pginfo->hwpage_size - 1)) >>
+ PAGE_SHIFT;
+ nr_kpages -= pginfo->kpage_cnt;
+ *kpage = pgaddr &
+ ~(pginfo->hwpage_size - 1);
+ }
+ if (ehca_debug_level >= 3) {
+ u64 val = *(u64 *)__va(pgaddr);
+ ehca_gen_dbg("kpage=%llx page=%llx "
+ "value=%016llx",
+ *kpage, pgaddr, val);
+ }
+ prev_pgaddr = pgaddr;
+ *sg = sg_next(*sg);
+ pginfo->kpage_cnt++;
+ pginfo->u.usr.next_nmap++;
+ nr_kpages--;
+ if (!nr_kpages)
+ goto next_kpage;
+ continue;
+ }
+
+ ret = ehca_check_kpages_per_ate(sg, nr_kpages,
+ &prev_pgaddr);
+ if (ret)
+ return ret;
+ pginfo->kpage_cnt += nr_kpages;
+ pginfo->u.usr.next_nmap += nr_kpages;
+
+next_kpage:
+ nr_kpages = kpages_per_hwpage;
+ (pginfo->hwpage_cnt)++;
+ kpage++;
+ j++;
+ if (j >= number)
+ break;
+ }
+
+ return ret;
+}
+
+static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
+ u32 number, u64 *kpage)
+{
+ int ret = 0;
+ struct ib_phys_buf *pbuf;
+ u64 num_hw, offs_hw;
+ u32 i = 0;
+
+ /* loop over desired phys_buf_array entries */
+ while (i < number) {
+ pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
+ num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
+ pbuf->size, pginfo->hwpage_size);
+ offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
+ pginfo->hwpage_size;
+ while (pginfo->next_hwpage < offs_hw + num_hw) {
+ /* sanity check */
+ if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+ (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+ ehca_gen_err("kpage_cnt >= num_kpages, "
+ "kpage_cnt=%llx num_kpages=%llx "
+ "hwpage_cnt=%llx "
+ "num_hwpages=%llx i=%x",
+ pginfo->kpage_cnt,
+ pginfo->num_kpages,
+ pginfo->hwpage_cnt,
+ pginfo->num_hwpages, i);
+ return -EFAULT;
+ }
+ *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
+ (pginfo->next_hwpage * pginfo->hwpage_size);
+ if ( !(*kpage) && pbuf->addr ) {
+ ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
+ "next_hwpage=%llx", pbuf->addr,
+ pbuf->size, pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ (pginfo->next_hwpage)++;
+ if (PAGE_SIZE >= pginfo->hwpage_size) {
+ if (pginfo->next_hwpage %
+ (PAGE_SIZE / pginfo->hwpage_size) == 0)
+ (pginfo->kpage_cnt)++;
+ } else
+ pginfo->kpage_cnt += pginfo->hwpage_size /
+ PAGE_SIZE;
+ kpage++;
+ i++;
+ if (i >= number) break;
+ }
+ if (pginfo->next_hwpage >= offs_hw + num_hw) {
+ (pginfo->u.phy.next_buf)++;
+ pginfo->next_hwpage = 0;
+ }
+ }
+ return ret;
+}
+
+static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
+ u32 number, u64 *kpage)
+{
+ int ret = 0;
+ u64 *fmrlist;
+ u32 i;
+
+ /* loop over desired page_list entries */
+ fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
+ for (i = 0; i < number; i++) {
+ *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
+ pginfo->next_hwpage * pginfo->hwpage_size;
+ if ( !(*kpage) ) {
+ ehca_gen_err("*fmrlist=%llx fmrlist=%p "
+ "next_listelem=%llx next_hwpage=%llx",
+ *fmrlist, fmrlist,
+ pginfo->u.fmr.next_listelem,
+ pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
+ if (pginfo->next_hwpage %
+ (pginfo->u.fmr.fmr_pgsize /
+ pginfo->hwpage_size) == 0) {
+ (pginfo->kpage_cnt)++;
+ (pginfo->u.fmr.next_listelem)++;
+ fmrlist++;
+ pginfo->next_hwpage = 0;
+ } else
+ (pginfo->next_hwpage)++;
+ } else {
+ unsigned int cnt_per_hwpage = pginfo->hwpage_size /
+ pginfo->u.fmr.fmr_pgsize;
+ unsigned int j;
+ u64 prev = *kpage;
+ /* check if adrs are contiguous */
+ for (j = 1; j < cnt_per_hwpage; j++) {
+ u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
+ if (prev + pginfo->u.fmr.fmr_pgsize != p) {
+ ehca_gen_err("uncontiguous fmr pages "
+ "found prev=%llx p=%llx "
+ "idx=%x", prev, p, i + j);
+ return -EINVAL;
+ }
+ prev = p;
+ }
+ pginfo->kpage_cnt += cnt_per_hwpage;
+ pginfo->u.fmr.next_listelem += cnt_per_hwpage;
+ fmrlist += cnt_per_hwpage;
+ }
+ kpage++;
+ }
+ return ret;
+}
+
+/* setup page buffer from page info */
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret;
+
+ switch (pginfo->type) {
+ case EHCA_MR_PGI_PHYS:
+ ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
+ break;
+ case EHCA_MR_PGI_USER:
+ ret = PAGE_SIZE >= pginfo->hwpage_size ?
+ ehca_set_pagebuf_user1(pginfo, number, kpage) :
+ ehca_set_pagebuf_user2(pginfo, number, kpage);
+ break;
+ case EHCA_MR_PGI_FMR:
+ ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
+ break;
+ default:
+ ehca_gen_err("bad pginfo->type=%x", pginfo->type);
+ ret = -EFAULT;
+ break;
+ }
+ return ret;
+} /* end ehca_set_pagebuf() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check MR if it is a max-MR, i.e. uses whole memory
+ * in case it's a max-MR 1 is returned, else 0
+ */
+int ehca_mr_is_maxmr(u64 size,
+ u64 *iova_start)
+{
+ /* a MR is treated as max-MR only if it fits following: */
+ if ((size == ehca_mr_len) &&
+ (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
+ ehca_gen_dbg("this is a max-MR");
+ return 1;
+ } else
+ return 0;
+} /* end ehca_mr_is_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/* map access control for MR/MW. This routine is used for MR and MW. */
+void ehca_mrmw_map_acl(int ib_acl,
+ u32 *hipz_acl)
+{
+ *hipz_acl = 0;
+ if (ib_acl & IB_ACCESS_REMOTE_READ)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
+ if (ib_acl & IB_ACCESS_REMOTE_WRITE)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
+ if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
+ if (ib_acl & IB_ACCESS_LOCAL_WRITE)
+ *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
+ if (ib_acl & IB_ACCESS_MW_BIND)
+ *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
+} /* end ehca_mrmw_map_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/* sets page size in hipz access control for MR/MW. */
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
+{
+ *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
+} /* end ehca_mrmw_set_pgsize_hipz_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * reverse map access control for MR/MW.
+ * This routine is used for MR and MW.
+ */
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+ int *ib_acl) /*OUT*/
+{
+ *ib_acl = 0;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
+ *ib_acl |= IB_ACCESS_REMOTE_READ;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
+ *ib_acl |= IB_ACCESS_REMOTE_WRITE;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
+ *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
+ *ib_acl |= IB_ACCESS_LOCAL_WRITE;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
+ *ib_acl |= IB_ACCESS_MW_BIND;
+} /* end ehca_mrmw_reverse_map_acl() */
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * MR destructor and constructor
+ * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
+ * except struct ib_mr and spinlock
+ */
+void ehca_mr_deletenew(struct ehca_mr *mr)
+{
+ mr->flags = 0;
+ mr->num_kpages = 0;
+ mr->num_hwpages = 0;
+ mr->acl = 0;
+ mr->start = NULL;
+ mr->fmr_page_size = 0;
+ mr->fmr_max_pages = 0;
+ mr->fmr_max_maps = 0;
+ mr->fmr_map_cnt = 0;
+ memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
+ memset(&mr->galpas, 0, sizeof(mr->galpas));
+} /* end ehca_mr_deletenew() */
+
+int ehca_init_mrmw_cache(void)
+{
+ mr_cache = kmem_cache_create("ehca_cache_mr",
+ sizeof(struct ehca_mr), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!mr_cache)
+ return -ENOMEM;
+ mw_cache = kmem_cache_create("ehca_cache_mw",
+ sizeof(struct ehca_mw), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!mw_cache) {
+ kmem_cache_destroy(mr_cache);
+ mr_cache = NULL;
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void ehca_cleanup_mrmw_cache(void)
+{
+ if (mr_cache)
+ kmem_cache_destroy(mr_cache);
+ if (mw_cache)
+ kmem_cache_destroy(mw_cache);
+}
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+ int dir)
+{
+ if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+ ehca_top_bmap->dir[dir] =
+ kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+ if (!ehca_top_bmap->dir[dir])
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+ }
+ return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+ if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+ ehca_bmap->top[top] =
+ kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+ if (!ehca_bmap->top[top])
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+ }
+ return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+ return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+ int top, dir;
+
+ if (!ehca_bmap)
+ return;
+
+ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ continue;
+ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ continue;
+
+ kfree(ehca_bmap->top[top]->dir[dir]);
+ }
+
+ kfree(ehca_bmap->top[top]);
+ }
+
+ kfree(ehca_bmap);
+ ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long i, start_section, end_section;
+ int top, dir, idx;
+
+ if (!nr_pages)
+ return 0;
+
+ if (!ehca_bmap) {
+ ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+ if (!ehca_bmap)
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+ }
+
+ start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+ end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+ for (i = start_section; i < end_section; i++) {
+ int ret;
+ top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+ dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+ idx = i & EHCA_INDEX_MASK;
+
+ ret = ehca_init_bmap(ehca_bmap, top, dir);
+ if (ret) {
+ ehca_destroy_busmap();
+ return ret;
+ }
+ ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+ ehca_mr_len += EHCA_SECTSIZE;
+ }
+ return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+ int page_order;
+
+ if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+ return 0;
+
+ page_order = compound_order(pfn_to_page(pfn));
+ if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+ return 0;
+
+ return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+ unsigned long total_nr_pages, void *arg)
+{
+ int ret;
+ unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+ if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+ return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+ /* Given chunk is >= 16GB -> check for hugepages */
+ start_pfn = initial_pfn;
+ end_pfn = initial_pfn + total_nr_pages;
+ pfn = start_pfn;
+
+ while (pfn < end_pfn) {
+ if (ehca_is_hugepage(pfn)) {
+ /* Add mem found in front of the hugepage */
+ nr_pages = pfn - start_pfn;
+ ret = ehca_update_busmap(start_pfn, nr_pages);
+ if (ret)
+ return ret;
+ /* Skip the hugepage */
+ pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+ start_pfn = pfn;
+ } else
+ pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+ }
+
+ /* Add mem found behind the hugepage(s) */
+ nr_pages = pfn - start_pfn;
+ return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+ int ret;
+
+ ehca_mr_len = 0;
+ ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+ ehca_create_busmap_callback);
+ return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ int top;
+ u64 hret, *kpage;
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ return -ENOMEM;
+ }
+ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ continue;
+ hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+ if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+ break;
+ }
+
+ ehca_free_fw_ctrlblock(kpage);
+
+ if (hret == H_SUCCESS)
+ return 0; /* Everything is fine */
+ else {
+ ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+ "h_ret=%lli e_mr=%p top=%x lkey=%x "
+ "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+ e_mr->ib.ib_mr.lkey,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle);
+ return ehca2ib_return_code(hret);
+ }
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+ int top, dir, idx;
+ unsigned long abs_addr, offset;
+ u64 entry;
+
+ if (!ehca_bmap)
+ return EHCA_INVAL_ADDR;
+
+ abs_addr = __pa(caddr);
+ top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ return EHCA_INVAL_ADDR;
+
+ dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ return EHCA_INVAL_ADDR;
+
+ idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+ entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+ if (ehca_bmap_valid(entry)) {
+ offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+ return entry | offset;
+ } else
+ return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+ size_t size, enum dma_data_direction direction)
+{
+ if (cpu_addr)
+ return ehca_map_vaddr(cpu_addr);
+ else
+ return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ if (offset + size > PAGE_SIZE)
+ return EHCA_INVAL_ADDR;
+
+ addr = ehca_map_vaddr(page_address(page));
+ if (!ehca_dma_mapping_error(dev, addr))
+ addr += offset;
+
+ return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ u64 addr;
+ addr = ehca_map_vaddr(sg_virt(sg));
+ if (ehca_dma_mapping_error(dev, addr))
+ return 0;
+
+ sg->dma_address = addr;
+ sg->dma_length = sg->length;
+ }
+ return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+ u64 dma_addr;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p) {
+ addr = page_address(p);
+ dma_addr = ehca_map_vaddr(addr);
+ if (ehca_dma_mapping_error(dev, dma_addr)) {
+ free_pages((unsigned long)addr, get_order(size));
+ return NULL;
+ }
+ if (dma_handle)
+ *dma_handle = dma_addr;
+ return addr;
+ }
+ return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ if (cpu_addr && size)
+ free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+ .mapping_error = ehca_dma_mapping_error,
+ .map_single = ehca_dma_map_single,
+ .unmap_single = ehca_dma_unmap_single,
+ .map_page = ehca_dma_map_page,
+ .unmap_page = ehca_dma_unmap_page,
+ .map_sg = ehca_dma_map_sg,
+ .unmap_sg = ehca_dma_unmap_sg,
+ .sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
+ .sync_single_for_device = ehca_dma_sync_single_for_device,
+ .alloc_coherent = ehca_dma_alloc_coherent,
+ .free_coherent = ehca_dma_free_coherent,
+};
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h
new file mode 100644
index 000000000000..50d8b51306dd
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_mrmw.h
@@ -0,0 +1,132 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * MR/MW declarations and inline functions
+ *
+ * Authors: Dietmar Decker <ddecker@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _EHCA_MRMW_H_
+#define _EHCA_MRMW_H_
+
+enum ehca_reg_type {
+ EHCA_REG_MR,
+ EHCA_REG_BUSMAP_MR
+};
+
+int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey,
+ enum ehca_reg_type reg_type);
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo);
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int mr_access_flags,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ struct ehca_mr *e_fmr);
+
+int ehca_reg_smr(struct ehca_shca *shca,
+ struct ehca_mr *e_origmr,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_reg_internal_maxmr(struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr **maxmr);
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
+
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *size);
+
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+ u64 *page_list,
+ int list_len);
+
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage);
+
+int ehca_mr_is_maxmr(u64 size,
+ u64 *iova_start);
+
+void ehca_mrmw_map_acl(int ib_acl,
+ u32 *hipz_acl);
+
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
+
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+ int *ib_acl);
+
+void ehca_mr_deletenew(struct ehca_mr *mr);
+
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+#endif /*_EHCA_MRMW_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_pd.c b/drivers/staging/rdma/ehca/ehca_pd.c
new file mode 100644
index 000000000000..351577a6670a
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_pd.c
@@ -0,0 +1,124 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * PD functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+
+static struct kmem_cache *pd_cache;
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ struct ehca_pd *pd;
+ int i;
+
+ pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
+ if (!pd) {
+ ehca_err(device, "device=%p context=%p out of memory",
+ device, context);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < 2; i++) {
+ INIT_LIST_HEAD(&pd->free[i]);
+ INIT_LIST_HEAD(&pd->full[i]);
+ }
+ mutex_init(&pd->lock);
+
+ /*
+ * Kernel PD: when device = -1, 0
+ * User PD: when context != -1
+ */
+ if (!context) {
+ /*
+ * Kernel PDs after init reuses always
+ * the one created in ehca_shca_reopen()
+ */
+ struct ehca_shca *shca = container_of(device, struct ehca_shca,
+ ib_device);
+ pd->fw_pd.value = shca->pd->fw_pd.value;
+ } else
+ pd->fw_pd.value = (u64)pd;
+
+ return &pd->ib_pd;
+}
+
+int ehca_dealloc_pd(struct ib_pd *pd)
+{
+ struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+ int i, leftovers = 0;
+ struct ipz_small_queue_page *page, *tmp;
+
+ for (i = 0; i < 2; i++) {
+ list_splice(&my_pd->full[i], &my_pd->free[i]);
+ list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
+ leftovers = 1;
+ free_page(page->page);
+ kmem_cache_free(small_qp_cache, page);
+ }
+ }
+
+ if (leftovers)
+ ehca_warn(pd->device,
+ "Some small queue pages were not freed");
+
+ kmem_cache_free(pd_cache, my_pd);
+
+ return 0;
+}
+
+int ehca_init_pd_cache(void)
+{
+ pd_cache = kmem_cache_create("ehca_cache_pd",
+ sizeof(struct ehca_pd), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!pd_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_pd_cache(void)
+{
+ if (pd_cache)
+ kmem_cache_destroy(pd_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_qes.h b/drivers/staging/rdma/ehca/ehca_qes.h
new file mode 100644
index 000000000000..90c4efa67586
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_qes.h
@@ -0,0 +1,260 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Hardware request structures
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _EHCA_QES_H_
+#define _EHCA_QES_H_
+
+#include "ehca_tools.h"
+
+/* virtual scatter gather entry to specify remote addresses with length */
+struct ehca_vsgentry {
+ u64 vaddr;
+ u32 lkey;
+ u32 length;
+};
+
+#define GRH_FLAG_MASK EHCA_BMASK_IBM( 7, 7)
+#define GRH_IPVERSION_MASK EHCA_BMASK_IBM( 0, 3)
+#define GRH_TCLASS_MASK EHCA_BMASK_IBM( 4, 12)
+#define GRH_FLOWLABEL_MASK EHCA_BMASK_IBM(13, 31)
+#define GRH_PAYLEN_MASK EHCA_BMASK_IBM(32, 47)
+#define GRH_NEXTHEADER_MASK EHCA_BMASK_IBM(48, 55)
+#define GRH_HOPLIMIT_MASK EHCA_BMASK_IBM(56, 63)
+
+/*
+ * Unreliable Datagram Address Vector Format
+ * see IBTA Vol1 chapter 8.3 Global Routing Header
+ */
+struct ehca_ud_av {
+ u8 sl;
+ u8 lnh;
+ u16 dlid;
+ u8 reserved1;
+ u8 reserved2;
+ u8 reserved3;
+ u8 slid_path_bits;
+ u8 reserved4;
+ u8 ipd;
+ u8 reserved5;
+ u8 pmtu;
+ u32 reserved6;
+ u64 reserved7;
+ union {
+ struct {
+ u64 word_0; /* always set to 6 */
+ /*should be 0x1B for IB transport */
+ u64 word_1;
+ u64 word_2;
+ u64 word_3;
+ u64 word_4;
+ } grh;
+ struct {
+ u32 wd_0;
+ u32 wd_1;
+ /* DWord_1 --> SGID */
+
+ u32 sgid_wd3;
+ u32 sgid_wd2;
+
+ u32 sgid_wd1;
+ u32 sgid_wd0;
+ /* DWord_3 --> DGID */
+
+ u32 dgid_wd3;
+ u32 dgid_wd2;
+
+ u32 dgid_wd1;
+ u32 dgid_wd0;
+ } grh_l;
+ };
+};
+
+/* maximum number of sg entries allowed in a WQE */
+#define MAX_WQE_SG_ENTRIES 252
+
+#define WQE_OPTYPE_SEND 0x80
+#define WQE_OPTYPE_RDMAREAD 0x40
+#define WQE_OPTYPE_RDMAWRITE 0x20
+#define WQE_OPTYPE_CMPSWAP 0x10
+#define WQE_OPTYPE_FETCHADD 0x08
+#define WQE_OPTYPE_BIND 0x04
+
+#define WQE_WRFLAG_REQ_SIGNAL_COM 0x80
+#define WQE_WRFLAG_FENCE 0x40
+#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
+#define WQE_WRFLAG_SOLIC_EVENT 0x10
+
+#define WQEF_CACHE_HINT 0x80
+#define WQEF_CACHE_HINT_RD_WR 0x40
+#define WQEF_TIMED_WQE 0x20
+#define WQEF_PURGE 0x08
+#define WQEF_HIGH_NIBBLE 0xF0
+
+#define MW_BIND_ACCESSCTRL_R_WRITE 0x40
+#define MW_BIND_ACCESSCTRL_R_READ 0x20
+#define MW_BIND_ACCESSCTRL_R_ATOMIC 0x10
+
+struct ehca_wqe {
+ u64 work_request_id;
+ u8 optype;
+ u8 wr_flag;
+ u16 pkeyi;
+ u8 wqef;
+ u8 nr_of_data_seg;
+ u16 wqe_provided_slid;
+ u32 destination_qp_number;
+ u32 resync_psn_sqp;
+ u32 local_ee_context_qkey;
+ u32 immediate_data;
+ union {
+ struct {
+ u64 remote_virtual_address;
+ u32 rkey;
+ u32 reserved;
+ u64 atomic_1st_op_dma_len;
+ u64 atomic_2nd_op;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+
+ } nud;
+ struct {
+ u64 ehca_ud_av_ptr;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+ } ud_avp;
+ struct {
+ struct ehca_ud_av ud_av;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
+ 2];
+ } ud_av;
+ struct {
+ u64 reserved0;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+ } all_rcv;
+
+ struct {
+ u64 reserved;
+ u32 rkey;
+ u32 old_rkey;
+ u64 reserved1;
+ u64 reserved2;
+ u64 virtual_address;
+ u32 reserved3;
+ u32 length;
+ u32 reserved4;
+ u16 reserved5;
+ u8 reserved6;
+ u8 lr_ctl;
+ u32 lkey;
+ u32 reserved7;
+ u64 reserved8;
+ u64 reserved9;
+ u64 reserved10;
+ u64 reserved11;
+ } bind;
+ struct {
+ u64 reserved12;
+ u64 reserved13;
+ u32 size;
+ u32 start;
+ } inline_data;
+ } u;
+
+};
+
+#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
+#define WC_IMM_DATA EHCA_BMASK_IBM(1, 1)
+#define WC_GRH_PRESENT EHCA_BMASK_IBM(2, 2)
+#define WC_SE_BIT EHCA_BMASK_IBM(3, 3)
+#define WC_STATUS_ERROR_BIT 0x80000000
+#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
+#define WC_STATUS_PURGE_BIT 0x10
+#define WC_SEND_RECEIVE_BIT 0x80
+
+struct ehca_cqe {
+ u64 work_request_id;
+ u8 optype;
+ u8 w_completion_flags;
+ u16 reserved1;
+ u32 nr_bytes_transferred;
+ u32 immediate_data;
+ u32 local_qp_number;
+ u8 freed_resource_count;
+ u8 service_level;
+ u16 wqe_count;
+ u32 qp_token;
+ u32 qkey_ee_token;
+ u32 remote_qp_number;
+ u16 dlid;
+ u16 rlid;
+ u16 reserved2;
+ u16 pkey_index;
+ u32 cqe_timestamp;
+ u32 wqe_timestamp;
+ u8 wqe_timestamp_valid;
+ u8 reserved3;
+ u8 reserved4;
+ u8 cqe_flags;
+ u32 status;
+};
+
+struct ehca_eqe {
+ u64 entry;
+};
+
+struct ehca_mrte {
+ u64 starting_va;
+ u64 length; /* length of memory region in bytes*/
+ u32 pd;
+ u8 key_instance;
+ u8 pagesize;
+ u8 mr_control;
+ u8 local_remote_access_ctrl;
+ u8 reserved[0x20 - 0x18];
+ u64 at_pointer[4];
+};
+#endif /*_EHCA_QES_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_qp.c b/drivers/staging/rdma/ehca/ehca_qp.c
new file mode 100644
index 000000000000..2e89356c46fa
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_qp.c
@@ -0,0 +1,2257 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * QP functions
+ *
+ * Authors: Joachim Fenkes <fenkes@de.ibm.com>
+ * Stefan Roscher <stefan.roscher@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+static struct kmem_cache *qp_cache;
+
+/*
+ * attributes not supported by query qp
+ */
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS | \
+ IB_QP_EN_SQD_ASYNC_NOTIFY)
+
+/*
+ * ehca (internal) qp state values
+ */
+enum ehca_qp_state {
+ EHCA_QPS_RESET = 1,
+ EHCA_QPS_INIT = 2,
+ EHCA_QPS_RTR = 3,
+ EHCA_QPS_RTS = 5,
+ EHCA_QPS_SQD = 6,
+ EHCA_QPS_SQE = 8,
+ EHCA_QPS_ERR = 128
+};
+
+/*
+ * qp state transitions as defined by IB Arch Rel 1.1 page 431
+ */
+enum ib_qp_statetrans {
+ IB_QPST_ANY2RESET,
+ IB_QPST_ANY2ERR,
+ IB_QPST_RESET2INIT,
+ IB_QPST_INIT2RTR,
+ IB_QPST_INIT2INIT,
+ IB_QPST_RTR2RTS,
+ IB_QPST_RTS2SQD,
+ IB_QPST_RTS2RTS,
+ IB_QPST_SQD2RTS,
+ IB_QPST_SQE2RTS,
+ IB_QPST_SQD2SQD,
+ IB_QPST_MAX /* nr of transitions, this must be last!!! */
+};
+
+/*
+ * ib2ehca_qp_state maps IB to ehca qp_state
+ * returns ehca qp state corresponding to given ib qp state
+ */
+static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
+{
+ switch (ib_qp_state) {
+ case IB_QPS_RESET:
+ return EHCA_QPS_RESET;
+ case IB_QPS_INIT:
+ return EHCA_QPS_INIT;
+ case IB_QPS_RTR:
+ return EHCA_QPS_RTR;
+ case IB_QPS_RTS:
+ return EHCA_QPS_RTS;
+ case IB_QPS_SQD:
+ return EHCA_QPS_SQD;
+ case IB_QPS_SQE:
+ return EHCA_QPS_SQE;
+ case IB_QPS_ERR:
+ return EHCA_QPS_ERR;
+ default:
+ ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
+ return -EINVAL;
+ }
+}
+
+/*
+ * ehca2ib_qp_state maps ehca to IB qp_state
+ * returns ib qp state corresponding to given ehca qp state
+ */
+static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
+ ehca_qp_state)
+{
+ switch (ehca_qp_state) {
+ case EHCA_QPS_RESET:
+ return IB_QPS_RESET;
+ case EHCA_QPS_INIT:
+ return IB_QPS_INIT;
+ case EHCA_QPS_RTR:
+ return IB_QPS_RTR;
+ case EHCA_QPS_RTS:
+ return IB_QPS_RTS;
+ case EHCA_QPS_SQD:
+ return IB_QPS_SQD;
+ case EHCA_QPS_SQE:
+ return IB_QPS_SQE;
+ case EHCA_QPS_ERR:
+ return IB_QPS_ERR;
+ default:
+ ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
+ return -EINVAL;
+ }
+}
+
+/*
+ * ehca_qp_type used as index for req_attr and opt_attr of
+ * struct ehca_modqp_statetrans
+ */
+enum ehca_qp_type {
+ QPT_RC = 0,
+ QPT_UC = 1,
+ QPT_UD = 2,
+ QPT_SQP = 3,
+ QPT_MAX
+};
+
+/*
+ * ib2ehcaqptype maps Ib to ehca qp_type
+ * returns ehca qp type corresponding to ib qp type
+ */
+static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
+{
+ switch (ibqptype) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return QPT_SQP;
+ case IB_QPT_RC:
+ return QPT_RC;
+ case IB_QPT_UC:
+ return QPT_UC;
+ case IB_QPT_UD:
+ return QPT_UD;
+ default:
+ ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+ return -EINVAL;
+ }
+}
+
+static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
+ int ib_tostate)
+{
+ int index = -EINVAL;
+ switch (ib_tostate) {
+ case IB_QPS_RESET:
+ index = IB_QPST_ANY2RESET;
+ break;
+ case IB_QPS_INIT:
+ switch (ib_fromstate) {
+ case IB_QPS_RESET:
+ index = IB_QPST_RESET2INIT;
+ break;
+ case IB_QPS_INIT:
+ index = IB_QPST_INIT2INIT;
+ break;
+ }
+ break;
+ case IB_QPS_RTR:
+ if (ib_fromstate == IB_QPS_INIT)
+ index = IB_QPST_INIT2RTR;
+ break;
+ case IB_QPS_RTS:
+ switch (ib_fromstate) {
+ case IB_QPS_RTR:
+ index = IB_QPST_RTR2RTS;
+ break;
+ case IB_QPS_RTS:
+ index = IB_QPST_RTS2RTS;
+ break;
+ case IB_QPS_SQD:
+ index = IB_QPST_SQD2RTS;
+ break;
+ case IB_QPS_SQE:
+ index = IB_QPST_SQE2RTS;
+ break;
+ }
+ break;
+ case IB_QPS_SQD:
+ if (ib_fromstate == IB_QPS_RTS)
+ index = IB_QPST_RTS2SQD;
+ break;
+ case IB_QPS_SQE:
+ break;
+ case IB_QPS_ERR:
+ index = IB_QPST_ANY2ERR;
+ break;
+ default:
+ break;
+ }
+ return index;
+}
+
+/*
+ * ibqptype2servicetype returns hcp service type corresponding to given
+ * ib qp type used by create_qp()
+ */
+static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
+{
+ switch (ibqptype) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return ST_UD;
+ case IB_QPT_RC:
+ return ST_RC;
+ case IB_QPT_UC:
+ return ST_UC;
+ case IB_QPT_UD:
+ return ST_UD;
+ case IB_QPT_RAW_IPV6:
+ return -EINVAL;
+ case IB_QPT_RAW_ETHERTYPE:
+ return -EINVAL;
+ default:
+ ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+ return -EINVAL;
+ }
+}
+
+/*
+ * init userspace queue info from ipz_queue data
+ */
+static inline void queue2resp(struct ipzu_queue_resp *resp,
+ struct ipz_queue *queue)
+{
+ resp->qe_size = queue->qe_size;
+ resp->act_nr_of_sg = queue->act_nr_of_sg;
+ resp->queue_length = queue->queue_length;
+ resp->pagesize = queue->pagesize;
+ resp->toggle_state = queue->toggle_state;
+ resp->offset = queue->offset;
+}
+
+/*
+ * init_qp_queue initializes/constructs r/squeue and registers queue pages.
+ */
+static inline int init_qp_queue(struct ehca_shca *shca,
+ struct ehca_pd *pd,
+ struct ehca_qp *my_qp,
+ struct ipz_queue *queue,
+ int q_type,
+ u64 expected_hret,
+ struct ehca_alloc_queue_parms *parms,
+ int wqe_size)
+{
+ int ret, cnt, ipz_rc, nr_q_pages;
+ void *vpage;
+ u64 rpage, h_ret;
+ struct ib_device *ib_dev = &shca->ib_device;
+ struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
+
+ if (!parms->queue_size)
+ return 0;
+
+ if (parms->is_small) {
+ nr_q_pages = 1;
+ ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+ 128 << parms->page_size,
+ wqe_size, parms->act_nr_sges, 1);
+ } else {
+ nr_q_pages = parms->queue_size;
+ ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+ EHCA_PAGESIZE, wqe_size,
+ parms->act_nr_sges, 0);
+ }
+
+ if (!ipz_rc) {
+ ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
+ ipz_rc);
+ return -EBUSY;
+ }
+
+ /* register queue pages */
+ for (cnt = 0; cnt < nr_q_pages; cnt++) {
+ vpage = ipz_qpageit_get_inc(queue);
+ if (!vpage) {
+ ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+ "failed p_vpage= %p", vpage);
+ ret = -EINVAL;
+ goto init_qp_queue1;
+ }
+ rpage = __pa(vpage);
+
+ h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ NULL, 0, q_type,
+ rpage, parms->is_small ? 0 : 1,
+ my_qp->galpas.kernel);
+ if (cnt == (nr_q_pages - 1)) { /* last page! */
+ if (h_ret != expected_hret) {
+ ehca_err(ib_dev, "hipz_qp_register_rpage() "
+ "h_ret=%lli", h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto init_qp_queue1;
+ }
+ vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
+ if (vpage) {
+ ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+ "should not succeed vpage=%p", vpage);
+ ret = -EINVAL;
+ goto init_qp_queue1;
+ }
+ } else {
+ if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(ib_dev, "hipz_qp_register_rpage() "
+ "h_ret=%lli", h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto init_qp_queue1;
+ }
+ }
+ }
+
+ ipz_qeit_reset(queue);
+
+ return 0;
+
+init_qp_queue1:
+ ipz_queue_dtor(pd, queue);
+ return ret;
+}
+
+static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
+{
+ if (is_llqp)
+ return 128 << act_nr_sge;
+ else
+ return offsetof(struct ehca_wqe,
+ u.nud.sg_list[act_nr_sge]);
+}
+
+static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
+ int req_nr_sge, int is_llqp)
+{
+ u32 wqe_size, q_size;
+ int act_nr_sge = req_nr_sge;
+
+ if (!is_llqp)
+ /* round up #SGEs so WQE size is a power of 2 */
+ for (act_nr_sge = 4; act_nr_sge <= 252;
+ act_nr_sge = 4 + 2 * act_nr_sge)
+ if (act_nr_sge >= req_nr_sge)
+ break;
+
+ wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
+ q_size = wqe_size * (queue->max_wr + 1);
+
+ if (q_size <= 512)
+ queue->page_size = 2;
+ else if (q_size <= 1024)
+ queue->page_size = 3;
+ else
+ queue->page_size = 0;
+
+ queue->is_small = (queue->page_size != 0);
+}
+
+/* needs to be called with cq->spinlock held */
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
+{
+ struct list_head *list, *node;
+
+ /* TODO: support low latency QPs */
+ if (qp->ext_type == EQPT_LLQP)
+ return;
+
+ if (on_sq) {
+ list = &qp->send_cq->sqp_err_list;
+ node = &qp->sq_err_node;
+ } else {
+ list = &qp->recv_cq->rqp_err_list;
+ node = &qp->rq_err_node;
+ }
+
+ if (list_empty(node))
+ list_add_tail(node, list);
+
+ return;
+}
+
+static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+
+ if (!list_empty(node))
+ list_del_init(node);
+
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+}
+
+static void reset_queue_map(struct ehca_queue_map *qmap)
+{
+ int i;
+
+ qmap->tail = qmap->entries - 1;
+ qmap->left_to_poll = 0;
+ qmap->next_wqe_idx = 0;
+ for (i = 0; i < qmap->entries; i++) {
+ qmap->map[i].reported = 1;
+ qmap->map[i].cqe_req = 0;
+ }
+}
+
+/*
+ * Create an ib_qp struct that is either a QP or an SRQ, depending on
+ * the value of the is_srq parameter. If init_attr and srq_init_attr share
+ * fields, the field out of init_attr is used.
+ */
+static struct ehca_qp *internal_create_qp(
+ struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata, int is_srq)
+{
+ struct ehca_qp *my_qp, *my_srq = NULL;
+ struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+ struct ib_ucontext *context = NULL;
+ u64 h_ret;
+ int is_llqp = 0, has_srq = 0, is_user = 0;
+ int qp_type, max_send_sge, max_recv_sge, ret;
+
+ /* h_call's out parameters */
+ struct ehca_alloc_qp_parms parms;
+ u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
+ unsigned long flags;
+
+ if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
+ ehca_err(pd->device, "Unable to create QP, max number of %i "
+ "QPs reached.", shca->max_num_qps);
+ ehca_err(pd->device, "To increase the maximum number of QPs "
+ "use the number_of_qps module parameter.\n");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ if (init_attr->create_flags) {
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ qp_type = init_attr->qp_type;
+
+ if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
+ init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
+ ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
+ init_attr->sq_sig_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* save LLQP info */
+ if (qp_type & 0x80) {
+ is_llqp = 1;
+ parms.ext_type = EQPT_LLQP;
+ parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
+ }
+ qp_type &= 0x1F;
+ init_attr->qp_type &= 0x1F;
+
+ /* handle SRQ base QPs */
+ if (init_attr->srq) {
+ my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+
+ if (qp_type == IB_QPT_UC) {
+ ehca_err(pd->device, "UC with SRQ not supported");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ has_srq = 1;
+ parms.ext_type = EQPT_SRQBASE;
+ parms.srq_qpn = my_srq->real_qp_num;
+ }
+
+ if (is_llqp && has_srq) {
+ ehca_err(pd->device, "LLQPs can't have an SRQ");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* handle SRQs */
+ if (is_srq) {
+ parms.ext_type = EQPT_SRQ;
+ parms.srq_limit = srq_init_attr->attr.srq_limit;
+ if (init_attr->cap.max_recv_sge > 3) {
+ ehca_err(pd->device, "no more than three SGEs "
+ "supported for SRQ pd=%p max_sge=%x",
+ pd, init_attr->cap.max_recv_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ /* check QP type */
+ if (qp_type != IB_QPT_UD &&
+ qp_type != IB_QPT_UC &&
+ qp_type != IB_QPT_RC &&
+ qp_type != IB_QPT_SMI &&
+ qp_type != IB_QPT_GSI) {
+ ehca_err(pd->device, "wrong QP Type=%x", qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (is_llqp) {
+ switch (qp_type) {
+ case IB_QPT_RC:
+ if ((init_attr->cap.max_send_wr > 255) ||
+ (init_attr->cap.max_recv_wr > 255)) {
+ ehca_err(pd->device,
+ "Invalid Number of max_sq_wr=%x "
+ "or max_rq_wr=%x for RC LLQP",
+ init_attr->cap.max_send_wr,
+ init_attr->cap.max_recv_wr);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case IB_QPT_UD:
+ if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
+ ehca_err(pd->device, "UD LLQP not supported "
+ "by this adapter");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-ENOSYS);
+ }
+ if (!(init_attr->cap.max_send_sge <= 5
+ && init_attr->cap.max_send_sge >= 1
+ && init_attr->cap.max_recv_sge <= 5
+ && init_attr->cap.max_recv_sge >= 1)) {
+ ehca_err(pd->device,
+ "Invalid Number of max_send_sge=%x "
+ "or max_recv_sge=%x for UD LLQP",
+ init_attr->cap.max_send_sge,
+ init_attr->cap.max_recv_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ } else if (init_attr->cap.max_send_wr > 255) {
+ ehca_err(pd->device,
+ "Invalid Number of "
+ "max_send_wr=%x for UD QP_TYPE=%x",
+ init_attr->cap.max_send_wr, qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ default:
+ ehca_err(pd->device, "unsupported LL QP Type=%x",
+ qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+ || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+ if (init_attr->cap.max_send_sge > max_sge
+ || init_attr->cap.max_recv_sge > max_sge) {
+ ehca_err(pd->device, "Invalid number of SGEs requested "
+ "send_sge=%x recv_sge=%x max_sge=%x",
+ init_attr->cap.max_send_sge,
+ init_attr->cap.max_recv_sge, max_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+ if (!my_qp) {
+ ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (pd->uobject && udata) {
+ is_user = 1;
+ context = pd->uobject->context;
+ }
+
+ atomic_set(&my_qp->nr_events, 0);
+ init_waitqueue_head(&my_qp->wait_completion);
+ spin_lock_init(&my_qp->spinlock_s);
+ spin_lock_init(&my_qp->spinlock_r);
+ my_qp->qp_type = qp_type;
+ my_qp->ext_type = parms.ext_type;
+ my_qp->state = IB_QPS_RESET;
+
+ if (init_attr->recv_cq)
+ my_qp->recv_cq =
+ container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
+ if (init_attr->send_cq)
+ my_qp->send_cq =
+ container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
+
+ idr_preload(GFP_KERNEL);
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+
+ ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
+ if (ret >= 0)
+ my_qp->token = ret;
+
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+ idr_preload_end();
+ if (ret < 0) {
+ if (ret == -ENOSPC) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid number of qp");
+ } else {
+ ret = -ENOMEM;
+ ehca_err(pd->device, "Can't allocate new idr entry.");
+ }
+ goto create_qp_exit0;
+ }
+
+ if (has_srq)
+ parms.srq_token = my_qp->token;
+
+ parms.servicetype = ibqptype2servicetype(qp_type);
+ if (parms.servicetype < 0) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
+ goto create_qp_exit1;
+ }
+
+ /* Always signal by WQE so we can hide circ. WQEs */
+ parms.sigtype = HCALL_SIGT_BY_WQE;
+
+ /* UD_AV CIRCUMVENTION */
+ max_send_sge = init_attr->cap.max_send_sge;
+ max_recv_sge = init_attr->cap.max_recv_sge;
+ if (parms.servicetype == ST_UD && !is_llqp) {
+ max_send_sge += 2;
+ max_recv_sge += 2;
+ }
+
+ parms.token = my_qp->token;
+ parms.eq_handle = shca->eq.ipz_eq_handle;
+ parms.pd = my_pd->fw_pd;
+ if (my_qp->send_cq)
+ parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
+ if (my_qp->recv_cq)
+ parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
+
+ parms.squeue.max_wr = init_attr->cap.max_send_wr;
+ parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
+ parms.squeue.max_sge = max_send_sge;
+ parms.rqueue.max_sge = max_recv_sge;
+
+ /* RC QPs need one more SWQE for unsolicited ack circumvention */
+ if (qp_type == IB_QPT_RC)
+ parms.squeue.max_wr++;
+
+ if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
+ if (HAS_SQ(my_qp))
+ ehca_determine_small_queue(
+ &parms.squeue, max_send_sge, is_llqp);
+ if (HAS_RQ(my_qp))
+ ehca_determine_small_queue(
+ &parms.rqueue, max_recv_sge, is_llqp);
+ parms.qp_storage =
+ (parms.squeue.is_small || parms.rqueue.is_small);
+ }
+
+ h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+ h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto create_qp_exit1;
+ }
+
+ ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
+ my_qp->ipz_qp_handle = parms.qp_handle;
+ my_qp->galpas = parms.galpas;
+
+ swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
+ rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
+
+ switch (qp_type) {
+ case IB_QPT_RC:
+ if (is_llqp) {
+ parms.squeue.act_nr_sges = 1;
+ parms.rqueue.act_nr_sges = 1;
+ }
+ /* hide the extra WQE */
+ parms.squeue.act_nr_wqes--;
+ break;
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ case IB_QPT_SMI:
+ /* UD circumvention */
+ if (is_llqp) {
+ parms.squeue.act_nr_sges = 1;
+ parms.rqueue.act_nr_sges = 1;
+ } else {
+ parms.squeue.act_nr_sges -= 2;
+ parms.rqueue.act_nr_sges -= 2;
+ }
+
+ if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
+ parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
+ parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
+ parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
+ parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
+ ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+
+ /* initialize r/squeue and register queue pages */
+ if (HAS_SQ(my_qp)) {
+ ret = init_qp_queue(
+ shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
+ HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
+ &parms.squeue, swqe_size);
+ if (ret) {
+ ehca_err(pd->device, "Couldn't initialize squeue "
+ "and pages ret=%i", ret);
+ goto create_qp_exit2;
+ }
+
+ if (!is_user) {
+ my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+ my_qp->ipz_squeue.qe_size;
+ my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+ sizeof(struct ehca_qmap_entry));
+ if (!my_qp->sq_map.map) {
+ ehca_err(pd->device, "Couldn't allocate squeue "
+ "map ret=%i", ret);
+ goto create_qp_exit3;
+ }
+ INIT_LIST_HEAD(&my_qp->sq_err_node);
+ /* to avoid the generation of bogus flush CQEs */
+ reset_queue_map(&my_qp->sq_map);
+ }
+ }
+
+ if (HAS_RQ(my_qp)) {
+ ret = init_qp_queue(
+ shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
+ H_SUCCESS, &parms.rqueue, rwqe_size);
+ if (ret) {
+ ehca_err(pd->device, "Couldn't initialize rqueue "
+ "and pages ret=%i", ret);
+ goto create_qp_exit4;
+ }
+ if (!is_user) {
+ my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+ my_qp->ipz_rqueue.qe_size;
+ my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+ sizeof(struct ehca_qmap_entry));
+ if (!my_qp->rq_map.map) {
+ ehca_err(pd->device, "Couldn't allocate squeue "
+ "map ret=%i", ret);
+ goto create_qp_exit5;
+ }
+ INIT_LIST_HEAD(&my_qp->rq_err_node);
+ /* to avoid the generation of bogus flush CQEs */
+ reset_queue_map(&my_qp->rq_map);
+ }
+ } else if (init_attr->srq && !is_user) {
+ /* this is a base QP, use the queue map of the SRQ */
+ my_qp->rq_map = my_srq->rq_map;
+ INIT_LIST_HEAD(&my_qp->rq_err_node);
+
+ my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+ }
+
+ if (is_srq) {
+ my_qp->ib_srq.pd = &my_pd->ib_pd;
+ my_qp->ib_srq.device = my_pd->ib_pd.device;
+
+ my_qp->ib_srq.srq_context = init_attr->qp_context;
+ my_qp->ib_srq.event_handler = init_attr->event_handler;
+ } else {
+ my_qp->ib_qp.qp_num = ib_qp_num;
+ my_qp->ib_qp.pd = &my_pd->ib_pd;
+ my_qp->ib_qp.device = my_pd->ib_pd.device;
+
+ my_qp->ib_qp.recv_cq = init_attr->recv_cq;
+ my_qp->ib_qp.send_cq = init_attr->send_cq;
+
+ my_qp->ib_qp.qp_type = qp_type;
+ my_qp->ib_qp.srq = init_attr->srq;
+
+ my_qp->ib_qp.qp_context = init_attr->qp_context;
+ my_qp->ib_qp.event_handler = init_attr->event_handler;
+ }
+
+ init_attr->cap.max_inline_data = 0; /* not supported yet */
+ init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
+ init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
+ init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
+ init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
+ my_qp->init_attr = *init_attr;
+
+ if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+ shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+ &my_qp->ib_qp;
+ if (ehca_nr_ports < 0) {
+ /* alloc array to cache subsequent modify qp parms
+ * for autodetect mode
+ */
+ my_qp->mod_qp_parm =
+ kzalloc(EHCA_MOD_QP_PARM_MAX *
+ sizeof(*my_qp->mod_qp_parm),
+ GFP_KERNEL);
+ if (!my_qp->mod_qp_parm) {
+ ehca_err(pd->device,
+ "Could not alloc mod_qp_parm");
+ goto create_qp_exit5;
+ }
+ }
+ }
+
+ /* NOTE: define_apq0() not supported yet */
+ if (qp_type == IB_QPT_GSI) {
+ h_ret = ehca_define_sqp(shca, my_qp, init_attr);
+ if (h_ret != H_SUCCESS) {
+ kfree(my_qp->mod_qp_parm);
+ my_qp->mod_qp_parm = NULL;
+ /* the QP pointer is no longer valid */
+ shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+ NULL;
+ ret = ehca2ib_return_code(h_ret);
+ goto create_qp_exit6;
+ }
+ }
+
+ if (my_qp->send_cq) {
+ ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
+ if (ret) {
+ ehca_err(pd->device,
+ "Couldn't assign qp to send_cq ret=%i", ret);
+ goto create_qp_exit7;
+ }
+ }
+
+ /* copy queues, galpa data to user space */
+ if (context && udata) {
+ struct ehca_create_qp_resp resp;
+ memset(&resp, 0, sizeof(resp));
+
+ resp.qp_num = my_qp->real_qp_num;
+ resp.token = my_qp->token;
+ resp.qp_type = my_qp->qp_type;
+ resp.ext_type = my_qp->ext_type;
+ resp.qkey = my_qp->qkey;
+ resp.real_qp_num = my_qp->real_qp_num;
+
+ if (HAS_SQ(my_qp))
+ queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
+ if (HAS_RQ(my_qp))
+ queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+ resp.fw_handle_ofs = (u32)
+ (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
+
+ if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+ ehca_err(pd->device, "Copy to udata failed");
+ ret = -EINVAL;
+ goto create_qp_exit8;
+ }
+ }
+
+ return my_qp;
+
+create_qp_exit8:
+ ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit7:
+ kfree(my_qp->mod_qp_parm);
+
+create_qp_exit6:
+ if (HAS_RQ(my_qp) && !is_user)
+ vfree(my_qp->rq_map.map);
+
+create_qp_exit5:
+ if (HAS_RQ(my_qp))
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+create_qp_exit4:
+ if (HAS_SQ(my_qp) && !is_user)
+ vfree(my_qp->sq_map.map);
+
+create_qp_exit3:
+ if (HAS_SQ(my_qp))
+ ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+
+create_qp_exit2:
+ hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+
+create_qp_exit1:
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+ idr_remove(&ehca_qp_idr, my_qp->token);
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+create_qp_exit0:
+ kmem_cache_free(qp_cache, my_qp);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(ret);
+}
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata)
+{
+ struct ehca_qp *ret;
+
+ ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
+ return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ struct ib_uobject *uobject);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ struct ehca_qp *my_qp;
+ struct ib_srq *ret;
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+ struct hcp_modify_qp_control_block *mqpcb;
+ u64 hret, update_mask;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+ return ERR_PTR(-ENOSYS);
+
+ /* For common attributes, internal_create_qp() takes its info
+ * out of qp_init_attr, so copy all common attrs there.
+ */
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ qp_init_attr.event_handler = srq_init_attr->event_handler;
+ qp_init_attr.qp_context = srq_init_attr->srq_context;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.qp_type = IB_QPT_RC;
+ qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
+ qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
+
+ my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
+ if (IS_ERR(my_qp))
+ return (struct ib_srq *)my_qp;
+
+ /* copy back return values */
+ srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
+ srq_init_attr->attr.max_sge = 3;
+
+ /* drive SRQ into RTR state */
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!mqpcb) {
+ ehca_err(pd->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+ ret = ERR_PTR(-ENOMEM);
+ goto create_srq1;
+ }
+
+ mqpcb->qp_state = EHCA_QPS_INIT;
+ mqpcb->prim_phys_port = 1;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not modify SRQ to INIT "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ mqpcb->qp_enable = 1;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not enable SRQ "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ mqpcb->qp_state = EHCA_QPS_RTR;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not modify SRQ to RTR "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return &my_qp->ib_srq;
+
+create_srq2:
+ ret = ERR_PTR(ehca2ib_return_code(hret));
+ ehca_free_fw_ctrlblock(mqpcb);
+
+create_srq1:
+ internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
+
+ return ret;
+}
+
+/*
+ * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts
+ * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe
+ * returns total number of bad wqes in bad_wqe_cnt
+ */
+static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
+ int *bad_wqe_cnt)
+{
+ u64 h_ret;
+ struct ipz_queue *squeue;
+ void *bad_send_wqe_p, *bad_send_wqe_v;
+ u64 q_ofs;
+ struct ehca_wqe *wqe;
+ int qp_num = my_qp->ib_qp.qp_num;
+
+ /* get send wqe pointer */
+ h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle, &my_qp->pf,
+ &bad_send_wqe_p, NULL, 2);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
+ " ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp_num, h_ret);
+ return ehca2ib_return_code(h_ret);
+ }
+ bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
+ ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
+ qp_num, bad_send_wqe_p);
+ /* convert wqe pointer to vadr */
+ bad_send_wqe_v = __va((u64)bad_send_wqe_p);
+ if (ehca_debug_level >= 2)
+ ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
+ squeue = &my_qp->ipz_squeue;
+ if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
+ ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
+ " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
+ return -EFAULT;
+ }
+
+ /* loop sets wqe's purge bit */
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+ *bad_wqe_cnt = 0;
+ while (wqe->optype != 0xff && wqe->wqef != 0xff) {
+ if (ehca_debug_level >= 2)
+ ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
+ wqe->nr_of_data_seg = 0; /* suppress data access */
+ wqe->wqef = WQEF_PURGE; /* WQE to be purged */
+ q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+ *bad_wqe_cnt = (*bad_wqe_cnt)+1;
+ }
+ /*
+ * bad wqe will be reprocessed and ignored when pol_cq() is called,
+ * i.e. nr of wqes with flush error status is one less
+ */
+ ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
+ qp_num, (*bad_wqe_cnt)-1);
+ wqe->wqef = 0;
+
+ return 0;
+}
+
+static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
+ struct ehca_queue_map *qmap)
+{
+ void *wqe_v;
+ u64 q_ofs;
+ u32 wqe_idx;
+ unsigned int tail_idx;
+
+ /* convert real to abs address */
+ wqe_p = wqe_p & (~(1UL << 63));
+
+ wqe_v = __va(wqe_p);
+
+ if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
+ ehca_gen_err("Invalid offset for calculating left cqes "
+ "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
+ return -EFAULT;
+ }
+
+ tail_idx = next_index(qmap->tail, qmap->entries);
+ wqe_idx = q_ofs / ipz_queue->qe_size;
+
+ /* check all processed wqes, whether a cqe is requested or not */
+ while (tail_idx != wqe_idx) {
+ if (qmap->map[tail_idx].cqe_req)
+ qmap->left_to_poll++;
+ tail_idx = next_index(tail_idx, qmap->entries);
+ }
+ /* save index in queue, where we have to start flushing */
+ qmap->next_wqe_idx = wqe_idx;
+ return 0;
+}
+
+static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
+{
+ u64 h_ret;
+ void *send_wqe_p, *recv_wqe_p;
+ int ret;
+ unsigned long flags;
+ int qp_num = my_qp->ib_qp.qp_num;
+
+ /* this hcall is not supported on base QPs */
+ if (my_qp->ext_type != EQPT_SRQBASE) {
+ /* get send and receive wqe pointer */
+ h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle, &my_qp->pf,
+ &send_wqe_p, &recv_wqe_p, 4);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "disable_and_get_wqe() "
+ "failed ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp_num, h_ret);
+ return ehca2ib_return_code(h_ret);
+ }
+
+ /*
+ * acquire lock to ensure that nobody is polling the cq which
+ * could mean that the qmap->tail pointer is in an
+ * inconsistent state.
+ */
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
+ &my_qp->sq_map);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+ if (ret)
+ return ret;
+
+
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
+ &my_qp->rq_map);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+ if (ret)
+ return ret;
+ } else {
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ my_qp->sq_map.left_to_poll = 0;
+ my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+ my_qp->sq_map.entries);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ my_qp->rq_map.left_to_poll = 0;
+ my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+ my_qp->rq_map.entries);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+ }
+
+ /* this assures flush cqes being generated only for pending wqes */
+ if ((my_qp->sq_map.left_to_poll == 0) &&
+ (my_qp->rq_map.left_to_poll == 0)) {
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ ehca_add_to_err_list(my_qp, 1);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+ if (HAS_RQ(my_qp)) {
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ ehca_add_to_err_list(my_qp, 0);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
+ flags);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * internal_modify_qp with circumvention to handle aqp0 properly
+ * smi_reset2init indicates if this is an internal reset-to-init-call for
+ * smi. This flag must always be zero if called from ehca_modify_qp()!
+ * This internal func was intorduced to avoid recursion of ehca_modify_qp()!
+ */
+static int internal_modify_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *attr,
+ int attr_mask, int smi_reset2init)
+{
+ enum ib_qp_state qp_cur_state, qp_new_state;
+ int cnt, qp_attr_idx, ret = 0;
+ enum ib_qp_statetrans statetrans;
+ struct hcp_modify_qp_control_block *mqpcb;
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca =
+ container_of(ibqp->pd->device, struct ehca_shca, ib_device);
+ u64 update_mask;
+ u64 h_ret;
+ int bad_wqe_cnt = 0;
+ int is_user = 0;
+ int squeue_locked = 0;
+ unsigned long flags = 0;
+
+ /* do query_qp to obtain current attr values */
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!mqpcb) {
+ ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ mqpcb, my_qp->galpas.kernel);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(ibqp->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, ibqp->qp_num, h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto modify_qp_exit1;
+ }
+ if (ibqp->uobject)
+ is_user = 1;
+
+ qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+ if (qp_cur_state == -EINVAL) { /* invalid qp state */
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
+ "ehca_qp=%p qp_num=%x",
+ mqpcb->qp_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+ /*
+ * circumvention to set aqp0 initial state to init
+ * as expected by IB spec
+ */
+ if (smi_reset2init == 0 &&
+ ibqp->qp_type == IB_QPT_SMI &&
+ qp_cur_state == IB_QPS_RESET &&
+ (attr_mask & IB_QP_STATE) &&
+ attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
+ struct ib_qp_attr smiqp_attr = {
+ .qp_state = IB_QPS_INIT,
+ .port_num = my_qp->init_attr.port_num,
+ .pkey_index = 0,
+ .qkey = 0
+ };
+ int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY;
+ int smirc = internal_modify_qp(
+ ibqp, &smiqp_attr, smiqp_attr_mask, 1);
+ if (smirc) {
+ ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
+ "ehca_modify_qp() rc=%i", smirc);
+ ret = H_PARAMETER;
+ goto modify_qp_exit1;
+ }
+ qp_cur_state = IB_QPS_INIT;
+ ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
+ }
+ /* is transmitted current state equal to "real" current state */
+ if ((attr_mask & IB_QP_CUR_STATE) &&
+ qp_cur_state != attr->cur_qp_state) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device,
+ "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
+ " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
+ attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
+ "new qp_state=%x attribute_mask=%x",
+ my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
+
+ qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
+ if (!smi_reset2init &&
+ !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device,
+ "Invalid qp transition new_state=%x cur_state=%x "
+ "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
+ qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
+ goto modify_qp_exit1;
+ }
+
+ mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
+ if (mqpcb->qp_state)
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ else {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid new qp state=%x "
+ "ehca_qp=%p qp_num=%x",
+ qp_new_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ /* retrieve state transition struct to get req and opt attrs */
+ statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
+ if (statetrans < 0) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
+ "new_qp_state=%x State_xsition=%x ehca_qp=%p "
+ "qp_num=%x", qp_cur_state, qp_new_state,
+ statetrans, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
+
+ if (qp_attr_idx < 0) {
+ ret = qp_attr_idx;
+ ehca_err(ibqp->device,
+ "Invalid QP type=%x ehca_qp=%p qp_num=%x",
+ ibqp->qp_type, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ ehca_dbg(ibqp->device,
+ "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
+ my_qp, ibqp->qp_num, statetrans);
+
+ /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
+ * in non-LL UD QPs.
+ */
+ if ((my_qp->qp_type == IB_QPT_UD) &&
+ (my_qp->ext_type != EQPT_LLQP) &&
+ (statetrans == IB_QPST_INIT2RTR) &&
+ (shca->hw_level >= 0x22)) {
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+ mqpcb->send_grh_flag = 1;
+ }
+
+ /* sqe -> rts: set purge bit of bad wqe before actual trans */
+ if ((my_qp->qp_type == IB_QPT_UD ||
+ my_qp->qp_type == IB_QPT_GSI ||
+ my_qp->qp_type == IB_QPT_SMI) &&
+ statetrans == IB_QPST_SQE2RTS) {
+ /* mark next free wqe if kernel */
+ if (!ibqp->uobject) {
+ struct ehca_wqe *wqe;
+ /* lock send queue */
+ spin_lock_irqsave(&my_qp->spinlock_s, flags);
+ squeue_locked = 1;
+ /* mark next free wqe */
+ wqe = (struct ehca_wqe *)
+ ipz_qeit_get(&my_qp->ipz_squeue);
+ wqe->optype = wqe->wqef = 0xff;
+ ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
+ ibqp->qp_num, wqe);
+ }
+ ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
+ if (ret) {
+ ehca_err(ibqp->device, "prepare_sqe_rts() failed "
+ "ehca_qp=%p qp_num=%x ret=%i",
+ my_qp, ibqp->qp_num, ret);
+ goto modify_qp_exit2;
+ }
+ }
+
+ /*
+ * enable RDMA_Atomic_Control if reset->init und reliable con
+ * this is necessary since gen2 does not provide that flag,
+ * but pHyp requires it
+ */
+ if (statetrans == IB_QPST_RESET2INIT &&
+ (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
+ mqpcb->rdma_atomic_ctrl = 3;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
+ }
+ /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
+ if (statetrans == IB_QPST_INIT2RTR &&
+ (ibqp->qp_type == IB_QPT_UC) &&
+ !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
+ mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ if (attr->pkey_index >= 16) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+ "ehca_qp=%p qp_num=%x max_pkey_index=f",
+ attr->pkey_index, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->prim_p_key_idx = attr->pkey_index;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
+ }
+ if (attr_mask & IB_QP_PORT) {
+ struct ehca_sport *sport;
+ struct ehca_qp *aqp1;
+ if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid port=%x. "
+ "ehca_qp=%p qp_num=%x num_ports=%x",
+ attr->port_num, my_qp, ibqp->qp_num,
+ shca->num_ports);
+ goto modify_qp_exit2;
+ }
+ sport = &shca->sport[attr->port_num - 1];
+ if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+ /* should not occur */
+ ret = -EFAULT;
+ ehca_err(ibqp->device, "AQP1 was not created for "
+ "port=%x", attr->port_num);
+ goto modify_qp_exit2;
+ }
+ aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+ struct ehca_qp, ib_qp);
+ if (ibqp->qp_type != IB_QPT_GSI &&
+ ibqp->qp_type != IB_QPT_SMI &&
+ aqp1->mod_qp_parm) {
+ /*
+ * firmware will reject this modify_qp() because
+ * port is not activated/initialized fully
+ */
+ ret = -EFAULT;
+ ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+ "either port is being activated (try again) "
+ "or cabling issue", attr->port_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->prim_phys_port = attr->port_num;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
+ }
+ if (attr_mask & IB_QP_QKEY) {
+ mqpcb->qkey = attr->qkey;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
+ }
+ if (attr_mask & IB_QP_AV) {
+ mqpcb->dlid = attr->ah_attr.dlid;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
+ mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
+ mqpcb->service_level = attr->ah_attr.sl;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
+
+ if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
+ attr->ah_attr.static_rate,
+ &mqpcb->max_static_rate)) {
+ ret = -EINVAL;
+ goto modify_qp_exit2;
+ }
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
+
+ /*
+ * Always supply the GRH flag, even if it's zero, to give the
+ * hypervisor a clear "yes" or "no" instead of a "perhaps"
+ */
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+ /*
+ * only if GRH is TRUE we might consider SOURCE_GID_IDX
+ * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+ */
+ if (attr->ah_attr.ah_flags == IB_AH_GRH) {
+ mqpcb->send_grh_flag = 1;
+
+ mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
+
+ for (cnt = 0; cnt < 16; cnt++)
+ mqpcb->dest_gid.byte[cnt] =
+ attr->ah_attr.grh.dgid.raw[cnt];
+
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
+ mqpcb->flow_label = attr->ah_attr.grh.flow_label;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
+ mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
+ mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
+ }
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU) {
+ /* store ld(MTU) */
+ my_qp->mtu_shift = attr->path_mtu + 7;
+ mqpcb->path_mtu = attr->path_mtu;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
+ }
+ if (attr_mask & IB_QP_TIMEOUT) {
+ mqpcb->timeout = attr->timeout;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
+ }
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ mqpcb->retry_count = attr->retry_cnt;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
+ }
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ mqpcb->rnr_retry_count = attr->rnr_retry;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
+ }
+ if (attr_mask & IB_QP_RQ_PSN) {
+ mqpcb->receive_psn = attr->rq_psn;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
+ }
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
+ attr->max_dest_rd_atomic : 2;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+ }
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
+ attr->max_rd_atomic : 2;
+ update_mask |=
+ EHCA_BMASK_SET
+ (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
+ }
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num < 1
+ || attr->alt_port_num > shca->num_ports) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid alt_port=%x. "
+ "ehca_qp=%p qp_num=%x num_ports=%x",
+ attr->alt_port_num, my_qp, ibqp->qp_num,
+ shca->num_ports);
+ goto modify_qp_exit2;
+ }
+ mqpcb->alt_phys_port = attr->alt_port_num;
+
+ if (attr->alt_pkey_index >= 16) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+ "ehca_qp=%p qp_num=%x max_pkey_index=f",
+ attr->pkey_index, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+ mqpcb->timeout_al = attr->alt_timeout;
+ mqpcb->dlid_al = attr->alt_ah_attr.dlid;
+ mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
+ mqpcb->service_level_al = attr->alt_ah_attr.sl;
+
+ if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
+ attr->alt_ah_attr.static_rate,
+ &mqpcb->max_static_rate_al)) {
+ ret = -EINVAL;
+ goto modify_qp_exit2;
+ }
+
+ /* OpenIB doesn't support alternate retry counts - copy them */
+ mqpcb->retry_count_al = mqpcb->retry_count;
+ mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+ /*
+ * Always supply the GRH flag, even if it's zero, to give the
+ * hypervisor a clear "yes" or "no" instead of a "perhaps"
+ */
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
+
+ /*
+ * only if GRH is TRUE we might consider SOURCE_GID_IDX
+ * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+ */
+ if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
+ mqpcb->send_grh_flag_al = 1;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ mqpcb->dest_gid_al.byte[cnt] =
+ attr->alt_ah_attr.grh.dgid.raw[cnt];
+ mqpcb->source_gid_idx_al =
+ attr->alt_ah_attr.grh.sgid_index;
+ mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
+ mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
+ mqpcb->traffic_class_al =
+ attr->alt_ah_attr.grh.traffic_class;
+
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
+ EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
+ }
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ mqpcb->send_psn = attr->sq_psn;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
+ }
+
+ if (attr_mask & IB_QP_DEST_QPN) {
+ mqpcb->dest_qp_nr = attr->dest_qp_num;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ if (attr->path_mig_state != IB_MIG_REARM
+ && attr->path_mig_state != IB_MIG_MIGRATED) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid mig_state=%x",
+ attr->path_mig_state);
+ goto modify_qp_exit2;
+ }
+ mqpcb->path_migration_state = attr->path_mig_state + 1;
+ if (attr->path_mig_state == IB_MIG_REARM)
+ my_qp->mig_armed = 1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
+ }
+
+ if (attr_mask & IB_QP_CAP) {
+ mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
+ mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
+ /* no support for max_send/recv_sge yet */
+ }
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+
+ if ((my_qp->qp_type == IB_QPT_UD ||
+ my_qp->qp_type == IB_QPT_GSI ||
+ my_qp->qp_type == IB_QPT_SMI) &&
+ statetrans == IB_QPST_SQE2RTS) {
+ /* doorbell to reprocessing wqes */
+ iosync(); /* serialize GAL register access */
+ hipz_update_sqa(my_qp, bad_wqe_cnt-1);
+ ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
+ }
+
+ if (statetrans == IB_QPST_RESET2INIT ||
+ statetrans == IB_QPST_INIT2INIT) {
+ mqpcb->qp_enable = 1;
+ mqpcb->qp_state = EHCA_QPS_INIT;
+ update_mask = 0;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb,
+ my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibqp->device, "ENABLE in context of "
+ "RESET_2_INIT failed! Maybe you didn't get "
+ "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
+ h_ret, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ }
+ if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
+ && !is_user) {
+ ret = check_for_left_cqes(my_qp, shca);
+ if (ret)
+ goto modify_qp_exit2;
+ }
+
+ if (statetrans == IB_QPST_ANY2RESET) {
+ ipz_qeit_reset(&my_qp->ipz_rqueue);
+ ipz_qeit_reset(&my_qp->ipz_squeue);
+
+ if (qp_cur_state == IB_QPS_ERR && !is_user) {
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ if (HAS_RQ(my_qp))
+ del_from_err_list(my_qp->recv_cq,
+ &my_qp->rq_err_node);
+ }
+ if (!is_user)
+ reset_queue_map(&my_qp->sq_map);
+
+ if (HAS_RQ(my_qp) && !is_user)
+ reset_queue_map(&my_qp->rq_map);
+ }
+
+ if (attr_mask & IB_QP_QKEY)
+ my_qp->qkey = attr->qkey;
+
+modify_qp_exit2:
+ if (squeue_locked) { /* this means: sqe -> rts */
+ spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+ my_qp->sqerr_purgeflag = 1;
+ }
+
+modify_qp_exit1:
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return ret;
+}
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ int ret = 0;
+
+ struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+ ib_device);
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+
+ /* The if-block below caches qp_attr to be modified for GSI and SMI
+ * qps during the initialization by ib_mad. When the respective port
+ * is activated, ie we got an event PORT_ACTIVE, we'll replay the
+ * cached modify calls sequence, see ehca_recover_sqs() below.
+ * Why that is required:
+ * 1) If one port is connected, older code requires that port one
+ * to be connected and module option nr_ports=1 to be given by
+ * user, which is very inconvenient for end user.
+ * 2) Firmware accepts modify_qp() only if respective port has become
+ * active. Older code had a wait loop of 30sec create_qp()/
+ * define_aqp1(), which is not appropriate in practice. This
+ * code now removes that wait loop, see define_aqp1(), and always
+ * reports all ports to ib_mad resp. users. Only activated ports
+ * will then usable for the users.
+ */
+ if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+ int port = my_qp->init_attr.port_num;
+ struct ehca_sport *sport = &shca->sport[port - 1];
+ unsigned long flags;
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+ /* cache qp_attr only during init */
+ if (my_qp->mod_qp_parm) {
+ struct ehca_mod_qp_parm *p;
+ if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+ ehca_err(&shca->ib_device,
+ "mod_qp_parm overflow state=%x port=%x"
+ " type=%x", attr->qp_state,
+ my_qp->init_attr.port_num,
+ ibqp->qp_type);
+ spin_unlock_irqrestore(&sport->mod_sqp_lock,
+ flags);
+ return -EINVAL;
+ }
+ p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+ p->mask = attr_mask;
+ p->attr = *attr;
+ my_qp->mod_qp_parm_idx++;
+ ehca_dbg(&shca->ib_device,
+ "Saved qp_attr for state=%x port=%x type=%x",
+ attr->qp_state, my_qp->init_attr.port_num,
+ ibqp->qp_type);
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ goto out;
+ }
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ }
+
+ ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
+
+out:
+ if ((ret == 0) && (attr_mask & IB_QP_STATE))
+ my_qp->state = attr->qp_state;
+
+ return ret;
+}
+
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+ struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+ int port = my_sqp->init_attr.port_num;
+ struct ib_qp_attr attr;
+ struct ehca_mod_qp_parm *qp_parm;
+ int i, qp_parm_idx, ret;
+ unsigned long flags, wr_cnt;
+
+ if (!my_sqp->mod_qp_parm)
+ return;
+ ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+ qp_parm = my_sqp->mod_qp_parm;
+ qp_parm_idx = my_sqp->mod_qp_parm_idx;
+ for (i = 0; i < qp_parm_idx; i++) {
+ attr = qp_parm[i].attr;
+ ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+ if (ret) {
+ ehca_err(sqp->device, "Could not modify SQP port=%x "
+ "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+ goto free_qp_parm;
+ }
+ ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+ port, sqp->qp_num, attr.qp_state);
+ }
+
+ /* re-trigger posted recv wrs */
+ wr_cnt = my_sqp->ipz_rqueue.current_q_offset /
+ my_sqp->ipz_rqueue.qe_size;
+ if (wr_cnt) {
+ spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+ hipz_update_rqa(my_sqp, wr_cnt);
+ spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+ ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+ port, sqp->qp_num, wr_cnt);
+ }
+
+free_qp_parm:
+ kfree(qp_parm);
+ /* this prevents subsequent calls to modify_qp() to cache qp_attr */
+ my_sqp->mod_qp_parm = NULL;
+}
+
+int ehca_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ struct hcp_modify_qp_control_block *qpcb;
+ int cnt, ret = 0;
+ u64 h_ret;
+
+ if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
+ ehca_err(qp->device, "Invalid attribute mask "
+ "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
+ my_qp, qp->qp_num, qp_attr_mask);
+ return -EINVAL;
+ }
+
+ qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!qpcb) {
+ ehca_err(qp->device, "Out of memory for qpcb "
+ "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(adapter_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ qpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(qp->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp->qp_num, h_ret);
+ goto query_qp_exit1;
+ }
+
+ qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
+ qp_attr->qp_state = qp_attr->cur_qp_state;
+
+ if (qp_attr->cur_qp_state == -EINVAL) {
+ ret = -EINVAL;
+ ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
+ "ehca_qp=%p qp_num=%x",
+ qpcb->qp_state, my_qp, qp->qp_num);
+ goto query_qp_exit1;
+ }
+
+ if (qp_attr->qp_state == IB_QPS_SQD)
+ qp_attr->sq_draining = 1;
+
+ qp_attr->qkey = qpcb->qkey;
+ qp_attr->path_mtu = qpcb->path_mtu;
+ qp_attr->path_mig_state = qpcb->path_migration_state - 1;
+ qp_attr->rq_psn = qpcb->receive_psn;
+ qp_attr->sq_psn = qpcb->send_psn;
+ qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
+ qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
+ qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
+ /* UD_AV CIRCUMVENTION */
+ if (my_qp->qp_type == IB_QPT_UD) {
+ qp_attr->cap.max_send_sge =
+ qpcb->actual_nr_sges_in_sq_wqe - 2;
+ qp_attr->cap.max_recv_sge =
+ qpcb->actual_nr_sges_in_rq_wqe - 2;
+ } else {
+ qp_attr->cap.max_send_sge =
+ qpcb->actual_nr_sges_in_sq_wqe;
+ qp_attr->cap.max_recv_sge =
+ qpcb->actual_nr_sges_in_rq_wqe;
+ }
+
+ qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+ qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+ qp_attr->pkey_index = qpcb->prim_p_key_idx;
+ qp_attr->port_num = qpcb->prim_phys_port;
+ qp_attr->timeout = qpcb->timeout;
+ qp_attr->retry_cnt = qpcb->retry_count;
+ qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+ qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+ qp_attr->alt_port_num = qpcb->alt_phys_port;
+ qp_attr->alt_timeout = qpcb->timeout_al;
+
+ qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
+ qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
+
+ /* primary av */
+ qp_attr->ah_attr.sl = qpcb->service_level;
+
+ if (qpcb->send_grh_flag) {
+ qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+ }
+
+ qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
+ qp_attr->ah_attr.dlid = qpcb->dlid;
+ qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
+ qp_attr->ah_attr.port_num = qp_attr->port_num;
+
+ /* primary GRH */
+ qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
+ qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
+ qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
+ qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ qp_attr->ah_attr.grh.dgid.raw[cnt] =
+ qpcb->dest_gid.byte[cnt];
+
+ /* alternate AV */
+ qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
+ if (qpcb->send_grh_flag_al) {
+ qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
+ }
+
+ qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
+ qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
+ qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
+
+ /* alternate GRH */
+ qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
+ qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
+ qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
+ qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
+ qpcb->dest_gid_al.byte[cnt];
+
+ /* return init attributes given in ehca_create_qp */
+ if (qp_init_attr)
+ *qp_init_attr = my_qp->init_attr;
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
+
+query_qp_exit1:
+ ehca_free_fw_ctrlblock(qpcb);
+
+ return ret;
+}
+
+int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct ehca_qp *my_qp =
+ container_of(ibsrq, struct ehca_qp, ib_srq);
+ struct ehca_shca *shca =
+ container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
+ struct hcp_modify_qp_control_block *mqpcb;
+ u64 update_mask;
+ u64 h_ret;
+ int ret = 0;
+
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!mqpcb) {
+ ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+ return -ENOMEM;
+ }
+
+ update_mask = 0;
+ if (attr_mask & IB_SRQ_LIMIT) {
+ attr_mask &= ~IB_SRQ_LIMIT;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+ mqpcb->curr_srq_limit = attr->srq_limit;
+ mqpcb->qp_aff_asyn_ev_log_reg =
+ EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+ }
+
+ /* by now, all bits in attr_mask should have been cleared */
+ if (attr_mask) {
+ ehca_err(ibsrq->device, "invalid attribute mask bits set "
+ "attr_mask=%x", attr_mask);
+ ret = -EINVAL;
+ goto modify_srq_exit0;
+ }
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
+ NULL, update_mask, mqpcb,
+ my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x",
+ h_ret, my_qp, my_qp->real_qp_num);
+ }
+
+modify_srq_exit0:
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return ret;
+}
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+ struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
+ struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ struct hcp_modify_qp_control_block *qpcb;
+ int ret = 0;
+ u64 h_ret;
+
+ qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!qpcb) {
+ ehca_err(srq->device, "Out of memory for qpcb "
+ "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
+ NULL, qpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(srq->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, h_ret);
+ goto query_srq_exit1;
+ }
+
+ srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+ srq_attr->max_sge = 3;
+ srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+query_srq_exit1:
+ ehca_free_fw_ctrlblock(qpcb);
+
+ return ret;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ struct ib_uobject *uobject)
+{
+ struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
+ struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
+ ib_pd);
+ struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
+ u32 qp_num = my_qp->real_qp_num;
+ int ret;
+ u64 h_ret;
+ u8 port_num;
+ int is_user = 0;
+ enum ib_qp_type qp_type;
+ unsigned long flags;
+
+ if (uobject) {
+ is_user = 1;
+ if (my_qp->mm_count_galpa ||
+ my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+ ehca_err(dev, "Resources still referenced in "
+ "user space qp_num=%x", qp_num);
+ return -EINVAL;
+ }
+ }
+
+ if (my_qp->send_cq) {
+ ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
+ if (ret) {
+ ehca_err(dev, "Couldn't unassign qp from "
+ "send_cq ret=%i qp_num=%x cq_num=%x", ret,
+ qp_num, my_qp->send_cq->cq_number);
+ return ret;
+ }
+ }
+
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+ idr_remove(&ehca_qp_idr, my_qp->token);
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+ /*
+ * SRQs will never get into an error list and do not have a recv_cq,
+ * so we need to skip them here.
+ */
+ if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+ if (HAS_SQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ /* now wait until all pending events have completed */
+ wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+
+ h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
+ return ehca2ib_return_code(h_ret);
+ }
+
+ port_num = my_qp->init_attr.port_num;
+ qp_type = my_qp->init_attr.qp_type;
+
+ if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+ kfree(my_qp->mod_qp_parm);
+ my_qp->mod_qp_parm = NULL;
+ shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ }
+
+ /* no support for IB_QPT_SMI yet */
+ if (qp_type == IB_QPT_GSI) {
+ struct ib_event event;
+ ehca_info(dev, "device %s: port %x is inactive.",
+ shca->ib_device.name, port_num);
+ event.device = &shca->ib_device;
+ event.event = IB_EVENT_PORT_ERR;
+ event.element.port_num = port_num;
+ shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
+ ib_dispatch_event(&event);
+ }
+
+ if (HAS_RQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+ if (!is_user)
+ vfree(my_qp->rq_map.map);
+ }
+ if (HAS_SQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+ if (!is_user)
+ vfree(my_qp->sq_map.map);
+ }
+ kmem_cache_free(qp_cache, my_qp);
+ atomic_dec(&shca->num_qps);
+ return 0;
+}
+
+int ehca_destroy_qp(struct ib_qp *qp)
+{
+ return internal_destroy_qp(qp->device,
+ container_of(qp, struct ehca_qp, ib_qp),
+ qp->uobject);
+}
+
+int ehca_destroy_srq(struct ib_srq *srq)
+{
+ return internal_destroy_qp(srq->device,
+ container_of(srq, struct ehca_qp, ib_srq),
+ srq->uobject);
+}
+
+int ehca_init_qp_cache(void)
+{
+ qp_cache = kmem_cache_create("ehca_cache_qp",
+ sizeof(struct ehca_qp), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!qp_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_qp_cache(void)
+{
+ if (qp_cache)
+ kmem_cache_destroy(qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c
new file mode 100644
index 000000000000..47f94984353d
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_reqs.c
@@ -0,0 +1,953 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * post_send/recv, poll_cq, req_notify
+ *
+ * Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
+static u64 replace_wr_id(u64 wr_id, u16 idx)
+{
+ u64 ret;
+
+ ret = wr_id & ~QMAP_IDX_MASK;
+ ret |= idx & QMAP_IDX_MASK;
+
+ return ret;
+}
+
+static u16 get_app_wr_id(u64 wr_id)
+{
+ return wr_id & QMAP_IDX_MASK;
+}
+
+static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+ struct ehca_wqe *wqe_p,
+ struct ib_recv_wr *recv_wr,
+ u32 rq_map_idx)
+{
+ u8 cnt_ds;
+ if (unlikely((recv_wr->num_sge < 0) ||
+ (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
+ ehca_gen_err("Invalid number of WQE SGE. "
+ "num_sqe=%x max_nr_of_sg=%x",
+ recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
+ return -EINVAL; /* invalid SG list length */
+ }
+
+ /* clear wqe header until sglist */
+ memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+ wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+ wqe_p->nr_of_data_seg = recv_wr->num_sge;
+
+ for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+ wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
+ recv_wr->sg_list[cnt_ds].addr;
+ wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
+ recv_wr->sg_list[cnt_ds].lkey;
+ wqe_p->u.all_rcv.sg_list[cnt_ds].length =
+ recv_wr->sg_list[cnt_ds].length;
+ }
+
+ if (ehca_debug_level >= 3) {
+ ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
+ ipz_rqueue);
+ ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+ }
+
+ return 0;
+}
+
+#if defined(DEBUG_GSI_SEND_WR)
+
+/* need ib_mad struct */
+#include <rdma/ib_mad.h>
+
+static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
+{
+ int idx;
+ int j;
+ while (send_wr) {
+ struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
+ struct ib_sge *sge = send_wr->sg_list;
+ ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
+ "send_flags=%x opcode=%x", idx, send_wr->wr_id,
+ send_wr->num_sge, send_wr->send_flags,
+ send_wr->opcode);
+ if (mad_hdr) {
+ ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
+ "mgmt_class=%x class_version=%x method=%x "
+ "status=%x class_specific=%x tid=%lx "
+ "attr_id=%x resv=%x attr_mod=%x",
+ idx, mad_hdr->base_version,
+ mad_hdr->mgmt_class,
+ mad_hdr->class_version, mad_hdr->method,
+ mad_hdr->status, mad_hdr->class_specific,
+ mad_hdr->tid, mad_hdr->attr_id,
+ mad_hdr->resv,
+ mad_hdr->attr_mod);
+ }
+ for (j = 0; j < send_wr->num_sge; j++) {
+ u8 *data = __va(sge->addr);
+ ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
+ "lkey=%x",
+ idx, j, data, sge->length, sge->lkey);
+ /* assume length is n*16 */
+ ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
+ idx, j);
+ sge++;
+ } /* eof for j */
+ idx++;
+ send_wr = send_wr->next;
+ } /* eof while send_wr */
+}
+
+#endif /* DEBUG_GSI_SEND_WR */
+
+static inline int ehca_write_swqe(struct ehca_qp *qp,
+ struct ehca_wqe *wqe_p,
+ const struct ib_send_wr *send_wr,
+ u32 sq_map_idx,
+ int hidden)
+{
+ u32 idx;
+ u64 dma_length;
+ struct ehca_av *my_av;
+ u32 remote_qkey = send_wr->wr.ud.remote_qkey;
+ struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+
+ if (unlikely((send_wr->num_sge < 0) ||
+ (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+ ehca_gen_err("Invalid number of WQE SGE. "
+ "num_sqe=%x max_nr_of_sg=%x",
+ send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
+ return -EINVAL; /* invalid SG list length */
+ }
+
+ /* clear wqe header until sglist */
+ memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+ wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+
+ qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
+ qmap_entry->reported = 0;
+ qmap_entry->cqe_req = 0;
+
+ switch (send_wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ wqe_p->optype = WQE_OPTYPE_SEND;
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
+ break;
+ case IB_WR_RDMA_READ:
+ wqe_p->optype = WQE_OPTYPE_RDMAREAD;
+ break;
+ default:
+ ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
+ return -EINVAL; /* invalid opcode */
+ }
+
+ wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
+
+ wqe_p->wr_flag = 0;
+
+ if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+ qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+ && !hidden) {
+ wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+ qmap_entry->cqe_req = 1;
+ }
+
+ if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ /* this might not work as long as HW does not support it */
+ wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
+ wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
+ }
+
+ wqe_p->nr_of_data_seg = send_wr->num_sge;
+
+ switch (qp->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ /* no break is intential here */
+ case IB_QPT_UD:
+ /* IB 1.2 spec C10-15 compliance */
+ if (send_wr->wr.ud.remote_qkey & 0x80000000)
+ remote_qkey = qp->qkey;
+
+ wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
+ wqe_p->local_ee_context_qkey = remote_qkey;
+ if (unlikely(!send_wr->wr.ud.ah)) {
+ ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
+ return -EINVAL;
+ }
+ if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+ ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+ return -EINVAL;
+ }
+ my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
+ wqe_p->u.ud_av.ud_av = my_av->av;
+
+ /*
+ * omitted check of IB_SEND_INLINE
+ * since HW does not support it
+ */
+ for (idx = 0; idx < send_wr->num_sge; idx++) {
+ wqe_p->u.ud_av.sg_list[idx].vaddr =
+ send_wr->sg_list[idx].addr;
+ wqe_p->u.ud_av.sg_list[idx].lkey =
+ send_wr->sg_list[idx].lkey;
+ wqe_p->u.ud_av.sg_list[idx].length =
+ send_wr->sg_list[idx].length;
+ } /* eof for idx */
+ if (qp->qp_type == IB_QPT_SMI ||
+ qp->qp_type == IB_QPT_GSI)
+ wqe_p->u.ud_av.ud_av.pmtu = 1;
+ if (qp->qp_type == IB_QPT_GSI) {
+ wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
+#ifdef DEBUG_GSI_SEND_WR
+ trace_send_wr_ud(send_wr);
+#endif /* DEBUG_GSI_SEND_WR */
+ }
+ break;
+
+ case IB_QPT_UC:
+ if (send_wr->send_flags & IB_SEND_FENCE)
+ wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
+ /* no break is intentional here */
+ case IB_QPT_RC:
+ /* TODO: atomic not implemented */
+ wqe_p->u.nud.remote_virtual_address =
+ send_wr->wr.rdma.remote_addr;
+ wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
+
+ /*
+ * omitted checking of IB_SEND_INLINE
+ * since HW does not support it
+ */
+ dma_length = 0;
+ for (idx = 0; idx < send_wr->num_sge; idx++) {
+ wqe_p->u.nud.sg_list[idx].vaddr =
+ send_wr->sg_list[idx].addr;
+ wqe_p->u.nud.sg_list[idx].lkey =
+ send_wr->sg_list[idx].lkey;
+ wqe_p->u.nud.sg_list[idx].length =
+ send_wr->sg_list[idx].length;
+ dma_length += send_wr->sg_list[idx].length;
+ } /* eof idx */
+ wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
+
+ /* unsolicited ack circumvention */
+ if (send_wr->opcode == IB_WR_RDMA_READ) {
+ /* on RDMA read, switch on and reset counters */
+ qp->message_count = qp->packet_count = 0;
+ qp->unsol_ack_circ = 1;
+ } else
+ /* else estimate #packets */
+ qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
+ break;
+
+ default:
+ ehca_gen_err("Invalid qptype=%x", qp->qp_type);
+ return -EINVAL;
+ }
+
+ if (ehca_debug_level >= 3) {
+ ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
+ ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
+ }
+ return 0;
+}
+
+/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
+static inline void map_ib_wc_status(u32 cqe_status,
+ enum ib_wc_status *wc_status)
+{
+ if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
+ switch (cqe_status & 0x3F) {
+ case 0x01:
+ case 0x21:
+ *wc_status = IB_WC_LOC_LEN_ERR;
+ break;
+ case 0x02:
+ case 0x22:
+ *wc_status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case 0x03:
+ case 0x23:
+ *wc_status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case 0x04:
+ case 0x24:
+ *wc_status = IB_WC_LOC_PROT_ERR;
+ break;
+ case 0x05:
+ case 0x25:
+ *wc_status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case 0x06:
+ *wc_status = IB_WC_MW_BIND_ERR;
+ break;
+ case 0x07: /* remote error - look into bits 20:24 */
+ switch ((cqe_status
+ & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
+ case 0x0:
+ /*
+ * PSN Sequence Error!
+ * couldn't find a matching status!
+ */
+ *wc_status = IB_WC_GENERAL_ERR;
+ break;
+ case 0x1:
+ *wc_status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case 0x2:
+ *wc_status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case 0x3:
+ *wc_status = IB_WC_REM_OP_ERR;
+ break;
+ case 0x4:
+ *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ }
+ break;
+ case 0x08:
+ *wc_status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case 0x09:
+ *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case 0x0A:
+ case 0x2D:
+ *wc_status = IB_WC_REM_ABORT_ERR;
+ break;
+ case 0x0B:
+ case 0x2E:
+ *wc_status = IB_WC_INV_EECN_ERR;
+ break;
+ case 0x0C:
+ case 0x2F:
+ *wc_status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ case 0x0D:
+ *wc_status = IB_WC_BAD_RESP_ERR;
+ break;
+ case 0x10:
+ /* WQE purged */
+ *wc_status = IB_WC_WR_FLUSH_ERR;
+ break;
+ default:
+ *wc_status = IB_WC_FATAL_ERR;
+
+ }
+ } else
+ *wc_status = IB_WC_SUCCESS;
+}
+
+static inline int post_one_send(struct ehca_qp *my_qp,
+ struct ib_send_wr *cur_send_wr,
+ int hidden)
+{
+ struct ehca_wqe *wqe_p;
+ int ret;
+ u32 sq_map_idx;
+ u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+ /* get pointer next to free WQE */
+ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+ if (unlikely(!wqe_p)) {
+ /* too many posted work requests: queue overflow */
+ ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+ "qp_num=%x", my_qp->ib_qp.qp_num);
+ return -ENOMEM;
+ }
+
+ /*
+ * Get the index of the WQE in the send queue. The same index is used
+ * for writing into the sq_map.
+ */
+ sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
+
+ /* write a SEND WQE into the QUEUE */
+ ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
+ /*
+ * if something failed,
+ * reset the free entry pointer to the start value
+ */
+ if (unlikely(ret)) {
+ my_qp->ipz_squeue.current_q_offset = start_offset;
+ ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+ "qp_num=%x", my_qp->ib_qp.qp_num);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ehca_post_send(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+ int wqe_cnt = 0;
+ int ret = 0;
+ unsigned long flags;
+
+ /* Reject WR if QP is in RESET, INIT or RTR state */
+ if (unlikely(my_qp->state < IB_QPS_RTS)) {
+ ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x",
+ my_qp->state, qp->qp_num);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* LOCK the QUEUE */
+ spin_lock_irqsave(&my_qp->spinlock_s, flags);
+
+ /* Send an empty extra RDMA read if:
+ * 1) there has been an RDMA read on this connection before
+ * 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+ * 3) we can be sure that any previous extra RDMA read has been
+ * processed so we don't overflow the SQ
+ */
+ if (unlikely(my_qp->unsol_ack_circ &&
+ my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+ my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+ /* insert an empty RDMA READ to fix up the remote QP state */
+ struct ib_send_wr circ_wr;
+ memset(&circ_wr, 0, sizeof(circ_wr));
+ circ_wr.opcode = IB_WR_RDMA_READ;
+ post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
+ wqe_cnt++;
+ ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num);
+ my_qp->message_count = my_qp->packet_count = 0;
+ }
+
+ /* loop processes list of send reqs */
+ while (send_wr) {
+ ret = post_one_send(my_qp, send_wr, 0);
+ if (unlikely(ret)) {
+ goto post_send_exit0;
+ }
+ wqe_cnt++;
+ send_wr = send_wr->next;
+ }
+
+post_send_exit0:
+ iosync(); /* serialize GAL register access */
+ hipz_update_sqa(my_qp, wqe_cnt);
+ if (unlikely(ret || ehca_debug_level >= 2))
+ ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+ my_qp, qp->qp_num, wqe_cnt, ret);
+ my_qp->message_count += wqe_cnt;
+ spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+
+out:
+ if (ret)
+ *bad_send_wr = send_wr;
+ return ret;
+}
+
+static int internal_post_recv(struct ehca_qp *my_qp,
+ struct ib_device *dev,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ struct ehca_wqe *wqe_p;
+ int wqe_cnt = 0;
+ int ret = 0;
+ u32 rq_map_idx;
+ unsigned long flags;
+ struct ehca_qmap_entry *qmap_entry;
+
+ if (unlikely(!HAS_RQ(my_qp))) {
+ ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d",
+ my_qp, my_qp->real_qp_num, my_qp->ext_type);
+ ret = -ENODEV;
+ goto out;
+ }
+
+ /* LOCK the QUEUE */
+ spin_lock_irqsave(&my_qp->spinlock_r, flags);
+
+ /* loop processes list of recv reqs */
+ while (recv_wr) {
+ u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
+ /* get pointer next to free WQE */
+ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
+ if (unlikely(!wqe_p)) {
+ /* too many posted work requests: queue overflow */
+ ret = -ENOMEM;
+ ehca_err(dev, "Too many posted WQEs "
+ "qp_num=%x", my_qp->real_qp_num);
+ goto post_recv_exit0;
+ }
+ /*
+ * Get the index of the WQE in the recv queue. The same index
+ * is used for writing into the rq_map.
+ */
+ rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
+
+ /* write a RECV WQE into the QUEUE */
+ ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
+ rq_map_idx);
+ /*
+ * if something failed,
+ * reset the free entry pointer to the start value
+ */
+ if (unlikely(ret)) {
+ my_qp->ipz_rqueue.current_q_offset = start_offset;
+ ret = -EINVAL;
+ ehca_err(dev, "Could not write WQE "
+ "qp_num=%x", my_qp->real_qp_num);
+ goto post_recv_exit0;
+ }
+
+ qmap_entry = &my_qp->rq_map.map[rq_map_idx];
+ qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
+ qmap_entry->reported = 0;
+ qmap_entry->cqe_req = 1;
+
+ wqe_cnt++;
+ recv_wr = recv_wr->next;
+ } /* eof for recv_wr */
+
+post_recv_exit0:
+ iosync(); /* serialize GAL register access */
+ hipz_update_rqa(my_qp, wqe_cnt);
+ if (unlikely(ret || ehca_debug_level >= 2))
+ ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+ my_qp, my_qp->real_qp_num, wqe_cnt, ret);
+ spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
+
+out:
+ if (ret)
+ *bad_recv_wr = recv_wr;
+
+ return ret;
+}
+
+int ehca_post_recv(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+ /* Reject WR if QP is in RESET state */
+ if (unlikely(my_qp->state == IB_QPS_RESET)) {
+ ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x",
+ my_qp->state, qp->qp_num);
+ *bad_recv_wr = recv_wr;
+ return -EINVAL;
+ }
+
+ return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
+}
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
+ srq->device, recv_wr, bad_recv_wr);
+}
+
+/*
+ * ib_wc_opcode table converts ehca wc opcode to ib
+ * Since we use zero to indicate invalid opcode, the actual ib opcode must
+ * be decremented!!!
+ */
+static const u8 ib_wc_opcode[255] = {
+ [0x01] = IB_WC_RECV+1,
+ [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
+ [0x04] = IB_WC_BIND_MW+1,
+ [0x08] = IB_WC_FETCH_ADD+1,
+ [0x10] = IB_WC_COMP_SWAP+1,
+ [0x20] = IB_WC_RDMA_WRITE+1,
+ [0x40] = IB_WC_RDMA_READ+1,
+ [0x80] = IB_WC_SEND+1
+};
+
+/* internal function to poll one entry of cq */
+static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+{
+ int ret = 0, qmap_tail_idx;
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ struct ehca_cqe *cqe;
+ struct ehca_qp *my_qp;
+ struct ehca_qmap_entry *qmap_entry;
+ struct ehca_queue_map *qmap;
+ int cqe_count = 0, is_error;
+
+repoll:
+ cqe = (struct ehca_cqe *)
+ ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
+ if (!cqe) {
+ ret = -EAGAIN;
+ if (ehca_debug_level >= 3)
+ ehca_dbg(cq->device, "Completion queue is empty "
+ "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
+ goto poll_cq_one_exit0;
+ }
+
+ /* prevents loads being reordered across this point */
+ rmb();
+
+ cqe_count++;
+ if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
+ struct ehca_qp *qp;
+ int purgeflag;
+ unsigned long flags;
+
+ qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
+ if (!qp) {
+ ehca_err(cq->device, "cq_num=%x qp_num=%x "
+ "could not find qp -> ignore cqe",
+ my_cq->cq_number, cqe->local_qp_number);
+ ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
+ my_cq->cq_number, cqe->local_qp_number);
+ /* ignore this purged cqe */
+ goto repoll;
+ }
+ spin_lock_irqsave(&qp->spinlock_s, flags);
+ purgeflag = qp->sqerr_purgeflag;
+ spin_unlock_irqrestore(&qp->spinlock_s, flags);
+
+ if (purgeflag) {
+ ehca_dbg(cq->device,
+ "Got CQE with purged bit qp_num=%x src_qp=%x",
+ cqe->local_qp_number, cqe->remote_qp_number);
+ if (ehca_debug_level >= 2)
+ ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
+ cqe->local_qp_number,
+ cqe->remote_qp_number);
+ /*
+ * ignore this to avoid double cqes of bad wqe
+ * that caused sqe and turn off purge flag
+ */
+ qp->sqerr_purgeflag = 0;
+ goto repoll;
+ }
+ }
+
+ is_error = cqe->status & WC_STATUS_ERROR_BIT;
+
+ /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
+ if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
+ ehca_dbg(cq->device,
+ "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
+ is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
+ ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+ my_cq, my_cq->cq_number);
+ ehca_dbg(cq->device,
+ "ehca_cq=%p cq_num=%x -------------------------",
+ my_cq, my_cq->cq_number);
+ }
+
+ read_lock(&ehca_qp_idr_lock);
+ my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
+ read_unlock(&ehca_qp_idr_lock);
+ if (!my_qp)
+ goto repoll;
+ wc->qp = &my_qp->ib_qp;
+
+ qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+ if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+ /* We got a send completion. */
+ qmap = &my_qp->sq_map;
+ else
+ /* We got a receive completion. */
+ qmap = &my_qp->rq_map;
+
+ /* advance the tail pointer */
+ qmap->tail = qmap_tail_idx;
+
+ if (is_error) {
+ /*
+ * set left_to_poll to 0 because in error state, we will not
+ * get any additional CQEs
+ */
+ my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+ my_qp->sq_map.entries);
+ my_qp->sq_map.left_to_poll = 0;
+ ehca_add_to_err_list(my_qp, 1);
+
+ my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+ my_qp->rq_map.entries);
+ my_qp->rq_map.left_to_poll = 0;
+ if (HAS_RQ(my_qp))
+ ehca_add_to_err_list(my_qp, 0);
+ }
+
+ qmap_entry = &qmap->map[qmap_tail_idx];
+ if (qmap_entry->reported) {
+ ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+ my_qp->real_qp_num);
+ /* found a double cqe, discard it and read next one */
+ goto repoll;
+ }
+
+ wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
+ qmap_entry->reported = 1;
+
+ /* if left_to_poll is decremented to 0, add the QP to the error list */
+ if (qmap->left_to_poll > 0) {
+ qmap->left_to_poll--;
+ if ((my_qp->sq_map.left_to_poll == 0) &&
+ (my_qp->rq_map.left_to_poll == 0)) {
+ ehca_add_to_err_list(my_qp, 1);
+ if (HAS_RQ(my_qp))
+ ehca_add_to_err_list(my_qp, 0);
+ }
+ }
+
+ /* eval ib_wc_opcode */
+ wc->opcode = ib_wc_opcode[cqe->optype]-1;
+ if (unlikely(wc->opcode == -1)) {
+ ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
+ "ehca_cq=%p cq_num=%x",
+ cqe->optype, cqe->status, my_cq, my_cq->cq_number);
+ /* dump cqe for other infos */
+ ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+ my_cq, my_cq->cq_number);
+ /* update also queue adder to throw away this entry!!! */
+ goto repoll;
+ }
+
+ /* eval ib_wc_status */
+ if (unlikely(is_error)) {
+ /* complete with errors */
+ map_ib_wc_status(cqe->status, &wc->status);
+ wc->vendor_err = wc->status;
+ } else
+ wc->status = IB_WC_SUCCESS;
+
+ wc->byte_len = cqe->nr_bytes_transferred;
+ wc->pkey_index = cqe->pkey_index;
+ wc->slid = cqe->rlid;
+ wc->dlid_path_bits = cqe->dlid;
+ wc->src_qp = cqe->remote_qp_number;
+ /*
+ * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+ * SW defines those in bits 1 and 0, so we can just shift and mask.
+ */
+ wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
+ wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
+ wc->sl = cqe->service_level;
+
+poll_cq_one_exit0:
+ if (cqe_count > 0)
+ hipz_update_feca(my_cq, cqe_count);
+
+ return ret;
+}
+
+static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
+ struct ib_wc *wc, int num_entries,
+ struct ipz_queue *ipz_queue, int on_sq)
+{
+ int nr = 0;
+ struct ehca_wqe *wqe;
+ u64 offset;
+ struct ehca_queue_map *qmap;
+ struct ehca_qmap_entry *qmap_entry;
+
+ if (on_sq)
+ qmap = &my_qp->sq_map;
+ else
+ qmap = &my_qp->rq_map;
+
+ qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+ while ((nr < num_entries) && (qmap_entry->reported == 0)) {
+ /* generate flush CQE */
+
+ memset(wc, 0, sizeof(*wc));
+
+ offset = qmap->next_wqe_idx * ipz_queue->qe_size;
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
+ if (!wqe) {
+ ehca_err(cq->device, "Invalid wqe offset=%#llx on "
+ "qp_num=%#x", offset, my_qp->real_qp_num);
+ return nr;
+ }
+
+ wc->wr_id = replace_wr_id(wqe->work_request_id,
+ qmap_entry->app_wr_id);
+
+ if (on_sq) {
+ switch (wqe->optype) {
+ case WQE_OPTYPE_SEND:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case WQE_OPTYPE_RDMAWRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case WQE_OPTYPE_RDMAREAD:
+ wc->opcode = IB_WC_RDMA_READ;
+ break;
+ default:
+ ehca_err(cq->device, "Invalid optype=%x",
+ wqe->optype);
+ return nr;
+ }
+ } else
+ wc->opcode = IB_WC_RECV;
+
+ if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
+ wc->ex.imm_data = wqe->immediate_data;
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ }
+
+ wc->status = IB_WC_WR_FLUSH_ERR;
+
+ wc->qp = &my_qp->ib_qp;
+
+ /* mark as reported and advance next_wqe pointer */
+ qmap_entry->reported = 1;
+ qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+ qmap->entries);
+ qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+ wc++; nr++;
+ }
+
+ return nr;
+
+}
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int nr;
+ struct ehca_qp *err_qp;
+ struct ib_wc *current_wc = wc;
+ int ret = 0;
+ unsigned long flags;
+ int entries_left = num_entries;
+
+ if (num_entries < 1) {
+ ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+ "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
+ ret = -EINVAL;
+ goto poll_cq_exit0;
+ }
+
+ spin_lock_irqsave(&my_cq->spinlock, flags);
+
+ /* generate flush cqes for send queues */
+ list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
+ nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+ &err_qp->ipz_squeue, 1);
+ entries_left -= nr;
+ current_wc += nr;
+
+ if (entries_left == 0)
+ break;
+ }
+
+ /* generate flush cqes for receive queues */
+ list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
+ nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+ &err_qp->ipz_rqueue, 0);
+ entries_left -= nr;
+ current_wc += nr;
+
+ if (entries_left == 0)
+ break;
+ }
+
+ for (nr = 0; nr < entries_left; nr++) {
+ ret = ehca_poll_cq_one(cq, current_wc);
+ if (ret)
+ break;
+ current_wc++;
+ } /* eof for nr */
+ entries_left -= nr;
+
+ spin_unlock_irqrestore(&my_cq->spinlock, flags);
+ if (ret == -EAGAIN || !ret)
+ ret = num_entries - entries_left;
+
+poll_cq_exit0:
+ return ret;
+}
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
+{
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int ret = 0;
+
+ switch (notify_flags & IB_CQ_SOLICITED_MASK) {
+ case IB_CQ_SOLICITED:
+ hipz_set_cqx_n0(my_cq, 1);
+ break;
+ case IB_CQ_NEXT_COMP:
+ hipz_set_cqx_n1(my_cq, 1);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ unsigned long spl_flags;
+ spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+ ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
+ spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+ }
+
+ return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_sqp.c b/drivers/staging/rdma/ehca/ehca_sqp.c
new file mode 100644
index 000000000000..376b031c2c7f
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_sqp.c
@@ -0,0 +1,245 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * SQP functions
+ *
+ * Authors: Khadija Souissi <souissi@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
+
+/**
+ * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
+ * pair is created successfully, the corresponding port gets active.
+ *
+ * Define Special Queue pair 0 (SMI QP) is still not supported.
+ *
+ * @qp_init_attr: Queue pair init attributes with port and queue pair type
+ */
+
+u64 ehca_define_sqp(struct ehca_shca *shca,
+ struct ehca_qp *ehca_qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ u32 pma_qp_nr, bma_qp_nr;
+ u64 ret;
+ u8 port = qp_init_attr->port_num;
+ int counter;
+
+ shca->sport[port - 1].port_state = IB_PORT_DOWN;
+
+ switch (qp_init_attr->qp_type) {
+ case IB_QPT_SMI:
+ /* function not supported yet */
+ break;
+ case IB_QPT_GSI:
+ ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
+ ehca_qp->ipz_qp_handle,
+ ehca_qp->galpas.kernel,
+ (u32) qp_init_attr->port_num,
+ &pma_qp_nr, &bma_qp_nr);
+
+ if (ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device,
+ "Can't define AQP1 for port %x. h_ret=%lli",
+ port, ret);
+ return ret;
+ }
+ shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+ ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+ port, pma_qp_nr);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "invalid qp_type=%x",
+ qp_init_attr->qp_type);
+ return H_PARAMETER;
+ }
+
+ if (ehca_nr_ports < 0) /* autodetect mode */
+ return H_SUCCESS;
+
+ for (counter = 0;
+ shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
+ counter < ehca_port_act_time;
+ counter++) {
+ ehca_dbg(&shca->ib_device, "... wait until port %x is active",
+ port);
+ msleep_interruptible(1000);
+ }
+
+ if (counter == ehca_port_act_time) {
+ ehca_err(&shca->ib_device, "Port %x is not active.", port);
+ return H_HARDWARE;
+ }
+
+ return H_SUCCESS;
+}
+
+struct ib_perf {
+ struct ib_mad_hdr mad_hdr;
+ u8 reserved[40];
+ u8 data[192];
+} __attribute__ ((packed));
+
+/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
+struct tcslfl {
+ u32 tc:8;
+ u32 sl:4;
+ u32 fl:20;
+} __attribute__ ((packed));
+
+/* IP Version/TC/FL packed into 32 bits, as in GRH */
+struct vertcfl {
+ u32 ver:4;
+ u32 tc:8;
+ u32 fl:20;
+} __attribute__ ((packed));
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
+ struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+ struct ib_class_port_info *poi =
+ (struct ib_class_port_info *)out_perf->data;
+ struct tcslfl *tcslfl =
+ (struct tcslfl *)&poi->redirect_tcslfl;
+ struct ehca_shca *shca =
+ container_of(ibdev, struct ehca_shca, ib_device);
+ struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+ ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+ *out_mad = *in_mad;
+
+ if (in_perf->mad_hdr.class_version != 1) {
+ ehca_warn(ibdev, "Unsupported class_version=%x",
+ in_perf->mad_hdr.class_version);
+ out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+ goto perf_reply;
+ }
+
+ switch (in_perf->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_SET:
+ /* set class port info for redirection */
+ out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+ out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+ memset(poi, 0, sizeof(*poi));
+ poi->base_version = 1;
+ poi->class_version = 1;
+ poi->resp_time_value = 18;
+
+ /* copy local routing information from WC where applicable */
+ tcslfl->sl = in_wc->sl;
+ poi->redirect_lid =
+ sport->saved_attr.lid | in_wc->dlid_path_bits;
+ poi->redirect_qp = sport->pma_qp_nr;
+ poi->redirect_qkey = IB_QP1_QKEY;
+
+ ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
+ &poi->redirect_pkey);
+
+ /* if request was globally routed, copy route info */
+ if (in_grh) {
+ const struct vertcfl *vertcfl =
+ (const struct vertcfl *)&in_grh->version_tclass_flow;
+ memcpy(poi->redirect_gid, in_grh->dgid.raw,
+ sizeof(poi->redirect_gid));
+ tcslfl->tc = vertcfl->tc;
+ tcslfl->fl = vertcfl->fl;
+ } else
+ /* else only fill in default GID */
+ ehca_query_gid(ibdev, port_num, 0,
+ (union ib_gid *)&poi->redirect_gid);
+
+ ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+ sport->saved_attr.lid, sport->pma_qp_nr);
+ break;
+
+ case IB_MGMT_METHOD_GET_RESP:
+ return IB_MAD_RESULT_FAILURE;
+
+ default:
+ out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+ break;
+ }
+
+perf_reply:
+ out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ int ret;
+ const struct ib_mad *in_mad = (const struct ib_mad *)in;
+ struct ib_mad *out_mad = (struct ib_mad *)out;
+
+ if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+ *out_mad_size != sizeof(*out_mad)))
+ return IB_MAD_RESULT_FAILURE;
+
+ if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
+ return IB_MAD_RESULT_FAILURE;
+
+ /* accept only pma request */
+ if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+ return IB_MAD_RESULT_SUCCESS;
+
+ ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+ ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
+ in_mad, out_mad);
+
+ return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_tools.h b/drivers/staging/rdma/ehca/ehca_tools.h
new file mode 100644
index 000000000000..d280b12aae64
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_tools.h
@@ -0,0 +1,155 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * auxiliary functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Khadija Souissi <souissik@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef EHCA_TOOLS_H
+#define EHCA_TOOLS_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+
+#include <linux/atomic.h>
+#include <asm/ibmebus.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hvcall.h>
+
+extern int ehca_debug_level;
+
+#define ehca_dbg(ib_dev, format, arg...) \
+ do { \
+ if (unlikely(ehca_debug_level)) \
+ dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
+ "PU%04x EHCA_DBG:%s " format "\n", \
+ raw_smp_processor_id(), __func__, \
+ ## arg); \
+ } while (0)
+
+#define ehca_info(ib_dev, format, arg...) \
+ dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_warn(ib_dev, format, arg...) \
+ dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_err(ib_dev, format, arg...) \
+ dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+/* use this one only if no ib_dev available */
+#define ehca_gen_dbg(format, arg...) \
+ do { \
+ if (unlikely(ehca_debug_level)) \
+ printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg); \
+ } while (0)
+
+#define ehca_gen_warn(format, arg...) \
+ printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_gen_err(format, arg...) \
+ printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+/**
+ * ehca_dmp - printk a memory block, whose length is n*8 bytes.
+ * Each line has the following layout:
+ * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
+ */
+#define ehca_dmp(adr, len, format, args...) \
+ do { \
+ unsigned int x; \
+ unsigned int l = (unsigned int)(len); \
+ unsigned char *deb = (unsigned char *)(adr); \
+ for (x = 0; x < l; x += 16) { \
+ printk(KERN_INFO "EHCA_DMP:%s " format \
+ " adr=%p ofs=%04x %016llx %016llx\n", \
+ __func__, ##args, deb, x, \
+ *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
+ deb += 16; \
+ } \
+ } while (0)
+
+/* define a bitmask, little endian version */
+#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
+
+/* define a bitmask, the ibm way... */
+#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
+
+/* internal function, don't use */
+#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
+
+/* internal function, don't use */
+#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
+
+/**
+ * EHCA_BMASK_SET - return value shifted and masked by mask
+ * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
+ * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
+ * in variable
+ */
+#define EHCA_BMASK_SET(mask, value) \
+ ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
+
+/**
+ * EHCA_BMASK_GET - extract a parameter from value by mask
+ */
+#define EHCA_BMASK_GET(mask, value) \
+ (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
+
+/* Converts ehca to ib return code */
+int ehca2ib_return_code(u64 ehca_rc);
+
+#endif /* EHCA_TOOLS_H */
diff --git a/drivers/staging/rdma/ehca/ehca_uverbs.c b/drivers/staging/rdma/ehca/ehca_uverbs.c
new file mode 100644
index 000000000000..1a1d5d99fcf9
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_uverbs.c
@@ -0,0 +1,309 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * userspace support verbs
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+ struct ib_udata *udata)
+{
+ struct ehca_ucontext *my_context;
+
+ my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
+ if (!my_context) {
+ ehca_err(device, "Out of memory device=%p", device);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &my_context->ib_ucontext;
+}
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
+ return 0;
+}
+
+static void ehca_mm_open(struct vm_area_struct *vma)
+{
+ u32 *count = (u32 *)vma->vm_private_data;
+ if (!count) {
+ ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ return;
+ }
+ (*count)++;
+ if (!(*count))
+ ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+ vma->vm_start, vma->vm_end, *count);
+}
+
+static void ehca_mm_close(struct vm_area_struct *vma)
+{
+ u32 *count = (u32 *)vma->vm_private_data;
+ if (!count) {
+ ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ return;
+ }
+ (*count)--;
+ ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+ vma->vm_start, vma->vm_end, *count);
+}
+
+static const struct vm_operations_struct vm_ops = {
+ .open = ehca_mm_open,
+ .close = ehca_mm_close,
+};
+
+static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
+ u32 *mm_count)
+{
+ int ret;
+ u64 vsize, physical;
+
+ vsize = vma->vm_end - vma->vm_start;
+ if (vsize < EHCA_PAGESIZE) {
+ ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
+ return -EINVAL;
+ }
+
+ physical = galpas->user.fw_handle;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
+ /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+ ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+ vma->vm_page_prot);
+ if (unlikely(ret)) {
+ ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
+ return -ENOMEM;
+ }
+
+ vma->vm_private_data = mm_count;
+ (*mm_count)++;
+ vma->vm_ops = &vm_ops;
+
+ return 0;
+}
+
+static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
+ u32 *mm_count)
+{
+ int ret;
+ u64 start, ofs;
+ struct page *page;
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ start = vma->vm_start;
+ for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
+ u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
+ page = virt_to_page(virt_addr);
+ ret = vm_insert_page(vma, start, page);
+ if (unlikely(ret)) {
+ ehca_gen_err("vm_insert_page() failed rc=%i", ret);
+ return ret;
+ }
+ start += PAGE_SIZE;
+ }
+ vma->vm_private_data = mm_count;
+ (*mm_count)++;
+ vma->vm_ops = &vm_ops;
+
+ return 0;
+}
+
+static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
+ u32 rsrc_type)
+{
+ int ret;
+
+ switch (rsrc_type) {
+ case 0: /* galpa fw handle */
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
+ ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_fw() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ case 1: /* cq queue_addr */
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
+ ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_queue() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
+ rsrc_type, cq->cq_number);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
+ u32 rsrc_type)
+{
+ int ret;
+
+ switch (rsrc_type) {
+ case 0: /* galpa fw handle */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
+ ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "remap_pfn_range() failed ret=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return -ENOMEM;
+ }
+ break;
+
+ case 1: /* qp rqueue_addr */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
+ ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
+ &qp->mm_count_rqueue);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ case 2: /* qp squeue_addr */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
+ ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
+ &qp->mm_count_squeue);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
+ rsrc_type, qp->ib_qp.qp_num);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ u64 fileoffset = vma->vm_pgoff;
+ u32 idr_handle = fileoffset & 0x1FFFFFF;
+ u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */
+ u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
+ u32 ret;
+ struct ehca_cq *cq;
+ struct ehca_qp *qp;
+ struct ib_uobject *uobject;
+
+ switch (q_type) {
+ case 0: /* CQ */
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, idr_handle);
+ read_unlock(&ehca_cq_idr_lock);
+
+ /* make sure this mmap really belongs to the authorized user */
+ if (!cq)
+ return -EINVAL;
+
+ if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
+ return -EINVAL;
+
+ ret = ehca_mmap_cq(vma, cq, rsrc_type);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_cq() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ case 1: /* QP */
+ read_lock(&ehca_qp_idr_lock);
+ qp = idr_find(&ehca_qp_idr, idr_handle);
+ read_unlock(&ehca_qp_idr_lock);
+
+ /* make sure this mmap really belongs to the authorized user */
+ if (!qp)
+ return -EINVAL;
+
+ uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
+ if (!uobject || uobject->context != context)
+ return -EINVAL;
+
+ ret = ehca_mmap_qp(vma, qp, rsrc_type);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_qp() failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_gen_err("bad queue type %x", q_type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.c b/drivers/staging/rdma/ehca/hcp_if.c
new file mode 100644
index 000000000000..89517ffb4389
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hcp_if.c
@@ -0,0 +1,949 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware Infiniband Interface code for POWER
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <asm/hvcall.h>
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hcp_phyp.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define H_ALL_RES_QP_ENHANCED_OPS EHCA_BMASK_IBM(9, 11)
+#define H_ALL_RES_QP_PTE_PIN EHCA_BMASK_IBM(12, 12)
+#define H_ALL_RES_QP_SERVICE_TYPE EHCA_BMASK_IBM(13, 15)
+#define H_ALL_RES_QP_STORAGE EHCA_BMASK_IBM(16, 17)
+#define H_ALL_RES_QP_LL_RQ_CQE_POSTING EHCA_BMASK_IBM(18, 18)
+#define H_ALL_RES_QP_LL_SQ_CQE_POSTING EHCA_BMASK_IBM(19, 21)
+#define H_ALL_RES_QP_SIGNALING_TYPE EHCA_BMASK_IBM(22, 23)
+#define H_ALL_RES_QP_UD_AV_LKEY_CTRL EHCA_BMASK_IBM(31, 31)
+#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
+#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
+#define H_ALL_RES_QP_RESOURCE_TYPE EHCA_BMASK_IBM(56, 63)
+
+#define H_ALL_RES_QP_MAX_OUTST_SEND_WR EHCA_BMASK_IBM(0, 15)
+#define H_ALL_RES_QP_MAX_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_MAX_SEND_SGE EHCA_BMASK_IBM(32, 39)
+#define H_ALL_RES_QP_MAX_RECV_SGE EHCA_BMASK_IBM(40, 47)
+
+#define H_ALL_RES_QP_UD_AV_LKEY EHCA_BMASK_IBM(32, 63)
+#define H_ALL_RES_QP_SRQ_QP_TOKEN EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_SRQ_QP_HANDLE EHCA_BMASK_IBM(0, 64)
+#define H_ALL_RES_QP_SRQ_LIMIT EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_SRQ_QPN EHCA_BMASK_IBM(40, 63)
+
+#define H_ALL_RES_QP_ACT_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_ACT_OUTST_RECV_WR EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_ACT_SEND_SGE EHCA_BMASK_IBM(8, 15)
+#define H_ALL_RES_QP_ACT_RECV_SGE EHCA_BMASK_IBM(24, 31)
+
+#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63)
+
+#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47)
+#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48)
+#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49)
+
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
+
+static DEFINE_SPINLOCK(hcall_lock);
+
+static long ehca_plpar_hcall_norets(unsigned long opcode,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6,
+ unsigned long arg7)
+{
+ long ret;
+ int i, sleep_msecs;
+ unsigned long flags = 0;
+
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
+ opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+
+ for (i = 0; i < 5; i++) {
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
+ spin_lock_irqsave(&hcall_lock, flags);
+
+ ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
+ arg5, arg6, arg7);
+
+ if (ehca_lock_hcalls)
+ spin_unlock_irqrestore(&hcall_lock, flags);
+
+ if (H_IS_LONG_BUSY(ret)) {
+ sleep_msecs = get_longbusy_msecs(ret);
+ msleep_interruptible(sleep_msecs);
+ continue;
+ }
+
+ if (ret < H_SUCCESS)
+ ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+ opcode, ret, arg1, arg2, arg3,
+ arg4, arg5, arg6, arg7);
+ else
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
+
+ return ret;
+ }
+
+ return H_BUSY;
+}
+
+static long ehca_plpar_hcall9(unsigned long opcode,
+ unsigned long *outs, /* array of 9 outputs */
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6,
+ unsigned long arg7,
+ unsigned long arg8,
+ unsigned long arg9)
+{
+ long ret;
+ int i, sleep_msecs;
+ unsigned long flags = 0;
+
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+ arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+
+ for (i = 0; i < 5; i++) {
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
+ spin_lock_irqsave(&hcall_lock, flags);
+
+ ret = plpar_hcall9(opcode, outs,
+ arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+
+ if (ehca_lock_hcalls)
+ spin_unlock_irqrestore(&hcall_lock, flags);
+
+ if (H_IS_LONG_BUSY(ret)) {
+ sleep_msecs = get_longbusy_msecs(ret);
+ msleep_interruptible(sleep_msecs);
+ continue;
+ }
+
+ if (ret < H_SUCCESS) {
+ ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+ opcode, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+ ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+ ret, outs[0], outs[1], outs[2], outs[3],
+ outs[4], outs[5], outs[6], outs[7],
+ outs[8]);
+ } else if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+ ret, outs[0], outs[1], outs[2], outs[3],
+ outs[4], outs[5], outs[6], outs[7],
+ outs[8]);
+ return ret;
+ }
+
+ return H_BUSY;
+}
+
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_pfeq *pfeq,
+ const u32 neq_control,
+ const u32 number_of_entries,
+ struct ipz_eq_handle *eq_handle,
+ u32 *act_nr_of_entries,
+ u32 *act_pages,
+ u32 *eq_ist)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+ u64 allocate_controls;
+
+ /* resource type */
+ allocate_controls = 3ULL;
+
+ /* ISN is associated */
+ if (neq_control != 1)
+ allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
+ else /* notification event queue */
+ allocate_controls = (1ULL << 63) | allocate_controls;
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ allocate_controls, /* r5 */
+ number_of_entries, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ eq_handle->handle = outs[0];
+ *act_nr_of_entries = (u32)outs[3];
+ *act_pages = (u32)outs[4];
+ *eq_ist = (u32)outs[5];
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resource - ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+ struct ipz_eq_handle eq_handle,
+ const u64 event_mask)
+{
+ return ehca_plpar_hcall_norets(H_RESET_EVENTS,
+ adapter_handle.handle, /* r4 */
+ eq_handle.handle, /* r5 */
+ event_mask, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ struct ehca_alloc_cq_parms *param)
+{
+ int rc;
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 2, /* r5 */
+ param->eq_handle.handle, /* r6 */
+ cq->token, /* r7 */
+ param->nr_cqe, /* r8 */
+ 0, 0, 0, 0);
+ cq->ipz_cq_handle.handle = outs[0];
+ param->act_nr_of_entries = (u32)outs[3];
+ param->act_pages = (u32)outs[4];
+
+ if (ret == H_SUCCESS) {
+ rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+ if (rc) {
+ ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+ rc, outs[5]);
+
+ ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ cq->ipz_cq_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+ ret = H_NO_MEM;
+ }
+ }
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_alloc_qp_parms *parms, int is_user)
+{
+ int rc;
+ u64 ret;
+ u64 allocate_controls, max_r10_reg, r11, r12;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ allocate_controls =
+ EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
+ parms->squeue.page_size)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
+ parms->rqueue.page_size)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
+ !!(parms->ll_comp_flags & LLQP_RECV_COMP))
+ | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
+ !!(parms->ll_comp_flags & LLQP_SEND_COMP))
+ | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
+ parms->ud_av_l_key_ctl)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
+
+ max_r10_reg =
+ EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
+ parms->squeue.max_wr + 1)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
+ parms->rqueue.max_wr + 1)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
+ parms->squeue.max_sge)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
+ parms->rqueue.max_sge);
+
+ r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
+
+ if (parms->ext_type == EQPT_SRQ)
+ r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
+ else
+ r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ allocate_controls, /* r5 */
+ parms->send_cq_handle.handle,
+ parms->recv_cq_handle.handle,
+ parms->eq_handle.handle,
+ ((u64)parms->token << 32) | parms->pd.value,
+ max_r10_reg, r11, r12);
+
+ parms->qp_handle.handle = outs[0];
+ parms->real_qp_num = (u32)outs[1];
+ parms->squeue.act_nr_wqes =
+ (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
+ parms->rqueue.act_nr_wqes =
+ (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
+ parms->squeue.act_nr_sges =
+ (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
+ parms->rqueue.act_nr_sges =
+ (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
+ parms->squeue.queue_size =
+ (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
+ parms->rqueue.queue_size =
+ (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+ if (ret == H_SUCCESS) {
+ rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+ if (rc) {
+ ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+ rc, outs[6]);
+
+ ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ parms->qp_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+ ret = H_NO_MEM;
+ }
+ }
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id,
+ struct hipz_query_port *query_port_response_block)
+{
+ u64 ret;
+ u64 r_cb = __pa(query_port_response_block);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("response block not page aligned");
+ return H_PARAMETER;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
+ adapter_handle.handle, /* r4 */
+ port_id, /* r5 */
+ r_cb, /* r6 */
+ 0, 0, 0, 0);
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(query_port_response_block, 64, "response_block");
+
+ return ret;
+}
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id, const u32 port_cap,
+ const u8 init_type, const int modify_mask)
+{
+ u64 port_attributes = port_cap;
+
+ if (modify_mask & IB_PORT_SHUTDOWN)
+ port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
+ if (modify_mask & IB_PORT_INIT_TYPE)
+ port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
+ if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
+
+ return ehca_plpar_hcall_norets(H_MODIFY_PORT,
+ adapter_handle.handle, /* r4 */
+ port_id, /* r5 */
+ port_attributes, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+ struct hipz_query_hca *query_hca_rblock)
+{
+ u64 r_cb = __pa(query_hca_rblock);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("response_block=%p not page aligned",
+ query_hca_rblock);
+ return H_PARAMETER;
+ }
+
+ return ehca_plpar_hcall_norets(H_QUERY_HCA,
+ adapter_handle.handle, /* r4 */
+ r_cb, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 resource_handle,
+ const u64 logical_address_of_page,
+ u64 count)
+{
+ return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
+ adapter_handle.handle, /* r4 */
+ (u64)queue_type | ((u64)pagesize) << 8,
+ /* r5 */
+ resource_handle, /* r6 */
+ logical_address_of_page, /* r7 */
+ count, /* r8 */
+ 0, 0);
+}
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_eq_handle eq_handle,
+ struct ehca_pfeq *pfeq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count)
+{
+ if (count != 1) {
+ ehca_gen_err("Ppage counter=%llx", count);
+ return H_PARAMETER;
+ }
+ return hipz_h_register_rpage(adapter_handle,
+ pagesize,
+ queue_type,
+ eq_handle.handle,
+ logical_address_of_page, count);
+}
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
+ u32 ist)
+{
+ u64 ret;
+ ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
+ adapter_handle.handle, /* r4 */
+ ist, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret != H_SUCCESS && ret != H_BUSY)
+ ehca_gen_err("Could not query interrupt state.");
+
+ return ret;
+}
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_cq_handle cq_handle,
+ struct ehca_pfcq *pfcq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa gal)
+{
+ if (count != 1) {
+ ehca_gen_err("Page counter=%llx", count);
+ return H_PARAMETER;
+ }
+
+ return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+ cq_handle.handle, logical_address_of_page,
+ count);
+}
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa galpa)
+{
+ if (count > 1) {
+ ehca_gen_err("Page counter=%llx", count);
+ return H_PARAMETER;
+ }
+
+ return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+ qp_handle.handle, logical_address_of_page,
+ count);
+}
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ void **log_addr_next_sq_wqe2processed,
+ void **log_addr_next_rq_wqe2processed,
+ int dis_and_get_function_code)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+ adapter_handle.handle, /* r4 */
+ dis_and_get_function_code, /* r5 */
+ qp_handle.handle, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ if (log_addr_next_sq_wqe2processed)
+ *log_addr_next_sq_wqe2processed = (void *)outs[0];
+ if (log_addr_next_rq_wqe2processed)
+ *log_addr_next_rq_wqe2processed = (void *)outs[1];
+
+ return ret;
+}
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u64 update_mask,
+ struct hcp_modify_qp_control_block *mqpcb,
+ struct h_galpa gal)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+ ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ update_mask, /* r6 */
+ __pa(mqpcb), /* r7 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Insufficient resources ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ struct hcp_modify_qp_control_block *qqpcb,
+ struct h_galpa gal)
+{
+ return ehca_plpar_hcall_norets(H_QUERY_QP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ __pa(qqpcb), /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_qp *qp)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = hcp_galpas_dtor(&qp->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct qp->galpas");
+ return H_RESOURCE;
+ }
+ ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+ adapter_handle.handle, /* r4 */
+ /* function code */
+ 1, /* r5 */
+ qp->ipz_qp_handle.handle, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ if (ret == H_HARDWARE)
+ ehca_gen_err("HCA not operational. ret=%lli", ret);
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ qp->ipz_qp_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("Resource still in use. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port)
+{
+ return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ port, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port, u32 * pma_qp_nr,
+ u32 * bma_qp_nr)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ port, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ *pma_qp_nr = (u32)outs[0];
+ *bma_qp_nr = (u32)outs[1];
+
+ if (ret == H_ALIAS_EXIST)
+ ehca_gen_err("AQP1 already exists. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id)
+{
+ u64 ret;
+
+ ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ mcg_dlid, /* r6 */
+ interface_id, /* r7 */
+ subnet_prefix, /* r8 */
+ 0, 0);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id)
+{
+ return ehca_plpar_hcall_norets(H_DETACH_MCQP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ mcg_dlid, /* r6 */
+ interface_id, /* r7 */
+ subnet_prefix, /* r8 */
+ 0, 0);
+}
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ u8 force_flag)
+{
+ u64 ret;
+
+ ret = hcp_galpas_dtor(&cq->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct cp->galpas");
+ return H_RESOURCE;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ cq->ipz_cq_handle.handle, /* r5 */
+ force_flag != 0 ? 1L : 0L, /* r6 */
+ 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_eq *eq)
+{
+ u64 ret;
+
+ ret = hcp_galpas_dtor(&eq->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct eq->galpas");
+ return H_RESOURCE;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ eq->ipz_eq_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("Resource in use. ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 5, /* r5 */
+ vaddr, /* r6 */
+ length, /* r7 */
+ (((u64)access_ctrl) << 32ULL), /* r8 */
+ pd.value, /* r9 */
+ 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count)
+{
+ u64 ret;
+
+ if (unlikely(ehca_debug_level >= 3)) {
+ if (count > 1) {
+ u64 *kpage;
+ int i;
+ kpage = __va(logical_address_of_page);
+ for (i = 0; i < count; i++)
+ ehca_gen_dbg("kpage[%d]=%p",
+ i, (void *)kpage[i]);
+ } else
+ ehca_gen_dbg("kpage=%p",
+ (void *)logical_address_of_page);
+ }
+
+ if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
+ ehca_gen_err("logical_address_of_page not on a 4k boundary "
+ "adapter_handle=%llx mr=%p mr_handle=%llx "
+ "pagesize=%x queue_type=%x "
+ "logical_address_of_page=%llx count=%llx",
+ adapter_handle.handle, mr,
+ mr->ipz_mr_handle.handle, pagesize, queue_type,
+ logical_address_of_page, count);
+ ret = H_PARAMETER;
+ } else
+ ret = hipz_h_register_rpage(adapter_handle, pagesize,
+ queue_type,
+ mr->ipz_mr_handle.handle,
+ logical_address_of_page, count);
+ return ret;
+}
+
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0, 0, 0);
+ outparms->len = outs[0];
+ outparms->vaddr = outs[1];
+ outparms->acl = outs[4] >> 32;
+ outparms->lkey = (u32)(outs[5] >> 32);
+ outparms->rkey = (u32)(outs[5] & (0xffffffff));
+
+ return ret;
+}
+
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr)
+{
+ return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr_in,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ const u64 mr_addr_cb,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ vaddr_in, /* r6 */
+ length, /* r7 */
+ /* r8 */
+ ((((u64)access_ctrl) << 32ULL) | pd.value),
+ mr_addr_cb, /* r9 */
+ 0, 0, 0);
+ outparms->vaddr = outs[1];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const struct ehca_mr *orig_mr,
+ const u64 vaddr_in,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
+ adapter_handle.handle, /* r4 */
+ orig_mr->ipz_mr_handle.handle, /* r5 */
+ vaddr_in, /* r6 */
+ (((u64)access_ctrl) << 32ULL), /* r7 */
+ pd.value, /* r8 */
+ 0, 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ const struct ipz_pd pd,
+ struct ehca_mw_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 6, /* r5 */
+ pd.value, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ struct ehca_mw_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
+ adapter_handle.handle, /* r4 */
+ mw->ipz_mw_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0, 0, 0);
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw)
+{
+ return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ mw->ipz_mw_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+ const u64 ressource_handle,
+ void *rblock,
+ unsigned long *byte_count)
+{
+ u64 r_cb = __pa(rblock);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("rblock not page aligned.");
+ return H_PARAMETER;
+ }
+
+ return ehca_plpar_hcall_norets(H_ERROR_DATA,
+ adapter_handle.handle,
+ ressource_handle,
+ r_cb,
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_eoi(int irq)
+{
+ unsigned long xirr;
+
+ iosync();
+ xirr = (0xffULL << 24) | irq;
+
+ return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.h b/drivers/staging/rdma/ehca/hcp_if.h
new file mode 100644
index 000000000000..a46e514c367b
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hcp_if.h
@@ -0,0 +1,265 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware Infiniband Interface code for POWER
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_IF_H__
+#define __HCP_IF_H__
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "hipz_hw.h"
+
+/*
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
+ * resources, create the empty EQPT (ring).
+ */
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_pfeq *pfeq,
+ const u32 neq_control,
+ const u32 number_of_entries,
+ struct ipz_eq_handle *eq_handle,
+ u32 * act_nr_of_entries,
+ u32 * act_pages,
+ u32 * eq_ist);
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+ struct ipz_eq_handle eq_handle,
+ const u64 event_mask);
+/*
+ * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize
+ * resources, create the empty CQPT (ring).
+ */
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ struct ehca_alloc_cq_parms *param);
+
+
+/*
+ * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
+ * initialize resources, create empty QPPTs (2 rings).
+ */
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_alloc_qp_parms *parms, int is_user);
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id,
+ struct hipz_query_port *query_port_response_block);
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id, const u32 port_cap,
+ const u8 init_type, const int modify_mask);
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+ struct hipz_query_hca *query_hca_rblock);
+
+/*
+ * hipz_h_register_rpage internal function in hcp_if.h for all
+ * hcp_H_REGISTER_RPAGE calls.
+ */
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 resource_handle,
+ const u64 logical_address_of_page,
+ u64 count);
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_eq_handle eq_handle,
+ struct ehca_pfeq *pfeq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count);
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle
+ hcp_adapter_handle,
+ u32 ist);
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_cq_handle cq_handle,
+ struct ehca_pfcq *pfcq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa gal);
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa galpa);
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ void **log_addr_next_sq_wqe_tb_processed,
+ void **log_addr_next_rq_wqe_tb_processed,
+ int dis_and_get_function_code);
+enum hcall_sigt {
+ HCALL_SIGT_NO_CQE = 0,
+ HCALL_SIGT_BY_WQE = 1,
+ HCALL_SIGT_EVERY = 2
+};
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u64 update_mask,
+ struct hcp_modify_qp_control_block *mqpcb,
+ struct h_galpa gal);
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ struct hcp_modify_qp_control_block *qqpcb,
+ struct h_galpa gal);
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_qp *qp);
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port);
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port, u32 * pma_qp_nr,
+ u32 * bma_qp_nr);
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ u8 force_flag);
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_eq *eq);
+
+/*
+ * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count);
+
+/* hipz_h_query_mr queries MR in HW and FW */
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mr frees MR resources in HW and FW */
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr);
+
+/* hipz_h_reregister_pmr reregisters MR in HW and FW */
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr_in,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ const u64 mr_addr_cb,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_smr register shared MR in HW and FW */
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const struct ehca_mr *orig_mr,
+ const u64 vaddr_in,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/*
+ * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ const struct ipz_pd pd,
+ struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_query_mw queries MW in HW and FW */
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mw frees MW resources in HW and FW */
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw);
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+ const u64 ressource_handle,
+ void *rblock,
+ unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
+
+#endif /* __HCP_IF_H__ */
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.c b/drivers/staging/rdma/ehca/hcp_phyp.c
new file mode 100644
index 000000000000..077376ff3d28
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hcp_phyp.c
@@ -0,0 +1,82 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * load store abstraction for ehca register access with tracing
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+u64 hcall_map_page(u64 physaddr)
+{
+ return (u64)ioremap(physaddr, EHCA_PAGESIZE);
+}
+
+int hcall_unmap_page(u64 mapaddr)
+{
+ iounmap((volatile void __iomem *) mapaddr);
+ return 0;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user)
+{
+ if (!is_user) {
+ galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
+ if (!galpas->kernel.fw_handle)
+ return -ENOMEM;
+ } else
+ galpas->kernel.fw_handle = 0;
+
+ galpas->user.fw_handle = paddr_user;
+
+ return 0;
+}
+
+int hcp_galpas_dtor(struct h_galpas *galpas)
+{
+ if (galpas->kernel.fw_handle) {
+ int ret = hcall_unmap_page(galpas->kernel.fw_handle);
+ if (ret)
+ return ret;
+ }
+
+ galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
+
+ return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.h b/drivers/staging/rdma/ehca/hcp_phyp.h
new file mode 100644
index 000000000000..d1b029910249
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hcp_phyp.h
@@ -0,0 +1,90 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware calls
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_PHYP_H__
+#define __HCP_PHYP_H__
+
+
+/*
+ * eHCA page (mapped into memory)
+ * resource to access eHCA register pages in CPU address space
+*/
+struct h_galpa {
+ u64 fw_handle;
+ /* for pSeries this is a 64bit memory address where
+ I/O memory is mapped into CPU address space (kv) */
+};
+
+/*
+ * resource to access eHCA address space registers, all types
+ */
+struct h_galpas {
+ u32 pid; /*PID of userspace galpa checking */
+ struct h_galpa user; /* user space accessible resource,
+ set to 0 if unused */
+ struct h_galpa kernel; /* kernel space accessible resource,
+ set to 0 if unused */
+};
+
+static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
+{
+ u64 addr = galpa.fw_handle + offset;
+ return *(volatile u64 __force *)addr;
+}
+
+static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+{
+ u64 addr = galpa.fw_handle + offset;
+ *(volatile u64 __force *)addr = value;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user);
+
+int hcp_galpas_dtor(struct h_galpas *galpas);
+
+u64 hcall_map_page(u64 physaddr);
+
+int hcall_unmap_page(u64 mapaddr);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns.h b/drivers/staging/rdma/ehca/hipz_fns.h
new file mode 100644
index 000000000000..9dac93d02140
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hipz_fns.h
@@ -0,0 +1,68 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HW abstraction register functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_H__
+#define __HIPZ_FNS_H__
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+#include "hipz_fns_core.h"
+
+#define hipz_galpa_store_eq(gal, offset, value) \
+ hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_eq(gal, offset) \
+ hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qped(gal, offset, value) \
+ hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_qped(gal, offset) \
+ hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
+
+#define hipz_galpa_store_mrmw(gal, offset, value) \
+ hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_mrmw(gal, offset) \
+ hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns_core.h b/drivers/staging/rdma/ehca/hipz_fns_core.h
new file mode 100644
index 000000000000..868735fd3187
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hipz_fns_core.h
@@ -0,0 +1,100 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HW abstraction register functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_CORE_H__
+#define __HIPZ_FNS_CORE_H__
+
+#include "hcp_phyp.h"
+#include "hipz_hw.h"
+
+#define hipz_galpa_store_cq(gal, offset, value) \
+ hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_cq(gal, offset) \
+ hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qp(gal, offset, value) \
+ hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
+#define hipz_galpa_load_qp(gal, offset) \
+ hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
+
+static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+ /* ringing doorbell :-) */
+ hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
+ EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
+}
+
+static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+ /* ringing doorbell :-) */
+ hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
+ EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
+}
+
+static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
+{
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
+ EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
+}
+
+static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
+{
+ u64 cqx_n0_reg;
+
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
+ EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
+ value));
+ cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
+}
+
+static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
+{
+ u64 cqx_n1_reg;
+
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
+ EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
+ cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
+}
+
+#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/staging/rdma/ehca/hipz_hw.h b/drivers/staging/rdma/ehca/hipz_hw.h
new file mode 100644
index 000000000000..bf996c7acc42
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hipz_hw.h
@@ -0,0 +1,414 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * eHCA register definitions
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_HW_H__
+#define __HIPZ_HW_H__
+
+#include "ehca_tools.h"
+
+#define EHCA_MAX_MTU 4
+
+/* QP Table Entry Memory Map */
+struct hipz_qptemm {
+ u64 qpx_hcr;
+ u64 qpx_c;
+ u64 qpx_herr;
+ u64 qpx_aer;
+/* 0x20*/
+ u64 qpx_sqa;
+ u64 qpx_sqc;
+ u64 qpx_rqa;
+ u64 qpx_rqc;
+/* 0x40*/
+ u64 qpx_st;
+ u64 qpx_pmstate;
+ u64 qpx_pmfa;
+ u64 qpx_pkey;
+/* 0x60*/
+ u64 qpx_pkeya;
+ u64 qpx_pkeyb;
+ u64 qpx_pkeyc;
+ u64 qpx_pkeyd;
+/* 0x80*/
+ u64 qpx_qkey;
+ u64 qpx_dqp;
+ u64 qpx_dlidp;
+ u64 qpx_portp;
+/* 0xa0*/
+ u64 qpx_slidp;
+ u64 qpx_slidpp;
+ u64 qpx_dlida;
+ u64 qpx_porta;
+/* 0xc0*/
+ u64 qpx_slida;
+ u64 qpx_slidpa;
+ u64 qpx_slvl;
+ u64 qpx_ipd;
+/* 0xe0*/
+ u64 qpx_mtu;
+ u64 qpx_lato;
+ u64 qpx_rlimit;
+ u64 qpx_rnrlimit;
+/* 0x100*/
+ u64 qpx_t;
+ u64 qpx_sqhp;
+ u64 qpx_sqptp;
+ u64 qpx_nspsn;
+/* 0x120*/
+ u64 qpx_nspsnhwm;
+ u64 reserved1;
+ u64 qpx_sdsi;
+ u64 qpx_sdsbc;
+/* 0x140*/
+ u64 qpx_sqwsize;
+ u64 qpx_sqwts;
+ u64 qpx_lsn;
+ u64 qpx_nssn;
+/* 0x160 */
+ u64 qpx_mor;
+ u64 qpx_cor;
+ u64 qpx_sqsize;
+ u64 qpx_erc;
+/* 0x180*/
+ u64 qpx_rnrrc;
+ u64 qpx_ernrwt;
+ u64 qpx_rnrresp;
+ u64 qpx_lmsna;
+/* 0x1a0 */
+ u64 qpx_sqhpc;
+ u64 qpx_sqcptp;
+ u64 qpx_sigt;
+ u64 qpx_wqecnt;
+/* 0x1c0*/
+ u64 qpx_rqhp;
+ u64 qpx_rqptp;
+ u64 qpx_rqsize;
+ u64 qpx_nrr;
+/* 0x1e0*/
+ u64 qpx_rdmac;
+ u64 qpx_nrpsn;
+ u64 qpx_lapsn;
+ u64 qpx_lcr;
+/* 0x200*/
+ u64 qpx_rwc;
+ u64 qpx_rwva;
+ u64 qpx_rdsi;
+ u64 qpx_rdsbc;
+/* 0x220*/
+ u64 qpx_rqwsize;
+ u64 qpx_crmsn;
+ u64 qpx_rdd;
+ u64 qpx_larpsn;
+/* 0x240*/
+ u64 qpx_pd;
+ u64 qpx_scqn;
+ u64 qpx_rcqn;
+ u64 qpx_aeqn;
+/* 0x260*/
+ u64 qpx_aaelog;
+ u64 qpx_ram;
+ u64 qpx_rdmaqe0;
+ u64 qpx_rdmaqe1;
+/* 0x280*/
+ u64 qpx_rdmaqe2;
+ u64 qpx_rdmaqe3;
+ u64 qpx_nrpsnhwm;
+/* 0x298*/
+ u64 reserved[(0x400 - 0x298) / 8];
+/* 0x400 extended data */
+ u64 reserved_ext[(0x500 - 0x400) / 8];
+/* 0x500 */
+ u64 reserved2[(0x1000 - 0x500) / 8];
+/* 0x1000 */
+};
+
+#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
+
+#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
+
+/* MRMWPT Entry Memory Map */
+struct hipz_mrmwmm {
+ /* 0x00 */
+ u64 mrx_hcr;
+
+ u64 mrx_c;
+ u64 mrx_herr;
+ u64 mrx_aer;
+ /* 0x20 */
+ u64 mrx_pp;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ /* 0x40 */
+ u64 reserved4[(0x200 - 0x40) / 8];
+ /* 0x200 */
+ u64 mrx_ctl[64];
+
+};
+
+#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
+
+struct hipz_qpedmm {
+ /* 0x00 */
+ u64 reserved0[(0x400) / 8];
+ /* 0x400 */
+ u64 qpedx_phh;
+ u64 qpedx_ppsgp;
+ /* 0x410 */
+ u64 qpedx_ppsgu;
+ u64 qpedx_ppdgp;
+ /* 0x420 */
+ u64 qpedx_ppdgu;
+ u64 qpedx_aph;
+ /* 0x430 */
+ u64 qpedx_apsgp;
+ u64 qpedx_apsgu;
+ /* 0x440 */
+ u64 qpedx_apdgp;
+ u64 qpedx_apdgu;
+ /* 0x450 */
+ u64 qpedx_apav;
+ u64 qpedx_apsav;
+ /* 0x460 */
+ u64 qpedx_hcr;
+ u64 reserved1[4];
+ /* 0x488 */
+ u64 qpedx_rrl0;
+ /* 0x490 */
+ u64 qpedx_rrrkey0;
+ u64 qpedx_rrva0;
+ /* 0x4a0 */
+ u64 reserved2;
+ u64 qpedx_rrl1;
+ /* 0x4b0 */
+ u64 qpedx_rrrkey1;
+ u64 qpedx_rrva1;
+ /* 0x4c0 */
+ u64 reserved3;
+ u64 qpedx_rrl2;
+ /* 0x4d0 */
+ u64 qpedx_rrrkey2;
+ u64 qpedx_rrva2;
+ /* 0x4e0 */
+ u64 reserved4;
+ u64 qpedx_rrl3;
+ /* 0x4f0 */
+ u64 qpedx_rrrkey3;
+ u64 qpedx_rrva3;
+};
+
+#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
+
+/* CQ Table Entry Memory Map */
+struct hipz_cqtemm {
+ u64 cqx_hcr;
+ u64 cqx_c;
+ u64 cqx_herr;
+ u64 cqx_aer;
+/* 0x20 */
+ u64 cqx_ptp;
+ u64 cqx_tp;
+ u64 cqx_fec;
+ u64 cqx_feca;
+/* 0x40 */
+ u64 cqx_ep;
+ u64 cqx_eq;
+/* 0x50 */
+ u64 reserved1;
+ u64 cqx_n0;
+/* 0x60 */
+ u64 cqx_n1;
+ u64 reserved2[(0x1000 - 0x60) / 8];
+/* 0x1000 */
+};
+
+#define CQX_FEC_CQE_CNT EHCA_BMASK_IBM(32, 63)
+#define CQX_FECADDER EHCA_BMASK_IBM(32, 63)
+#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+
+#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
+
+/* EQ Table Entry Memory Map */
+struct hipz_eqtemm {
+ u64 eqx_hcr;
+ u64 eqx_c;
+
+ u64 eqx_herr;
+ u64 eqx_aer;
+/* 0x20 */
+ u64 eqx_ptp;
+ u64 eqx_tp;
+ u64 eqx_ssba;
+ u64 eqx_psba;
+
+/* 0x40 */
+ u64 eqx_cec;
+ u64 eqx_meql;
+ u64 eqx_xisbi;
+ u64 eqx_xisc;
+/* 0x60 */
+ u64 eqx_it;
+
+};
+
+#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
+
+/* access control defines for MR/MW */
+#define HIPZ_ACCESSCTRL_L_WRITE 0x00800000
+#define HIPZ_ACCESSCTRL_R_WRITE 0x00400000
+#define HIPZ_ACCESSCTRL_R_READ 0x00200000
+#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
+#define HIPZ_ACCESSCTRL_MW_BIND 0x00080000
+
+/* query hca response block */
+struct hipz_query_hca {
+ u32 cur_reliable_dg;
+ u32 cur_qp;
+ u32 cur_cq;
+ u32 cur_eq;
+ u32 cur_mr;
+ u32 cur_mw;
+ u32 cur_ee_context;
+ u32 cur_mcast_grp;
+ u32 cur_qp_attached_mcast_grp;
+ u32 reserved1;
+ u32 cur_ipv6_qp;
+ u32 cur_eth_qp;
+ u32 cur_hp_mr;
+ u32 reserved2[3];
+ u32 max_rd_domain;
+ u32 max_qp;
+ u32 max_cq;
+ u32 max_eq;
+ u32 max_mr;
+ u32 max_hp_mr;
+ u32 max_mw;
+ u32 max_mrwpte;
+ u32 max_special_mrwpte;
+ u32 max_rd_ee_context;
+ u32 max_mcast_grp;
+ u32 max_total_mcast_qp_attach;
+ u32 max_mcast_qp_attach;
+ u32 max_raw_ipv6_qp;
+ u32 max_raw_ethy_qp;
+ u32 internal_clock_frequency;
+ u32 max_pd;
+ u32 max_ah;
+ u32 max_cqe;
+ u32 max_wqes_wq;
+ u32 max_partitions;
+ u32 max_rr_ee_context;
+ u32 max_rr_qp;
+ u32 max_rr_hca;
+ u32 max_act_wqs_ee_context;
+ u32 max_act_wqs_qp;
+ u32 max_sge;
+ u32 max_sge_rd;
+ u32 memory_page_size_supported;
+ u64 max_mr_size;
+ u32 local_ca_ack_delay;
+ u32 num_ports;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ u64 node_guid;
+ u64 hca_cap_indicators;
+ u32 data_counter_register_size;
+ u32 max_shared_rq;
+ u32 max_isns_eq;
+ u32 max_neq;
+} __attribute__ ((packed));
+
+#define HCA_CAP_AH_PORT_NR_CHECK EHCA_BMASK_IBM( 0, 0)
+#define HCA_CAP_ATOMIC EHCA_BMASK_IBM( 1, 1)
+#define HCA_CAP_AUTO_PATH_MIG EHCA_BMASK_IBM( 2, 2)
+#define HCA_CAP_BAD_P_KEY_CTR EHCA_BMASK_IBM( 3, 3)
+#define HCA_CAP_SQD_RTS_PORT_CHANGE EHCA_BMASK_IBM( 4, 4)
+#define HCA_CAP_CUR_QP_STATE_MOD EHCA_BMASK_IBM( 5, 5)
+#define HCA_CAP_INIT_TYPE EHCA_BMASK_IBM( 6, 6)
+#define HCA_CAP_PORT_ACTIVE_EVENT EHCA_BMASK_IBM( 7, 7)
+#define HCA_CAP_Q_KEY_VIOL_CTR EHCA_BMASK_IBM( 8, 8)
+#define HCA_CAP_WQE_RESIZE EHCA_BMASK_IBM( 9, 9)
+#define HCA_CAP_RAW_PACKET_MCAST EHCA_BMASK_IBM(10, 10)
+#define HCA_CAP_SHUTDOWN_PORT EHCA_BMASK_IBM(11, 11)
+#define HCA_CAP_RC_LL_QP EHCA_BMASK_IBM(12, 12)
+#define HCA_CAP_SRQ EHCA_BMASK_IBM(13, 13)
+#define HCA_CAP_UD_LL_QP EHCA_BMASK_IBM(16, 16)
+#define HCA_CAP_RESIZE_MR EHCA_BMASK_IBM(17, 17)
+#define HCA_CAP_MINI_QP EHCA_BMASK_IBM(18, 18)
+#define HCA_CAP_H_ALLOC_RES_SYNC EHCA_BMASK_IBM(19, 19)
+
+/* query port response block */
+struct hipz_query_port {
+ u32 state;
+ u32 bad_pkey_cntr;
+ u32 lmc;
+ u32 lid;
+ u32 subnet_timeout;
+ u32 qkey_viol_cntr;
+ u32 sm_sl;
+ u32 sm_lid;
+ u32 capability_mask;
+ u32 init_type_reply;
+ u32 pkey_tbl_len;
+ u32 gid_tbl_len;
+ u64 gid_prefix;
+ u32 port_nr;
+ u16 pkey_entries[16];
+ u8 reserved1[32];
+ u32 trent_size;
+ u32 trbuf_size;
+ u64 max_msg_sz;
+ u32 max_mtu;
+ u32 vl_cap;
+ u32 phys_pstate;
+ u32 phys_state;
+ u32 phys_speed;
+ u32 phys_width;
+ u8 reserved2[1884];
+ u64 guid_entries[255];
+} __attribute__ ((packed));
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.c b/drivers/staging/rdma/ehca/ipz_pt_fn.c
new file mode 100644
index 000000000000..7ffc748cb973
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ipz_pt_fn.c
@@ -0,0 +1,289 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * internal queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
+
+struct kmem_cache *small_qp_cache;
+
+void *ipz_qpageit_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ queue->current_q_offset += queue->pagesize;
+ if (queue->current_q_offset > queue->queue_length) {
+ queue->current_q_offset -= queue->pagesize;
+ ret = NULL;
+ }
+ if (((u64)ret) % queue->pagesize) {
+ ehca_gen_err("ERROR!! not at PAGE-Boundary");
+ return NULL;
+ }
+ return ret;
+}
+
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u64 last_entry_in_q = queue->queue_length - queue->qe_size;
+
+ queue->current_q_offset += queue->qe_size;
+ if (queue->current_q_offset > last_entry_in_q) {
+ queue->current_q_offset = 0;
+ queue->toggle_state = (~queue->toggle_state) & 1;
+ }
+
+ return ret;
+}
+
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
+{
+ int i;
+ for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
+ u64 page = __pa(queue->queue_pages[i]);
+ if (addr >= page && addr < page + queue->pagesize) {
+ *q_offset = addr - page + i * queue->pagesize;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large than eHCA pages (4K) !
+#endif
+
+/*
+ * allocate pages for queue:
+ * outer loop allocates whole kernel pages (page aligned) and
+ * inner loop divides a kernel page into smaller hca queue pages
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+ int k, f = 0;
+ u8 *kpage;
+
+ while (f < nr_of_pages) {
+ kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
+ if (!kpage)
+ goto out;
+
+ for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+ queue->queue_pages[f] = (struct ipz_page *)kpage;
+ kpage += EHCA_PAGESIZE;
+ f++;
+ }
+ }
+ return 1;
+
+out:
+ for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+ f += PAGES_PER_KPAGE)
+ free_page((unsigned long)(queue->queue_pages)[f]);
+ return 0;
+}
+
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+ int order = ilog2(queue->pagesize) - 9;
+ struct ipz_small_queue_page *page;
+ unsigned long bit;
+
+ mutex_lock(&pd->lock);
+
+ if (!list_empty(&pd->free[order]))
+ page = list_entry(pd->free[order].next,
+ struct ipz_small_queue_page, list);
+ else {
+ page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+ if (!page)
+ goto out;
+
+ page->page = get_zeroed_page(GFP_KERNEL);
+ if (!page->page) {
+ kmem_cache_free(small_qp_cache, page);
+ goto out;
+ }
+
+ list_add(&page->list, &pd->free[order]);
+ }
+
+ bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+ __set_bit(bit, page->bitmap);
+ page->fill++;
+
+ if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+ list_move(&page->list, &pd->full[order]);
+
+ mutex_unlock(&pd->lock);
+
+ queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+ queue->small_page = page;
+ queue->offset = bit << (order + 9);
+ return 1;
+
+out:
+ ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+ mutex_unlock(&pd->lock);
+ return 0;
+}
+
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+ int order = ilog2(queue->pagesize) - 9;
+ struct ipz_small_queue_page *page = queue->small_page;
+ unsigned long bit;
+ int free_page = 0;
+
+ bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+ >> (order + 9);
+
+ mutex_lock(&pd->lock);
+
+ __clear_bit(bit, page->bitmap);
+ page->fill--;
+
+ if (page->fill == 0) {
+ list_del(&page->list);
+ free_page = 1;
+ }
+
+ if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
+ /* the page was full until we freed the chunk */
+ list_move_tail(&page->list, &pd->free[order]);
+
+ mutex_unlock(&pd->lock);
+
+ if (free_page) {
+ free_page(page->page);
+ kmem_cache_free(small_qp_cache, page);
+ }
+}
+
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ const u32 nr_of_pages, const u32 pagesize,
+ const u32 qe_size, const u32 nr_of_sg,
+ int is_small)
+{
+ if (pagesize > PAGE_SIZE) {
+ ehca_gen_err("FATAL ERROR: pagesize=%x "
+ "is greater than kernel page size", pagesize);
+ return 0;
+ }
+
+ /* init queue fields */
+ queue->queue_length = nr_of_pages * pagesize;
+ queue->pagesize = pagesize;
+ queue->qe_size = qe_size;
+ queue->act_nr_of_sg = nr_of_sg;
+ queue->current_q_offset = 0;
+ queue->toggle_state = 1;
+ queue->small_page = NULL;
+
+ /* allocate queue page pointers */
+ queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!queue->queue_pages) {
+ queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
+ if (!queue->queue_pages) {
+ ehca_gen_err("Couldn't allocate queue page list");
+ return 0;
+ }
+ }
+
+ /* allocate actual queue pages */
+ if (is_small) {
+ if (!alloc_small_queue_page(queue, pd))
+ goto ipz_queue_ctor_exit0;
+ } else
+ if (!alloc_queue_pages(queue, nr_of_pages))
+ goto ipz_queue_ctor_exit0;
+
+ return 1;
+
+ipz_queue_ctor_exit0:
+ ehca_gen_err("Couldn't alloc pages queue=%p "
+ "nr_of_pages=%x", queue, nr_of_pages);
+ kvfree(queue->queue_pages);
+
+ return 0;
+}
+
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+ int i, nr_pages;
+
+ if (!queue || !queue->queue_pages) {
+ ehca_gen_dbg("queue or queue_pages is NULL");
+ return 0;
+ }
+
+ if (queue->small_page)
+ free_small_queue_page(queue, pd);
+ else {
+ nr_pages = queue->queue_length / queue->pagesize;
+ for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+ free_page((unsigned long)queue->queue_pages[i]);
+ }
+
+ kvfree(queue->queue_pages);
+
+ return 1;
+}
+
+int ehca_init_small_qp_cache(void)
+{
+ small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+ sizeof(struct ipz_small_queue_page),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!small_qp_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ehca_cleanup_small_qp_cache(void)
+{
+ kmem_cache_destroy(small_qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.h b/drivers/staging/rdma/ehca/ipz_pt_fn.h
new file mode 100644
index 000000000000..a801274ea337
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ipz_pt_fn.h
@@ -0,0 +1,289 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * internal queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IPZ_PT_FN_H__
+#define __IPZ_PT_FN_H__
+
+#define EHCA_PAGESHIFT 12
+#define EHCA_PAGESIZE 4096UL
+#define EHCA_PAGEMASK (~(EHCA_PAGESIZE-1))
+#define EHCA_PT_ENTRIES 512UL
+
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+
+struct ehca_pd;
+struct ipz_small_queue_page;
+
+extern struct kmem_cache *small_qp_cache;
+
+/* struct generic ehca page */
+struct ipz_page {
+ u8 entries[EHCA_PAGESIZE];
+};
+
+#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
+
+struct ipz_small_queue_page {
+ unsigned long page;
+ unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
+ int fill;
+ void *mapped_addr;
+ u32 mmap_count;
+ struct list_head list;
+};
+
+/* struct generic queue in linux kernel virtual memory (kv) */
+struct ipz_queue {
+ u64 current_q_offset; /* current queue entry */
+
+ struct ipz_page **queue_pages; /* array of pages belonging to queue */
+ u32 qe_size; /* queue entry size */
+ u32 act_nr_of_sg;
+ u32 queue_length; /* queue length allocated in bytes */
+ u32 pagesize;
+ u32 toggle_state; /* toggle flag - per page */
+ u32 offset; /* save offset within page for small_qp */
+ struct ipz_small_queue_page *small_page;
+};
+
+/*
+ * return current Queue Entry for a certain q_offset
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
+{
+ struct ipz_page *current_page;
+ if (q_offset >= queue->queue_length)
+ return NULL;
+ current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
+ return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
+}
+
+/*
+ * return current Queue Entry
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_get(struct ipz_queue *queue)
+{
+ return ipz_qeit_calc(queue, queue->current_q_offset);
+}
+
+/*
+ * return current Queue Page , increment Queue Page iterator from
+ * page to page in struct ipz_queue, last increment will return 0! and
+ * NOT wrap
+ * returns address (kv) of Queue Page
+ * warning don't use in parallel with ipz_QE_get_inc()
+ */
+void *ipz_qpageit_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ queue->current_q_offset += queue->qe_size;
+ if (queue->current_q_offset >= queue->queue_length) {
+ queue->current_q_offset = 0;
+ /* toggle the valid flag */
+ queue->toggle_state = (~queue->toggle_state) & 1;
+ }
+
+ return ret;
+}
+
+/*
+ * return a bool indicating whether current Queue Entry is valid
+ */
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+ struct ehca_cqe *cqe = ipz_qeit_get(queue);
+ return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
+}
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
+{
+ return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
+}
+
+/*
+ * returns and resets Queue Entry iterator
+ * returns address (kv) of first Queue Entry
+ */
+static inline void *ipz_qeit_reset(struct ipz_queue *queue)
+{
+ queue->current_q_offset = 0;
+ return ipz_qeit_get(queue);
+}
+
+/*
+ * return the q_offset corresponding to an absolute address
+ */
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
+
+/*
+ * return the next queue offset. don't modify the queue.
+ */
+static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
+{
+ offset += queue->qe_size;
+ if (offset >= queue->queue_length) offset = 0;
+ return offset;
+}
+
+/* struct generic page table */
+struct ipz_pt {
+ u64 entries[EHCA_PT_ENTRIES];
+};
+
+/* struct page table for a queue, only to be used in pf */
+struct ipz_qpt {
+ /* queue page tables (kv), use u64 because we know the element length */
+ u64 *qpts;
+ u32 n_qpts;
+ u32 n_ptes; /* number of page table entries */
+ u64 *current_pte_addr;
+};
+
+/*
+ * constructor for a ipz_queue_t, placement new for ipz_queue_t,
+ * new for all dependent datastructors
+ * all QP Tables are the same
+ * flow:
+ * allocate+pin queue
+ * see ipz_qpt_ctor()
+ * returns true if ok, false if out of memory
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ const u32 nr_of_pages, const u32 pagesize,
+ const u32 qe_size, const u32 nr_of_sg,
+ int is_small);
+
+/*
+ * destructor for a ipz_queue_t
+ * -# free queue
+ * see ipz_queue_ctor()
+ * returns true if ok, false if queue was NULL-ptr of free failed
+ */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
+
+/*
+ * constructor for a ipz_qpt_t,
+ * placement new for struct ipz_queue, new for all dependent datastructors
+ * all QP Tables are the same,
+ * flow:
+ * -# allocate+pin queue
+ * -# initialise ptcb
+ * -# allocate+pin PTs
+ * -# link PTs to a ring, according to HCA Arch, set bit62 id needed
+ * -# the ring must have room for exactly nr_of_PTEs
+ * see ipz_qpt_ctor()
+ */
+void ipz_qpt_ctor(struct ipz_qpt *qpt,
+ const u32 nr_of_qes,
+ const u32 pagesize,
+ const u32 qe_size,
+ const u8 lowbyte, const u8 toggle,
+ u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ * fix EQ page problems
+ */
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Event Queue Entry, increment Queue Entry iterator
+ * by one step in struct ipz_queue if valid, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_queue_QPageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ */
+static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u32 qe = *(u8 *)ret;
+ if ((qe >> 7) != (queue->toggle_state & 1))
+ return NULL;
+ ipz_qeit_eq_get_inc(queue); /* this is a good one */
+ return ret;
+}
+
+static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u32 qe = *(u8 *)ret;
+ if ((qe >> 7) != (queue->toggle_state & 1))
+ return NULL;
+ return ret;
+}
+
+/* returns address (GX) of first queue entry */
+static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
+{
+ return be64_to_cpu(qpt->qpts[0]);
+}
+
+/* returns address (kv) of first page of queue page table */
+static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
+{
+ return qpt->qpts;
+}
+
+#endif /* __IPZ_PT_FN_H__ */
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
new file mode 100644
index 000000000000..fd25078ee923
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Kconfig
@@ -0,0 +1,37 @@
+config INFINIBAND_HFI1
+ tristate "Intel OPA Gen1 support"
+ depends on X86_64
+ default m
+ ---help---
+ This is a low-level driver for Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+ bool "HFI1 SDMA Order debug"
+ depends on INFINIBAND_HFI1
+ default n
+ ---help---
+ This is a debug flag to test for out of order
+ sdma completions for unit testing
+config HFI1_VERBS_31BIT_PSN
+ bool "HFI1 enable 31 bit PSN"
+ depends on INFINIBAND_HFI1
+ default y
+ ---help---
+ Setting this enables 31 BIT PSN
+ For verbs RC/UC
+config SDMA_VERBOSITY
+ bool "Config SDMA Verbosity"
+ depends on INFINIBAND_HFI1
+ default n
+ ---help---
+ This is a configuration flag to enable verbose
+ SDMA debug
+config PRESCAN_RXQ
+ bool "Enable prescanning of the RX queue for ECNs"
+ depends on INFINIBAND_HFI1
+ default n
+ ---help---
+ This option toggles the prescanning of the receive queue for
+ Explicit Congestion Notifications. If an ECN is detected, it
+ is processed as quickly as possible, the ECN is toggled off.
+ After the prescanning step, the receive queue is processed as
+ usual.
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
new file mode 100644
index 000000000000..2e5daa6cdcc2
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/Makefile
@@ -0,0 +1,19 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \
+ init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \
+ qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \
+ uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
new file mode 100644
index 000000000000..05de0dad8762
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/TODO
@@ -0,0 +1,6 @@
+July, 2015
+
+- Remove unneeded file entries in sysfs
+- Remove software processing of IB protocol and place in library for use
+ by qib, ipath (if still present), hfi1, and eventually soft-roce
+
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
new file mode 100644
index 000000000000..aa58e597df06
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -0,0 +1,10798 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header byte = 10304 byte
+ * 10304 byte / 12.5 GB/s = 824.32ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+ u64 flag; /* the flag */
+ char *str; /* description string */
+ u16 extra; /* extra information */
+ u16 unused0;
+ u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED 0x1
+#define SEC_PACKET_DROPPED 0x2
+#define SEC_SC_HALTED 0x4 /* per-context only */
+#define SEC_SPC_FREEZE 0x8 /* per-HFI only */
+
+#define VL15CTXT 1
+#define MIN_KERNEL_KCTXTS 2
+#define NUM_MAP_REGS 32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT 39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE 2ull
+#define QW_SHIFT 6ull
+/* QPN[7..1] */
+#define QPN_WIDTH 7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW 0ull
+#define LRH_BTH_BIT_OFFSET 48ull
+#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK 3ull
+#define LRH_BTH_VALUE 2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW 0ull
+#define LRH_SC_BIT_OFFSET 56ull
+#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK 128ull
+#define LRH_SC_VALUE 0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull))
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+ num, \
+ sc0, sc0val, \
+ sc1, sc1val, \
+ sc2, sc2val, \
+ sc3, sc3val, \
+ sc4, sc4val, \
+ sc5, sc5val, \
+ sc6, sc6val, \
+ sc7, sc7val) \
+( \
+ ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+ ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+ ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+ ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+ ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+ ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+ ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+ ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \
+)
+
+#define DC_SC_VL_VAL( \
+ range, \
+ e0, e0val, \
+ e1, e1val, \
+ e2, e2val, \
+ e3, e3val, \
+ e4, e4val, \
+ e5, e5val, \
+ e6, e6val, \
+ e7, e7val, \
+ e8, e8val, \
+ e9, e9val, \
+ e10, e10val, \
+ e11, e11val, \
+ e12, e12val, \
+ e13, e13val, \
+ e14, e14val, \
+ e15, e15val) \
+( \
+ ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+ ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+ ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+ ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+ ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+ ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+ ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+ ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+ ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+ ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+ ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+ ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+ ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+ ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+ ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+ ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+ | CCE_STATUS_RXE_FROZE_SMASK \
+ | CCE_STATUS_TXE_FROZE_SMASK \
+ | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+ | CCE_STATUS_TXE_PAUSED_SMASK \
+ | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+ CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+ CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+ CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+ CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+ CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+ CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+ CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+ CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+ CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+ CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+ CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+ CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+ CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+ CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+ CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+ CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+ CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+ CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+ CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+ CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+ CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+ CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+ CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+ CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+ CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+ CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+ CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+ CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+ CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+ CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+ SEC_WRITE_DROPPED,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+ SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+ SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+ (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+ SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+ SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+ SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+ SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR \
+ (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+ | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+ | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+ SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+ SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+ SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+ SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+ SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+ SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+ SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+ SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+ SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+ SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+ SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+ SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+ SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+ SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+ SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+ SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+ SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+ SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+ SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+ SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+ SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+ SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+ (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+ | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+ | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+ | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+ | SEES(TX_LAUNCH_CSR_PARITY) \
+ | SEES(TX_SBRD_CTL_CSR_PARITY) \
+ | SEES(TX_CONFIG_PARITY) \
+ | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+ | SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+ SEC_PACKET_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+ SEC_PACKET_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+ RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+ RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+ RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+ RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+ RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+ RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+ RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+ RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+ RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+ RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+ RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+ RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR \
+ (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+ (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+ RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+ RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+ FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+ FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+ FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+ FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+ FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+ FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+ FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+ FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+ FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+ FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+ FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+ FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+ FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+ FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+ FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+ FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+ FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+ FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+ FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+ FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+ FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+ FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+ FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+ FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+ FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+ FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+ FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+ FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+ FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+ FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+ FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+ FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+ FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+ FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+ FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+ FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+ FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+ FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+ FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+ FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+ FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+ FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+ FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+ FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+ FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+ FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+ LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+ LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+ LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+ LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+ LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+ LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+ FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+ FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+ FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+ FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+ FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+ FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+ FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+ FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+ FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+ D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+ FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+ FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED),
+ FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME),
+ FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET),
+ FLAG_ENTRY0("Serdes internal loopback failure",
+ FAILED_SERDES_INTERNAL_LOOPBACK),
+ FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT),
+ FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING),
+ FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE),
+ FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM),
+ FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ),
+ FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+ FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+ FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+ FLAG_ENTRY0("Host request done", 0x0001),
+ FLAG_ENTRY0("BC SMA message", 0x0002),
+ FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+ FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+ FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+ FLAG_ENTRY0("External device config request", 0x0020),
+ FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+ FLAG_ENTRY0("LinkUp achieved", 0x0080),
+ FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+ u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+ u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+ u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+ u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+ u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+ u8 *tx_polarity_inversion,
+ u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+ u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+ u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+ int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+
+/*
+ * Error interrupt table entry. This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+ u32 status; /* status CSR offset */
+ u32 clear; /* clear CSR offset */
+ u32 mask; /* mask CSR offset */
+ void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+ const char *desc;
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries. Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+ { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+ handler, desc }
+#define DC_EE1(reg, handler, desc) \
+ { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+ { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
+
+/*
+ * Table of the "misc" grouping of error interrupts. Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"),
+/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"),
+/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr")
+ /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+ EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"),
+/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"),
+/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
+ /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register can not be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * Table of the DC grouping of error interrupts. Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"),
+/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"),
+/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
+ /* the rest are reserved */
+};
+
+struct cntr_entry {
+ /*
+ * counter name
+ */
+ char *name;
+
+ /*
+ * csr to read for name (if applicable)
+ */
+ u64 csr;
+
+ /*
+ * offset into dd or ppd to store the counter's value
+ */
+ int offset;
+
+ /*
+ * flags
+ */
+ u8 flags;
+
+ /*
+ * accessor for stat element, context either dd or ppd
+ */
+ u64 (*rw_cntr)(const struct cntr_entry *,
+ void *context,
+ int vl,
+ int mode,
+ u64 data);
+};
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+ name, \
+ csr, \
+ offset, \
+ flags, \
+ accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY64), \
+ 0, flags, \
+ port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY64), \
+ 0, flags, \
+ dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+ (RCV_HDR_OVFL_CNT + ctx*0x100), \
+ 0, CNTR_NORMAL, port_access_u64_csr)
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + SEND_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + SEND_COUNTER_ARRAY64), \
+ 0, flags, \
+ port_access_u64_csr)
+
+# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name,\
+ counter * 8 + SEND_COUNTER_ARRAY64, \
+ 0, \
+ flags, \
+ dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + CCE_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+ counter, \
+ 0, \
+ flags, \
+ dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+ counter, \
+ 0, \
+ flags, \
+ dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+ 0, \
+ 0, \
+ CNTR_SYNTH, \
+ access_ibp_##cntr)
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+ u64 val;
+
+ if (dd->flags & HFI1_PRESENT) {
+ val = readq((void __iomem *)dd->kregbase + offset);
+ return val;
+ }
+ return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+ if (dd->flags & HFI1_PRESENT)
+ writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(
+ struct hfi1_devdata *dd,
+ u32 offset)
+{
+ return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+ int mode, u64 value)
+{
+ u64 ret;
+
+
+ if (mode == CNTR_MODE_R) {
+ ret = read_csr(dd, csr);
+ } else if (mode == CNTR_MODE_W) {
+ write_csr(dd, csr, value);
+ ret = value;
+ } else {
+ dd_dev_err(dd, "Invalid cntr register access mode");
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+ return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_csr(dd, entry->csr, mode, data);
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ u64 val = 0;
+ u64 csr = entry->csr;
+
+ if (entry->flags & CNTR_VL) {
+ if (vl == CNTR_INVALID_VL)
+ return 0;
+ csr += 8 * vl;
+ } else {
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ }
+
+ val = read_write_csr(dd, csr, mode, data);
+ return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+ u32 csr = entry->csr;
+ int ret = 0;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ if (mode == CNTR_MODE_R)
+ ret = read_lcb_csr(dd, csr, &data);
+ else if (mode == CNTR_MODE_W)
+ ret = write_lcb_csr(dd, csr, data);
+
+ if (ret) {
+ dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+ return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+ u64 val;
+ u64 csr = entry->csr;
+
+ if (entry->flags & CNTR_VL) {
+ if (vl == CNTR_INVALID_VL)
+ return 0;
+ csr += 8 * vl;
+ } else {
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ }
+ val = read_write_csr(ppd->dd, csr, mode, data);
+ return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+ u64 data)
+{
+ u64 ret;
+
+ if (mode == CNTR_MODE_R) {
+ ret = *cntr;
+ } else if (mode == CNTR_MODE_W) {
+ *cntr = data;
+ ret = data;
+ } else {
+ dd_dev_err(dd, "Invalid cntr sw access mode");
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+ return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+ mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+ mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+ int cpu;
+ u64 counter = 0;
+
+ for_each_possible_cpu(cpu)
+ counter += *per_cpu_ptr(cntr, cpu);
+ return counter;
+}
+
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+ u64 __percpu *cntr,
+ int vl, int mode, u64 data)
+{
+
+ u64 ret = 0;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ if (mode == CNTR_MODE_R) {
+ ret = get_all_cpu_total(cntr) - *z_val;
+ } else if (mode == CNTR_MODE_W) {
+ /* A write can only zero the counter */
+ if (data == 0)
+ *z_val = get_all_cpu_total(cntr);
+ else
+ dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+ } else {
+ dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+ return 0;
+ }
+
+ return ret;
+}
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+ mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+ mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->verbs_dev.n_kmem_wait;
+}
+
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \
+ void *context, int vl, int mode, u64 data) \
+{ \
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
+ return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr, \
+ ppd->ibport_data.cntr, vl, \
+ mode, data); \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry, \
+ void *context, int vl, int mode, u64 data) \
+{ \
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
+ \
+ if (vl != CNTR_INVALID_VL) \
+ return 0; \
+ \
+ return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr, \
+ mode, data); \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+ CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+ CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+ RCV_TID_FLOW_GEN_MISMATCH_CNT,
+ CNTR_NORMAL),
+[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL,
+ CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+ CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+ RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+ CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+ CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+ CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+ CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+ CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+ CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+ CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+ CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+ DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+ DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+ DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+ CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+ CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+ DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+ CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+ CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+ CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+ CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+ CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+ DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+ CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+ CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+ CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+ DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+ CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+ DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+ CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+ CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+ DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+ CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+ DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+ DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+ CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+ DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+ DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+ DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+ DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+ DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+ CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+ DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+ DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+ CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+ access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+ access_sw_pio_wait),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+ access_sw_kmem_wait),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+ CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+ CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+ CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_link_up_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+ CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+ access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+ access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+ access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision revision a0 */
+int is_a0(struct hfi1_devdata *dd)
+{
+ return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+ & CCE_REVISION_CHIP_REV_MINOR_MASK) == 0;
+}
+
+/* return true if this is chip revision revision a */
+int is_ax(struct hfi1_devdata *dd)
+{
+ u8 chip_rev_minor =
+ dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+ return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision revision b */
+int is_bx(struct hfi1_devdata *dd)
+{
+ u8 chip_rev_minor =
+ dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+ return !!(chip_rev_minor & 0x10);
+}
+
+/*
+ * Append string s to buffer buf. Arguments curp and len are the current
+ * position and remaining length, respectively.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+ char *p = *curp;
+ int len = *lenp;
+ int result = 0; /* success */
+ char c;
+
+ /* add a comma, if first in the buffer */
+ if (p != buf) {
+ if (len == 0) {
+ result = 1; /* out of room */
+ goto done;
+ }
+ *p++ = ',';
+ len--;
+ }
+
+ /* copy the string */
+ while ((c = *s++) != 0) {
+ if (len == 0) {
+ result = 1; /* out of room */
+ goto done;
+ }
+ *p++ = c;
+ len--;
+ }
+
+done:
+ /* write return values */
+ *curp = p;
+ *lenp = len;
+
+ return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer. End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+ struct flag_table *table, int table_size)
+{
+ char extra[32];
+ char *p = buf;
+ int len = buf_len;
+ int no_room = 0;
+ int i;
+
+ /* make sure there is at least 2 so we can form "*" */
+ if (len < 2)
+ return "";
+
+ len--; /* leave room for a nul */
+ for (i = 0; i < table_size; i++) {
+ if (flags & table[i].flag) {
+ no_room = append_str(buf, &p, &len, table[i].str);
+ if (no_room)
+ break;
+ flags &= ~table[i].flag;
+ }
+ }
+
+ /* any undocumented bits left? */
+ if (!no_room && flags) {
+ snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+ no_room = append_str(buf, &p, &len, extra);
+ }
+
+ /* add * if ran out of room */
+ if (no_room) {
+ /* may need to back up to add space for a '*' */
+ if (len == 0)
+ --p;
+ *p++ = '*';
+ }
+
+ /* add final nul - space already allocated above */
+ *p = 0;
+ return buf;
+}
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+ "CceErrInt", /* 0 */
+ "RxeErrInt", /* 1 */
+ "MiscErrInt", /* 2 */
+ "Reserved3", /* 3 */
+ "PioErrInt", /* 4 */
+ "SDmaErrInt", /* 5 */
+ "EgressErrInt", /* 6 */
+ "TxeErrInt" /* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ if (source < ARRAY_SIZE(cce_misc_names))
+ strncpy(buf, cce_misc_names[source], bsize);
+ else
+ snprintf(buf,
+ bsize,
+ "Reserved%u",
+ source + IS_GENERAL_ERR_START);
+
+ return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+ return buf;
+}
+
+static const char * const various_names[] = {
+ "PbcInt",
+ "GpioAssertInt",
+ "Qsfp1Int",
+ "Qsfp2Int",
+ "TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+ if (source < ARRAY_SIZE(various_names))
+ strncpy(buf, various_names[source], bsize);
+ else
+ snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
+ return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+ static const char * const dc_int_names[] = {
+ "common",
+ "lcb",
+ "8051",
+ "lbm" /* local block merge */
+ };
+
+ if (source < ARRAY_SIZE(dc_int_names))
+ snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+ else
+ snprintf(buf, bsize, "DCInt%u", source);
+ return buf;
+}
+
+static const char * const sdma_int_names[] = {
+ "SDmaInt",
+ "SdmaIdleInt",
+ "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+ /* what interrupt */
+ unsigned int what = source / TXE_NUM_SDMA_ENGINES;
+ /* which engine */
+ unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+ if (likely(what < 3))
+ snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+ else
+ snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+ return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "RcvAvailInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "RcvUrgentInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SendCreditInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+ return buf;
+}
+
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, misc_err_status_flags,
+ ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ sdma_err_status_flags,
+ ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ send_err_status_flags,
+ ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ /*
+ * For most these errors, there is nothing that can be done except
+ * report or record it.
+ */
+ dd_dev_info(dd, "CCE Error: %s\n",
+ cce_err_status_string(buf, sizeof(buf), reg));
+
+ if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK)
+ && is_a0(dd)
+ && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+ /* this error requires a manual drop into SPC freeze mode */
+ /* then a fix up */
+ start_freeze_handling(dd->pport, FREEZE_SELF);
+ }
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+ struct hfi1_pportdata *ppd = dd->pport;
+ u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+ if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+ ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+ dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+ set_link_down_reason(ppd,
+ OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+ OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+ queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+ }
+ dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
+
+ mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+ init_timer(&dd->rcverr_timer);
+ dd->rcverr_timer.function = update_rcverr_timer;
+ dd->rcverr_timer.data = (unsigned long) dd;
+ /* Assume the hardware counter has been reset */
+ dd->rcv_ovfl_cnt = 0;
+ return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+ if (dd->rcverr_timer.data)
+ del_timer_sync(&dd->rcverr_timer);
+ dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "Receive Error: %s\n",
+ rxe_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_RXE_FREEZE_ERR) {
+ int flags = 0;
+
+ /*
+ * Freeze mode recovery is disabled for the errors
+ * in RXE_FREEZE_ABORT_MASK
+ */
+ if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+ flags = FREEZE_ABORT;
+
+ start_freeze_handling(dd->pport, flags);
+ }
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "Misc Error: %s",
+ misc_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "PIO Error: %s\n",
+ pio_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_PIO_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "SDMA Error: %s\n",
+ sdma_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_SDMA_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ if (ppd->port_xmit_discards < ~(u64)0)
+ ppd->port_xmit_discards++;
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+ u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+ char buf[96];
+
+ /* clear down all observed info as quickly as possible after read */
+ write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+ dd_dev_info(dd,
+ "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+ info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+ /* Eventually add other counters for each bit */
+
+ if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
+ if (ppd->port_xmit_discards < ~(u64)0)
+ ppd->port_xmit_discards++;
+ }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+ return (posn >= SEES(TX_LINKDOWN) &&
+ posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(u64 posn)
+{
+ return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+ posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ u64 reg_copy = reg, handled = 0;
+ char buf[96];
+
+ if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+ if (is_a0(dd) && (reg &
+ SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
+ && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+ start_freeze_handling(dd->pport, 0);
+
+ while (reg_copy) {
+ int posn = fls64(reg_copy);
+ /*
+ * fls64() returns a 1-based offset, but we generally
+ * want 0-based offsets.
+ */
+ int shift = posn - 1;
+
+ if (port_inactive_err(shift)) {
+ count_port_inactive(dd);
+ handled |= (1ULL << shift);
+ } else if (disallowed_pkt_err(shift)) {
+ handle_send_egress_err_info(dd);
+ handled |= (1ULL << shift);
+ }
+ clear_bit(shift, (unsigned long *)&reg_copy);
+ }
+
+ reg &= ~handled;
+
+ if (reg)
+ dd_dev_info(dd, "Egress Error: %s\n",
+ egress_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "Send Error: %s\n",
+ send_err_status_string(buf, sizeof(buf), reg));
+
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error. This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register. All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+ u32 context,
+ const struct err_reg_info *eri)
+{
+ u64 reg;
+ u32 count;
+
+ /* read in a loop until no more errors are seen */
+ count = 0;
+ while (1) {
+ reg = read_kctxt_csr(dd, context, eri->status);
+ if (reg == 0)
+ break;
+ write_kctxt_csr(dd, context, eri->clear, reg);
+ if (likely(eri->handler))
+ eri->handler(dd, context, reg);
+ count++;
+ if (count > MAX_CLEAR_COUNT) {
+ u64 mask;
+
+ dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+ eri->desc, reg);
+ /*
+ * Read-modify-write so any other masked bits
+ * remain masked.
+ */
+ mask = read_kctxt_csr(dd, context, eri->mask);
+ mask &= ~reg;
+ write_kctxt_csr(dd, context, eri->mask, mask);
+ break;
+ }
+ }
+}
+
+/*
+ * CCE block "misc" interrupt. Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &misc_errs[source];
+
+ if (eri->handler) {
+ interrupt_clear_down(dd, 0, eri);
+ } else {
+ dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+ source);
+ }
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt. Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt. The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+ unsigned int hw_context)
+{
+ struct send_context_info *sci;
+ struct send_context *sc;
+ char flags[96];
+ u64 status;
+ u32 sw_index;
+
+ sw_index = dd->hw_to_sw[hw_context];
+ if (sw_index >= dd->num_send_contexts) {
+ dd_dev_err(dd,
+ "out of range sw index %u for send context %u\n",
+ sw_index, hw_context);
+ return;
+ }
+ sci = &dd->send_contexts[sw_index];
+ sc = sci->sc;
+ if (!sc) {
+ dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+ sw_index, hw_context);
+ return;
+ }
+
+ /* tell the software that a halt has begun */
+ sc_stop(sc, SCF_HALTED);
+
+ status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+ dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+ send_context_err_status_string(flags, sizeof(flags), status));
+
+ if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+ handle_send_egress_err_info(dd);
+
+ /*
+ * Automatically restart halted kernel contexts out of interrupt
+ * context. User contexts must ask the driver to restart the context.
+ */
+ if (sc->type != SC_USER)
+ queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+ unsigned int source, u64 status)
+{
+ struct sdma_engine *sde;
+
+ sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+ sde->this_idx, source, (unsigned long long)status);
+#endif
+ sdma_engine_error(sde, status);
+}
+
+/*
+ * CCE block SDMA error interrupt. Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+ struct sdma_engine *sde = &dd->per_sdma[source];
+
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+ source);
+ sdma_dumpstate(sde);
+#endif
+ interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt. Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &various_err[source];
+
+ /*
+ * TCritInt cannot go through interrupt_clear_down()
+ * because it is not a second tier interrupt. The handler
+ * should be called directly.
+ */
+ if (source == TCRIT_INT_SOURCE)
+ handle_temp_err(dd);
+ else if (eri->handler)
+ interrupt_clear_down(dd, 0, eri);
+ else
+ dd_dev_info(dd,
+ "%s: Unimplemented/reserved interrupt %d\n",
+ __func__, source);
+}
+
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+ /* source is always zero */
+ struct hfi1_pportdata *ppd = dd->pport;
+ unsigned long flags;
+ u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+ if (reg & QSFP_HFI0_MODPRST_N) {
+
+ dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
+ __func__);
+
+ if (!qsfp_mod_present(ppd)) {
+ ppd->driver_link_ready = 0;
+ /*
+ * Cable removed, reset all our information about the
+ * cache and cable capabilities
+ */
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ /*
+ * We don't set cache_refresh_required here as we expect
+ * an interrupt when a cable is inserted
+ */
+ ppd->qsfp_info.cache_valid = 0;
+ ppd->qsfp_info.qsfp_interrupt_functional = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+ write_csr(dd,
+ dd->hfi1_id ?
+ ASIC_QSFP2_INVERT :
+ ASIC_QSFP1_INVERT,
+ qsfp_int_mgmt);
+ if (ppd->host_link_state == HLS_DN_POLL) {
+ /*
+ * The link is still in POLL. This means
+ * that the normal link down processing
+ * will not happen. We have to do it here
+ * before turning the DC off.
+ */
+ queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+ }
+ } else {
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.cache_valid = 0;
+ ppd->qsfp_info.cache_refresh_required = 1;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+
+ qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+ write_csr(dd,
+ dd->hfi1_id ?
+ ASIC_QSFP2_INVERT :
+ ASIC_QSFP1_INVERT,
+ qsfp_int_mgmt);
+ }
+ }
+
+ if (reg & QSFP_HFI0_INT_N) {
+
+ dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
+ __func__);
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.check_interrupt_flags = 1;
+ ppd->qsfp_info.qsfp_interrupt_functional = 1;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+ }
+
+ /* Schedule the QSFP work only if there is a cable attached. */
+ if (qsfp_mod_present(ppd))
+ queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_MISC,
+ (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+ NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "%s: command failed with error %d\n",
+ __func__, ret);
+ }
+ return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_MISC,
+ (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+ NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "%s: command failed with error %d\n",
+ __func__, ret);
+ }
+ return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access. The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+ write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+ DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
+ | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access. The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+ write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+ DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051. If the host already has access,
+ * just increment a counter. Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ * 0 on success
+ * -EBUSY if the 8051 has control and cannot be disturbed
+ * -errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ int ret = 0;
+
+ /*
+ * Use the host link state lock so the operation of this routine
+ * { link state check, selector change, count increment } can occur
+ * as a unit against a link state change. Otherwise there is a
+ * race between the state change and the count increment.
+ */
+ if (sleep_ok) {
+ mutex_lock(&ppd->hls_lock);
+ } else {
+ while (!mutex_trylock(&ppd->hls_lock))
+ udelay(1);
+ }
+
+ /* this access is valid only when the link is up */
+ if ((ppd->host_link_state & HLS_UP) == 0) {
+ dd_dev_info(dd, "%s: link state %s not up\n",
+ __func__, link_state_name(ppd->host_link_state));
+ ret = -EBUSY;
+ goto done;
+ }
+
+ if (dd->lcb_access_count == 0) {
+ ret = request_host_lcb_access(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to acquire LCB access, err %d\n",
+ __func__, ret);
+ goto done;
+ }
+ set_host_lcb_access(dd);
+ }
+ dd->lcb_access_count++;
+done:
+ mutex_unlock(&ppd->hls_lock);
+ return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count. If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ * 0 on success
+ * -errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+ int ret = 0;
+
+ /*
+ * Use the host link state lock because the acquire needed it.
+ * Here, we only need to keep { selector change, count decrement }
+ * as a unit.
+ */
+ if (sleep_ok) {
+ mutex_lock(&dd->pport->hls_lock);
+ } else {
+ while (!mutex_trylock(&dd->pport->hls_lock))
+ udelay(1);
+ }
+
+ if (dd->lcb_access_count == 0) {
+ dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n",
+ __func__);
+ goto done;
+ }
+
+ if (dd->lcb_access_count == 1) {
+ set_8051_lcb_access(dd);
+ ret = request_8051_lcb_access(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to release LCB access, err %d\n",
+ __func__, ret);
+ /* restore host access if the grant didn't work */
+ set_host_lcb_access(dd);
+ goto done;
+ }
+ }
+ dd->lcb_access_count--;
+done:
+ mutex_unlock(&dd->pport->hls_lock);
+ return ret;
+}
+
+/*
+ * Initialize LCB access variables and state. Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host. The driver defaults to
+ * leaving access to the 8051. Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done. In particular, after
+ * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+ dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+ write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+ DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
+ | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
+ | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ u16 data;
+ u8 type;
+
+ reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+ if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+ return; /* no request */
+
+ /* zero out COMPLETED so the response is seen */
+ write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+ /* extract request details */
+ type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+ data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+ switch (type) {
+ case HREQ_LOAD_CONFIG:
+ case HREQ_SAVE_CONFIG:
+ case HREQ_READ_CONFIG:
+ case HREQ_SET_TX_EQ_ABS:
+ case HREQ_SET_TX_EQ_REL:
+ case HREQ_ENABLE:
+ dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+ type);
+ hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+ break;
+
+ case HREQ_CONFIG_DONE:
+ hreq_response(dd, HREQ_SUCCESS, 0);
+ break;
+
+ case HREQ_INTERFACE_TEST:
+ hreq_response(dd, HREQ_SUCCESS, data);
+ break;
+
+ default:
+ dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+ hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+ break;
+ }
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+ u8 vau, u16 total, u16 shared)
+{
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+ ((u64)total
+ << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+ | ((u64)shared
+ << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+ | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote. Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset .
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+ /* leave shared count at zero for both global and VL15 */
+ write_global_credit(dd, vau, vl15buf, 0);
+
+ /* We may need some credits for another VL when sending packets
+ * with the snoop interface. Dividing it down the middle for VL15
+ * and VL0 should suffice.
+ */
+ if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+ write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+ << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+ write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+ << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+ } else {
+ write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+ << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+ }
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* remove all previous VL credit limits */
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+ write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+ write_global_credit(dd, 0, 0, 0);
+ /* reset the CM block */
+ pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+ return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+ return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+ return 8 * (1 << vau);
+}
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+ ppd->sm_trap_qp = 0x0;
+ ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown. This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+ u64 reg;
+
+ /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+ write_csr(dd, DC_LCB_CFG_RUN, 0);
+ /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+ 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+ /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+ dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+ reg = read_csr(dd, DCC_CFG_RESET);
+ write_csr(dd, DCC_CFG_RESET,
+ reg
+ | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
+ | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+ (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+ if (!abort) {
+ udelay(1); /* must hold for the longer of 16cclks or 20ns */
+ write_csr(dd, DCC_CFG_RESET, reg);
+ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+ }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ if (dd->dc_shutdown) {
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ return;
+ }
+ dd->dc_shutdown = 1;
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ /* Shutdown the LCB */
+ lcb_shutdown(dd, 1);
+ /* Going to OFFLINE would have causes the 8051 to put the
+ * SerDes into reset already. Just need to shut down the 8051,
+ * itself. */
+ write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/* Calling this after the DC has been brought out of reset should not
+ * do any damage. */
+static void dc_start(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ if (!dd->dc_shutdown)
+ goto done;
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ /* Take the 8051 out of reset */
+ write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+ /* Wait until 8051 is ready */
+ ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+ if (ret) {
+ dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+ __func__);
+ }
+ /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+ write_csr(dd, DCC_CFG_RESET, 0x10);
+ /* lcb_shutdown() with abort=1 does not restore these */
+ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ dd->dc_shutdown = 0;
+done:
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+ u64 rx_radr, tx_radr;
+ u32 version;
+
+ if (dd->icode != ICODE_FPGA_EMULATION)
+ return;
+
+ /*
+ * These LCB defaults on emulator _s are good, nothing to do here:
+ * LCB_CFG_TX_FIFOS_RADR
+ * LCB_CFG_RX_FIFOS_RADR
+ * LCB_CFG_LN_DCLK
+ * LCB_CFG_IGNORE_LOST_RCLK
+ */
+ if (is_emulator_s(dd))
+ return;
+ /* else this is _p */
+
+ version = emulator_rev(dd);
+ if (!is_a0(dd))
+ version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+ if (version <= 0x12) {
+ /* release 0x12 and below */
+
+ /*
+ * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+ * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+ * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+ */
+ rx_radr =
+ 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ /*
+ * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+ * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+ */
+ tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version <= 0x18) {
+ /* release 0x13 up to 0x18 */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+ rx_radr =
+ 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version == 0x19) {
+ /* release 0x19 */
+ /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+ rx_radr =
+ 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version == 0x1a) {
+ /* release 0x1a */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+ rx_radr =
+ 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+ } else {
+ /* release 0x1b and higher */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+ rx_radr =
+ 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ }
+
+ write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+ /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+ write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+ DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ sma_message_work);
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 msg;
+ int ret;
+
+ /* msg is bytes 1-4 of the 40-bit idle message - the command code
+ is stripped off */
+ ret = read_idle_sma(dd, &msg);
+ if (ret)
+ return;
+ dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+ /*
+ * React to the SMA message. Byte[1] (0 for us) is the command.
+ */
+ switch (msg & 0xff) {
+ case SMA_IDLE_ARM:
+ /*
+ * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+ * State Transitions
+ *
+ * Only expected in INIT or ARMED, discard otherwise.
+ */
+ if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+ ppd->neighbor_normal = 1;
+ break;
+ case SMA_IDLE_ACTIVE:
+ /*
+ * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+ * State Transitions
+ *
+ * Can activate the node. Discard otherwise.
+ */
+ if (ppd->host_link_state == HLS_UP_ARMED
+ && ppd->is_active_optimize_enabled) {
+ ppd->neighbor_normal = 1;
+ ret = set_link_state(ppd, HLS_UP_ACTIVE);
+ if (ret)
+ dd_dev_err(
+ dd,
+ "%s: received Active SMA idle message, couldn't set link to Active\n",
+ __func__);
+ }
+ break;
+ default:
+ dd_dev_err(dd,
+ "%s: received unexpected SMA idle message 0x%llx\n",
+ __func__, msg);
+ break;
+ }
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+ u64 rcvctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+ rcvctrl = read_csr(dd, RCV_CTRL);
+ rcvctrl |= add;
+ rcvctrl &= ~clear;
+ write_csr(dd, RCV_CTRL, rcvctrl);
+ spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+ adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+ adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct send_context *sc;
+ int i;
+
+ if (flags & FREEZE_SELF)
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+ /* enter frozen mode */
+ dd->flags |= HFI1_FROZEN;
+
+ /* notify all SDMA engines that they are going into a freeze */
+ sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+ /* do halt pre-handling on all enabled send contexts */
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ if (sc && (sc->flags & SCF_ENABLED))
+ sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+ }
+
+ /* Send context are frozen. Notify user space */
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+ if (flags & FREEZE_ABORT) {
+ dd_dev_err(dd,
+ "Aborted freeze recovery. Please REBOOT system\n");
+ return;
+ }
+ /* queue non-interrupt handler */
+ queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, CCE_STATUS);
+ if (freeze) {
+ /* waiting until all indicators are set */
+ if ((reg & ALL_FROZE) == ALL_FROZE)
+ return; /* all done */
+ } else {
+ /* waiting until all indicators are clear */
+ if ((reg & ALL_FROZE) == 0)
+ return; /* all done */
+ }
+
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+ freeze ? "" : "un",
+ reg & ALL_FROZE,
+ freeze ? ALL_FROZE : 0ull);
+ return;
+ }
+ usleep_range(80, 120);
+ }
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* disable port */
+ clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ /* disable all receive contexts */
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port. User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ *
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* enable all kernel contexts */
+ for (i = 0; i < dd->n_krcv_queues; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
+
+ /* enable port */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ freeze_work);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /* wait for freeze indicators on all affected blocks */
+ dd_dev_info(dd, "Entering SPC freeze\n");
+ wait_for_freeze_status(dd, 1);
+
+ /* SPC is now frozen */
+
+ /* do send PIO freeze steps */
+ pio_freeze(dd);
+
+ /* do send DMA freeze steps */
+ sdma_freeze(dd);
+
+ /* do send egress freeze steps - nothing to do */
+
+ /* do receive freeze steps */
+ rxe_freeze(dd);
+
+ /*
+ * Unfreeze the hardware - clear the freeze, wait for each
+ * block's frozen bit to clear, then clear the frozen flag.
+ */
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ wait_for_freeze_status(dd, 0);
+
+ if (is_a0(dd)) {
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+ wait_for_freeze_status(dd, 1);
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ wait_for_freeze_status(dd, 0);
+ }
+
+ /* do send PIO unfreeze steps for kernel contexts */
+ pio_kernel_unfreeze(dd);
+
+ /* do send DMA unfreeze steps */
+ sdma_unfreeze(dd);
+
+ /* do send egress unfreeze steps - nothing to do */
+
+ /* do receive unfreeze steps for kernel contexts */
+ rxe_kernel_unfreeze(dd);
+
+ /*
+ * The unfreeze procedure touches global device registers when
+ * it disables and re-enables RXE. Mark the device unfrozen
+ * after all that is done so other parts of the driver waiting
+ * for the device to unfreeze don't do things out of order.
+ *
+ * The above implies that the meaning of HFI1_FROZEN flag is
+ * "Device has gone into freeze mode and freeze mode handling
+ * is still in progress."
+ *
+ * The flag will be removed when freeze mode processing has
+ * completed.
+ */
+ dd->flags &= ~HFI1_FROZEN;
+ wake_up(&dd->event_queue);
+
+ /* no longer frozen */
+ dd_dev_err(dd, "Exiting SPC freeze\n");
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_up_work);
+ set_link_state(ppd, HLS_UP_INIT);
+
+ /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+ read_ltp_rtt(ppd->dd);
+ /*
+ * OPA specifies that certain counters are cleared on a transition
+ * to link up, so do that.
+ */
+ clear_linkup_counters(ppd->dd);
+ /*
+ * And (re)set link up default values.
+ */
+ set_linkup_defaults(ppd);
+
+ /* enforce link speed enabled */
+ if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+ /* oops - current speed is not enabled, bounce */
+ dd_dev_err(ppd->dd,
+ "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+ ppd->link_speed_active, ppd->link_speed_enabled);
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+ OPA_LINKDOWN_REASON_SPEED_POLICY);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ start_link(ppd);
+ }
+}
+
+/* Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+ ppd->neighbor_guid = 0;
+ ppd->neighbor_port_number = 0;
+ ppd->neighbor_type = 0;
+ ppd->neighbor_fm_security = 0;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+ u8 lcl_reason, neigh_reason = 0;
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_down_work);
+
+ /* go offline first, then deal with reasons */
+ set_link_state(ppd, HLS_DN_OFFLINE);
+
+ lcl_reason = 0;
+ read_planned_down_reason_code(ppd->dd, &neigh_reason);
+
+ /*
+ * If no reason, assume peer-initiated but missed
+ * LinkGoingDown idle flits.
+ */
+ if (neigh_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+
+ set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+ reset_neighbor_info(ppd);
+
+ /* disable the port */
+ clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ /* If there is no cable attached, turn the DC off. Otherwise,
+ * start the link bring up. */
+ if (!qsfp_mod_present(ppd))
+ dc_shutdown(ppd->dd);
+ else
+ start_link(ppd);
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_bounce_work);
+
+ /*
+ * Only do something if the link is currently up.
+ */
+ if (ppd->host_link_state & HLS_UP) {
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ start_link(ppd);
+ } else {
+ dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+ __func__, link_state_name(ppd->host_link_state));
+ }
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP. The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+ int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+ if (cap & CAP_CRC_14B)
+ port_ltp |= PORT_LTP_CRC_MODE_14;
+ if (cap & CAP_CRC_48B)
+ port_ltp |= PORT_LTP_CRC_MODE_48;
+ if (cap & CAP_CRC_12B_16B_PER_LANE)
+ port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+ return port_ltp;
+}
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+ int cap_mask = 0;
+
+ if (port_ltp & PORT_LTP_CRC_MODE_14)
+ cap_mask |= CAP_CRC_14B;
+ if (port_ltp & PORT_LTP_CRC_MODE_48)
+ cap_mask |= CAP_CRC_48B;
+ if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+ cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+ return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+ int port_ltp = 0;
+
+ if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+ port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+ else if (lcb_crc == LCB_CRC_48B)
+ port_ltp = PORT_LTP_CRC_MODE_48;
+ else if (lcb_crc == LCB_CRC_14B)
+ port_ltp = PORT_LTP_CRC_MODE_14;
+ else
+ port_ltp = PORT_LTP_CRC_MODE_16;
+
+ return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in the second
+ * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /* Sanity check - ppd->pkeys[2] should be 0 */
+ if (ppd->pkeys[2] != 0)
+ dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+ __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+ ppd->pkeys[2] = FULL_MGMT_P_KEY;
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+ switch (width) {
+ case 0:
+ /*
+ * Simulator and quick linkup do not set the width.
+ * Just set it to 4x without complaint.
+ */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+ return OPA_LINK_WIDTH_4X;
+ return 0; /* no lanes up */
+ case 1: return OPA_LINK_WIDTH_1X;
+ case 2: return OPA_LINK_WIDTH_2X;
+ case 3: return OPA_LINK_WIDTH_3X;
+ default:
+ dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+ __func__, width);
+ /* fall through */
+ case 4: return OPA_LINK_WIDTH_4X;
+ }
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+static inline u8 nibble_to_count(u8 nibble)
+{
+ return bit_counts[nibble & 0xf];
+}
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ * enable_lane_tx
+ * enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+ u16 *rx_width)
+{
+ u16 tx, rx;
+ u8 enable_lane_rx;
+ u8 enable_lane_tx;
+ u8 tx_polarity_inversion;
+ u8 rx_polarity_inversion;
+ u8 max_rate;
+
+ /* read the active lanes */
+ read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+ &rx_polarity_inversion, &max_rate);
+ read_local_lni(dd, &enable_lane_rx);
+
+ /* convert to counts */
+ tx = nibble_to_count(enable_lane_tx);
+ rx = nibble_to_count(enable_lane_rx);
+
+ /*
+ * Set link_speed_active here, overriding what was set in
+ * handle_verify_cap(). The ASIC 8051 firmware does not correctly
+ * set the max_rate field in handle_verify_cap until v0.19.
+ */
+ if ((dd->icode == ICODE_RTL_SILICON)
+ && (dd->dc8051_ver < dc8051_ver(0, 19))) {
+ /* max_rate: 0 = 12.5G, 1 = 25G */
+ switch (max_rate) {
+ case 0:
+ dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+ break;
+ default:
+ dd_dev_err(dd,
+ "%s: unexpected max rate %d, using 25Gb\n",
+ __func__, (int)max_rate);
+ /* fall through */
+ case 1:
+ dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+ break;
+ }
+ }
+
+ dd_dev_info(dd,
+ "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+ enable_lane_tx, tx, enable_lane_rx, rx);
+ *tx_width = link_width_to_bits(dd, tx);
+ *rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp. Does not change
+ * after link up. I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ * + bits [7:4] contain the number of active transmitters
+ * + bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+ u16 *rx_width)
+{
+ u16 widths, tx, rx;
+ u8 misc_bits, local_flags;
+ u16 active_tx, active_rx;
+
+ read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+ tx = widths >> 12;
+ rx = (widths >> 8) & 0xf;
+
+ *tx_width = link_width_to_bits(dd, tx);
+ *rx_width = link_width_to_bits(dd, rx);
+
+ /* print the active widths */
+ get_link_widths(dd, &active_tx, &active_rx);
+}
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+ u16 tx_width, rx_width;
+
+ /* get end-of-LNI link widths */
+ get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+ /* use tx_width as the link is supposed to be symmetric on link up */
+ ppd->link_width_active = tx_width;
+ /* link width downgrade active (LWD.A) starts out matching LW.A */
+ ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+ ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+ /* per OPA spec, on link up LWD.E resets to LWD.S */
+ ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+ /* cache the active egress rate (units {10^6 bits/sec]) */
+ ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_vc_work);
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+ u8 power_management;
+ u8 continious;
+ u8 vcu;
+ u8 vau;
+ u8 z;
+ u16 vl15buf;
+ u16 link_widths;
+ u16 crc_mask;
+ u16 crc_val;
+ u16 device_id;
+ u16 active_tx, active_rx;
+ u8 partner_supported_crc;
+ u8 remote_tx_rate;
+ u8 device_rev;
+
+ set_link_state(ppd, HLS_VERIFY_CAP);
+
+ lcb_shutdown(dd, 0);
+ adjust_lcb_for_fpga_serdes(dd);
+
+ /*
+ * These are now valid:
+ * remote VerifyCap fields in the general LNI config
+ * CSR DC8051_STS_REMOTE_GUID
+ * CSR DC8051_STS_REMOTE_NODE_TYPE
+ * CSR DC8051_STS_REMOTE_FM_SECURITY
+ * CSR DC8051_STS_REMOTE_PORT_NO
+ */
+
+ read_vc_remote_phy(dd, &power_management, &continious);
+ read_vc_remote_fabric(
+ dd,
+ &vau,
+ &z,
+ &vcu,
+ &vl15buf,
+ &partner_supported_crc);
+ read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+ read_remote_device_id(dd, &device_id, &device_rev);
+ /*
+ * And the 'MgmtAllowed' information, which is exchanged during
+ * LNI, is also be available at this point.
+ */
+ read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+ /* print the active widths */
+ get_link_widths(dd, &active_tx, &active_rx);
+ dd_dev_info(dd,
+ "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+ (int)power_management, (int)continious);
+ dd_dev_info(dd,
+ "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+ (int)vau,
+ (int)z,
+ (int)vcu,
+ (int)vl15buf,
+ (int)partner_supported_crc);
+ dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+ (u32)remote_tx_rate, (u32)link_widths);
+ dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+ (u32)device_id, (u32)device_rev);
+ /*
+ * The peer vAU value just read is the peer receiver value. HFI does
+ * not support a transmit vAU of 0 (AU == 8). We advertised that
+ * with Z=1 in the fabric capabilities sent to the peer. The peer
+ * will see our Z=1, and, if it advertised a vAU of 0, will move its
+ * receive to vAU of 1 (AU == 16). Do the same here. We do not care
+ * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+ * subject to the Z value exception.
+ */
+ if (vau == 0)
+ vau = 1;
+ set_up_vl15(dd, vau, vl15buf);
+
+ /* set up the LCB CRC mode */
+ crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+ /* order is important: use the lowest bit in common */
+ if (crc_mask & CAP_CRC_14B)
+ crc_val = LCB_CRC_14B;
+ else if (crc_mask & CAP_CRC_48B)
+ crc_val = LCB_CRC_48B;
+ else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+ crc_val = LCB_CRC_12B_16B_PER_LANE;
+ else
+ crc_val = LCB_CRC_16B;
+
+ dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+ write_csr(dd, DC_LCB_CFG_CRC_MODE,
+ (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+ /* set (14b only) or clear sideband credit */
+ reg = read_csr(dd, SEND_CM_CTRL);
+ if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+ write_csr(dd, SEND_CM_CTRL,
+ reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+ } else {
+ write_csr(dd, SEND_CM_CTRL,
+ reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+ }
+
+ ppd->link_speed_active = 0; /* invalid value */
+ if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+ /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+ switch (remote_tx_rate) {
+ case 0:
+ ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+ break;
+ case 1:
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ break;
+ }
+ } else {
+ /* actual rate is highest bit of the ANDed rates */
+ u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+ if (rate & 2)
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ else if (rate & 1)
+ ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+ }
+ if (ppd->link_speed_active == 0) {
+ dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+ __func__, (int)remote_tx_rate);
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ }
+
+ /*
+ * Cache the values of the supported, enabled, and active
+ * LTP CRC modes to return in 'portinfo' queries. But the bit
+ * flags that are returned in the portinfo query differ from
+ * what's in the link_crc_mask, crc_sizes, and crc_val
+ * variables. Convert these here.
+ */
+ ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+ /* supported crc modes */
+ ppd->port_ltp_crc_mode |=
+ cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+ /* enabled crc modes */
+ ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+ /* active crc mode */
+
+ /* set up the remote credit return table */
+ assign_remote_cm_au_table(dd, vcu);
+
+ /*
+ * The LCB is reset on entry to handle_verify_cap(), so this must
+ * be applied on every link up.
+ *
+ * Adjust LCB error kill enable to kill the link if
+ * these RBUF errors are seen:
+ * REPLAY_BUF_MBE_SMASK
+ * FLIT_INPUT_BUF_MBE_SMASK
+ */
+ if (is_a0(dd)) { /* fixed in B0 */
+ reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+ reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+ | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+ write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+ }
+
+ /* pull LCB fifos out of reset - all fifo clocks must be stable */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+ /* give 8051 access to the LCB CSRs */
+ write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+ set_8051_lcb_access(dd);
+
+ ppd->neighbor_guid =
+ read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+ ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+ DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+ ppd->neighbor_type =
+ read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+ DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+ ppd->neighbor_fm_security =
+ read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+ DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+ dd_dev_info(dd,
+ "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+ ppd->neighbor_guid, ppd->neighbor_type,
+ ppd->mgmt_allowed, ppd->neighbor_fm_security);
+ if (ppd->mgmt_allowed)
+ add_full_mgmt_pkey(ppd);
+
+ /* tell the 8051 to go to LinkUp */
+ set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+ int skip = 1;
+ int do_bounce = 0;
+ u16 lwde = ppd->link_width_downgrade_enabled;
+ u16 tx, rx;
+
+ mutex_lock(&ppd->hls_lock);
+ /* only apply if the link is up */
+ if (ppd->host_link_state & HLS_UP)
+ skip = 0;
+ mutex_unlock(&ppd->hls_lock);
+ if (skip)
+ return;
+
+ if (refresh_widths) {
+ get_link_widths(ppd->dd, &tx, &rx);
+ ppd->link_width_downgrade_tx_active = tx;
+ ppd->link_width_downgrade_rx_active = rx;
+ }
+
+ if (lwde == 0) {
+ /* downgrade is disabled */
+
+ /* bounce if not at starting active width */
+ if ((ppd->link_width_active !=
+ ppd->link_width_downgrade_tx_active)
+ || (ppd->link_width_active !=
+ ppd->link_width_downgrade_rx_active)) {
+ dd_dev_err(ppd->dd,
+ "Link downgrade is disabled and link has downgraded, downing link\n");
+ dd_dev_err(ppd->dd,
+ " original 0x%x, tx active 0x%x, rx active 0x%x\n",
+ ppd->link_width_active,
+ ppd->link_width_downgrade_tx_active,
+ ppd->link_width_downgrade_rx_active);
+ do_bounce = 1;
+ }
+ } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
+ || (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+ /* Tx or Rx is outside the enabled policy */
+ dd_dev_err(ppd->dd,
+ "Link is outside of downgrade allowed, downing link\n");
+ dd_dev_err(ppd->dd,
+ " enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+ lwde,
+ ppd->link_width_downgrade_tx_active,
+ ppd->link_width_downgrade_rx_active);
+ do_bounce = 1;
+ }
+
+ if (do_bounce) {
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+ OPA_LINKDOWN_REASON_WIDTH_POLICY);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ start_link(ppd);
+ }
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_downgrade_work);
+
+ dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+ apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dcc_err_flags,
+ ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, lcb_err_flags,
+ ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_err_flags,
+ ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+ ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+ ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ u64 info, err, host_msg;
+ int queue_link_down = 0;
+ char buf[96];
+
+ /* look at the flags */
+ if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+ /* 8051 information set by firmware */
+ /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+ info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+ err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+ & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+ host_msg = (info >>
+ DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+ & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+ /*
+ * Handle error flags.
+ */
+ if (err & FAILED_LNI) {
+ /*
+ * LNI error indications are cleared by the 8051
+ * only when starting polling. Only pay attention
+ * to them when in the states that occur during
+ * LNI.
+ */
+ if (ppd->host_link_state
+ & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+ queue_link_down = 1;
+ dd_dev_info(dd, "Link error: %s\n",
+ dc8051_info_err_string(buf,
+ sizeof(buf),
+ err & FAILED_LNI));
+ }
+ err &= ~(u64)FAILED_LNI;
+ }
+ if (err) {
+ /* report remaining errors, but do not do anything */
+ dd_dev_err(dd, "8051 info error: %s\n",
+ dc8051_info_err_string(buf, sizeof(buf), err));
+ }
+
+ /*
+ * Handle host message flags.
+ */
+ if (host_msg & HOST_REQ_DONE) {
+ /*
+ * Presently, the driver does a busy wait for
+ * host requests to complete. This is only an
+ * informational message.
+ * NOTE: The 8051 clears the host message
+ * information *on the next 8051 command*.
+ * Therefore, when linkup is achieved,
+ * this flag will still be set.
+ */
+ host_msg &= ~(u64)HOST_REQ_DONE;
+ }
+ if (host_msg & BC_SMA_MSG) {
+ queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+ host_msg &= ~(u64)BC_SMA_MSG;
+ }
+ if (host_msg & LINKUP_ACHIEVED) {
+ dd_dev_info(dd, "8051: Link up\n");
+ queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+ host_msg &= ~(u64)LINKUP_ACHIEVED;
+ }
+ if (host_msg & EXT_DEVICE_CFG_REQ) {
+ handle_8051_request(dd);
+ host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+ }
+ if (host_msg & VERIFY_CAP_FRAME) {
+ queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+ host_msg &= ~(u64)VERIFY_CAP_FRAME;
+ }
+ if (host_msg & LINK_GOING_DOWN) {
+ const char *extra = "";
+ /* no downgrade action needed if going down */
+ if (host_msg & LINK_WIDTH_DOWNGRADED) {
+ host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+ extra = " (ignoring downgrade)";
+ }
+ dd_dev_info(dd, "8051: Link down%s\n", extra);
+ queue_link_down = 1;
+ host_msg &= ~(u64)LINK_GOING_DOWN;
+ }
+ if (host_msg & LINK_WIDTH_DOWNGRADED) {
+ queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+ host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+ }
+ if (host_msg) {
+ /* report remaining messages, but do not do anything */
+ dd_dev_info(dd, "8051 info host message: %s\n",
+ dc8051_info_host_msg_string(buf, sizeof(buf),
+ host_msg));
+ }
+
+ reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+ }
+ if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+ /*
+ * Lost the 8051 heartbeat. If this happens, we
+ * receive constant interrupts about it. Disable
+ * the interrupt after the first.
+ */
+ dd_dev_err(dd, "Lost 8051 heartbeat\n");
+ write_csr(dd, DC_DC8051_ERR_EN,
+ read_csr(dd, DC_DC8051_ERR_EN)
+ & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+ reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+ }
+ if (reg) {
+ /* report the error, but do not do anything */
+ dd_dev_err(dd, "8051 error: %s\n",
+ dc8051_err_string(buf, sizeof(buf), reg));
+ }
+
+ if (queue_link_down) {
+ /* if the link is already going down or disabled, do not
+ * queue another */
+ if ((ppd->host_link_state
+ & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
+ || ppd->link_enabled == 0) {
+ dd_dev_info(dd, "%s: not queuing link down\n",
+ __func__);
+ } else {
+ queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+ }
+ }
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+ "BadHeadDist: Distance violation between two head flits",
+[1] =
+ "BadTailDist: Distance violation between two tail flits",
+[2] =
+ "BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+ "BadCrdAck: Credits return for unsupported VL",
+[4] =
+ "UnsupportedVLMarker: Received VL Marker",
+[5] =
+ "BadPreempt: Exceeded the preemption nesting level",
+[6] =
+ "BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+ "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+ "BadPktLen: Illegal PktLen",
+[2] =
+ "PktLenTooLong: Packet longer than PktLen",
+[3] =
+ "PktLenTooShort: Packet shorter than PktLen",
+[4] =
+ "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+ "BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+ "BadL2: Illegal L2 opcode",
+[7] =
+ "BadSC: Unsupported SC",
+[9] =
+ "BadRC: Illegal RC",
+[11] =
+ "PreemptError: Preempting with same VL",
+[12] =
+ "PreemptVL15: Preempting a VL15 packet",
+};
+
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ u64 info, hdr0, hdr1;
+ const char *extra;
+ char buf[96];
+ struct hfi1_pportdata *ppd = dd->pport;
+ u8 lcl_reason = 0;
+ int do_bounce = 0;
+
+ if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+ if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+ info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+ dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+ }
+ reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+ struct hfi1_pportdata *ppd = dd->pport;
+ /* this counter saturates at (2^32) - 1 */
+ if (ppd->link_downed < (u32)UINT_MAX)
+ ppd->link_downed++;
+ reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+ u8 reason_valid = 1;
+
+ info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+ if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+ dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+ }
+ switch (info) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ extra = fm_config_txt[info];
+ break;
+ case 8:
+ extra = fm_config_txt[info];
+ if (ppd->port_error_action &
+ OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+ do_bounce = 1;
+ /*
+ * lcl_reason cannot be derived from info
+ * for this error
+ */
+ lcl_reason =
+ OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+ }
+ break;
+ default:
+ reason_valid = 0;
+ snprintf(buf, sizeof(buf), "reserved%lld", info);
+ extra = buf;
+ break;
+ }
+
+ if (reason_valid && !do_bounce) {
+ do_bounce = ppd->port_error_action &
+ (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+ lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+ }
+
+ /* just report this */
+ dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+ reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+ u8 reason_valid = 1;
+
+ info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+ hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+ hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+ if (!(dd->err_info_rcvport.status_and_code &
+ OPA_EI_STATUS_SMASK)) {
+ dd->err_info_rcvport.status_and_code =
+ info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_rcvport.status_and_code |=
+ OPA_EI_STATUS_SMASK;
+ /* save first 2 flits in the packet that caused
+ * the error */
+ dd->err_info_rcvport.packet_flit1 = hdr0;
+ dd->err_info_rcvport.packet_flit2 = hdr1;
+ }
+ switch (info) {
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ case 9:
+ case 11:
+ case 12:
+ extra = port_rcv_txt[info];
+ break;
+ default:
+ reason_valid = 0;
+ snprintf(buf, sizeof(buf), "reserved%lld", info);
+ extra = buf;
+ break;
+ }
+
+ if (reason_valid && !do_bounce) {
+ do_bounce = ppd->port_error_action &
+ (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+ lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+ }
+
+ /* just report this */
+ dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+ dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n",
+ hdr0, hdr1);
+
+ reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+ /* informative only */
+ dd_dev_info(dd, "8051 access to LCB blocked\n");
+ reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+ }
+ if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+ /* informative only */
+ dd_dev_info(dd, "host access to LCB blocked\n");
+ reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+ }
+
+ /* report any remaining errors */
+ if (reg)
+ dd_dev_info(dd, "DCC Error: %s\n",
+ dcc_err_string(buf, sizeof(buf), reg));
+
+ if (lcl_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+ if (do_bounce) {
+ dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+ set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+ queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+ }
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "LCB Error: %s\n",
+ lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt. Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &dc_errs[source];
+
+ if (eri->handler) {
+ interrupt_clear_down(dd, 0, eri);
+ } else if (source == 3 /* dc_lbm_int */) {
+ /*
+ * This indicates that a parity error has occurred on the
+ * address/control lines presented to the LBM. The error
+ * is a single pulse, there is no associated error flag,
+ * and it is non-maskable. This is because if a parity
+ * error occurs on the request the request is dropped.
+ * This should never occur, but it is nice to know if it
+ * ever does.
+ */
+ dd_dev_err(dd, "Parity error in DC LBM block\n");
+ } else {
+ dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+ }
+}
+
+/*
+ * TX block send credit interrupt. Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt. Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ * 0 - N-1 = SDma
+ * N - 2N-1 = SDmaProgress
+ * 2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ /* what interrupt */
+ unsigned int what = source / TXE_NUM_SDMA_ENGINES;
+ /* which engine */
+ unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+ slashstrip(__FILE__), __LINE__, __func__);
+ sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+ if (likely(what < 3 && which < dd->num_sdma)) {
+ sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+ } else {
+ /* should not happen */
+ dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+ }
+}
+
+/*
+ * RX block receive available interrupt. Source is < 160.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ struct hfi1_ctxtdata *rcd;
+ char *err_detail;
+
+ if (likely(source < dd->num_rcv_contexts)) {
+ rcd = dd->rcd[source];
+ if (rcd) {
+ if (source < dd->first_user_ctxt)
+ rcd->do_interrupt(rcd);
+ else
+ handle_user_interrupt(rcd);
+ return; /* OK */
+ }
+ /* received an interrupt, but no rcd */
+ err_detail = "dataless";
+ } else {
+ /* received an interrupt, but are not using that context */
+ err_detail = "out of range";
+ }
+ dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+ err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt. Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ struct hfi1_ctxtdata *rcd;
+ char *err_detail;
+
+ if (likely(source < dd->num_rcv_contexts)) {
+ rcd = dd->rcd[source];
+ if (rcd) {
+ /* only pay attention to user urgent interrupts */
+ if (source >= dd->first_user_ctxt)
+ handle_user_interrupt(rcd);
+ return; /* OK */
+ }
+ /* received an interrupt, but no rcd */
+ err_detail = "dataless";
+ } else {
+ /* received an interrupt, but are not using that context */
+ err_detail = "out of range";
+ }
+ dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+ err_detail, source);
+}
+
+/*
+ * Reserved range interrupt. Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ char name[64];
+
+ dd_dev_err(dd, "unexpected %s interrupt\n",
+ is_reserved_name(name, sizeof(name), source));
+}
+
+static const struct is_table is_table[] = {
+/* start end
+ name func interrupt func */
+{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END,
+ is_misc_err_name, is_misc_err_int },
+{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END,
+ is_sdma_eng_err_name, is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+ is_sendctxt_err_name, is_sendctxt_err_int },
+{ IS_SDMA_START, IS_SDMA_END,
+ is_sdma_eng_name, is_sdma_eng_int },
+{ IS_VARIOUS_START, IS_VARIOUS_END,
+ is_various_name, is_various_int },
+{ IS_DC_START, IS_DC_END,
+ is_dc_name, is_dc_int },
+{ IS_RCVAVAIL_START, IS_RCVAVAIL_END,
+ is_rcv_avail_name, is_rcv_avail_int },
+{ IS_RCVURGENT_START, IS_RCVURGENT_END,
+ is_rcv_urgent_name, is_rcv_urgent_int },
+{ IS_SENDCREDIT_START, IS_SENDCREDIT_END,
+ is_send_credit_name, is_send_credit_int},
+{ IS_RESERVED_START, IS_RESERVED_END,
+ is_reserved_name, is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct is_table *entry;
+
+ /* avoids a double compare by walking the table in-order */
+ for (entry = &is_table[0]; entry->is_name; entry++) {
+ if (source < entry->end) {
+ trace_hfi1_interrupt(dd, entry, source);
+ entry->is_int(dd, source - entry->start);
+ return;
+ }
+ }
+ /* fell off the end */
+ dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler. This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+ struct hfi1_devdata *dd = data;
+ u64 regs[CCE_NUM_INT_CSRS];
+ u32 bit;
+ int i;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* phase 1: scan and clear all handled interrupts */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+ if (dd->gi_mask[i] == 0) {
+ regs[i] = 0; /* used later */
+ continue;
+ }
+ regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+ dd->gi_mask[i];
+ /* only clear if anything is set */
+ if (regs[i])
+ write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+ }
+
+ /* phase 2: call the appropriate handler */
+ for_each_set_bit(bit, (unsigned long *)&regs[0],
+ CCE_NUM_INT_CSRS*64) {
+ is_interrupt(dd, bit);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+ struct sdma_engine *sde = data;
+ struct hfi1_devdata *dd = sde->dd;
+ u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ sdma_dumpstate(sde);
+#endif
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* This read_csr is really bad in the hot path */
+ status = read_csr(dd,
+ CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
+ & sde->imask;
+ if (likely(status)) {
+ /* clear the interrupt(s) */
+ write_csr(dd,
+ CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
+ status);
+
+ /* handle the interrupt(s) */
+ sdma_engine_interrupt(sde, status);
+ } else
+ dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+ sde->this_idx);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * NOTE: this routine expects to be on its own MSI-X interrupt. If
+ * multiple receive contexts share the same MSI-X interrupt, then this
+ * routine must check for who received it.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+ struct hfi1_ctxtdata *rcd = data;
+ struct hfi1_devdata *dd = rcd->dd;
+
+ trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+ this_cpu_inc(*dd->int_counter);
+
+ /* clear the interrupt */
+ write_csr(rcd->dd, CCE_INT_CLEAR + (8*rcd->ireg), rcd->imask);
+
+ /* handle the interrupt */
+ rcd->do_interrupt(rcd);
+
+ return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+ return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+ & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+static u32 read_logical_state(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+ return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+ & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+ /* clear current state, set new state */
+ reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+ reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+ write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+ u32 regno;
+ int ret;
+
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ if (acquire_lcb_access(dd, 0) == 0) {
+ *data = read_csr(dd, addr);
+ release_lcb_access(dd, 0);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
+ /* register is an index of LCB registers: (offset - base) / 8 */
+ regno = (addr - DC_LCB_CFG_RUN) >> 3;
+ ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+ if (ret != HCMD_SUCCESS)
+ return -EBUSY;
+ return 0;
+}
+
+/*
+ * Read an LCB CSR. Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ /* if up, go through the 8051 for the value */
+ if (ppd->host_link_state & HLS_UP)
+ return read_lcb_via_8051(dd, addr, data);
+ /* if going up or down, no access */
+ if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+ return -EBUSY;
+ /* otherwise, host has access */
+ *data = read_csr(dd, addr);
+ return 0;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+
+ if (acquire_lcb_access(dd, 0) == 0) {
+ write_csr(dd, addr, data);
+ release_lcb_access(dd, 0);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+/*
+ * Write an LCB CSR. Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ /* if up, go through the 8051 for the value */
+ if (ppd->host_link_state & HLS_UP)
+ return write_lcb_via_8051(dd, addr, data);
+ /* if going up or down, no access */
+ if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+ return -EBUSY;
+ /* otherwise, host has access */
+ write_csr(dd, addr, data);
+ return 0;
+}
+
+/*
+ * Returns:
+ * < 0 = Linux error, not able to get access
+ * > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+ struct hfi1_devdata *dd,
+ u32 type,
+ u64 in_data,
+ u64 *out_data)
+{
+ u64 reg, completed;
+ int return_code;
+ unsigned long flags;
+ unsigned long timeout;
+
+ hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+ /*
+ * Alternative to holding the lock for a long time:
+ * - keep busy wait - have other users bounce off
+ */
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+ /* We can't send any commands to the 8051 if it's in reset */
+ if (dd->dc_shutdown) {
+ return_code = -ENODEV;
+ goto fail;
+ }
+
+ /*
+ * If an 8051 host command timed out previously, then the 8051 is
+ * stuck.
+ *
+ * On first timeout, attempt to reset and restart the entire DC
+ * block (including 8051). (Is this too big of a hammer?)
+ *
+ * If the 8051 times out a second time, the reset did not bring it
+ * back to healthy life. In that case, fail any subsequent commands.
+ */
+ if (dd->dc8051_timed_out) {
+ if (dd->dc8051_timed_out > 1) {
+ dd_dev_err(dd,
+ "Previous 8051 host command timed out, skipping command %u\n",
+ type);
+ return_code = -ENXIO;
+ goto fail;
+ }
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ dc_shutdown(dd);
+ dc_start(dd);
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ }
+
+ /*
+ * If there is no timeout, then the 8051 command interface is
+ * waiting for a command.
+ */
+
+ /*
+ * Do two writes: the first to stabilize the type and req_data, the
+ * second to activate.
+ */
+ reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+ << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+ | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+ << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+ reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+ /* wait for completion, alternate: interrupt */
+ timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+ completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+ if (completed)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd->dc8051_timed_out++;
+ dd_dev_err(dd, "8051 host command %u timeout\n", type);
+ if (out_data)
+ *out_data = 0;
+ return_code = -ETIMEDOUT;
+ goto fail;
+ }
+ udelay(2);
+ }
+
+ if (out_data) {
+ *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+ & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+ if (type == HCMD_READ_LCB_CSR) {
+ /* top 16 bits are in a different register */
+ *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+ << (48
+ - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+ }
+ }
+ return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+ & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+ dd->dc8051_timed_out = 0;
+ /*
+ * Clear command for next user.
+ */
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+ return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+ return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+ u8 lane_id, u32 config_data)
+{
+ u64 data;
+ int ret;
+
+ data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+ | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+ | (u64)config_data << LOAD_DATA_DATA_SHIFT;
+ ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "load 8051 config: field id %d, lane %d, err %d\n",
+ (int)field_id, (int)lane_id, ret);
+ }
+ return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers". Use the RAM directly. Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
+ */
+static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+ u32 *result)
+{
+ u64 big_data;
+ u32 addr;
+ int ret;
+
+ /* address start depends on the lane_id */
+ if (lane_id < 4)
+ addr = (4 * NUM_GENERAL_FIELDS)
+ + (lane_id * 4 * NUM_LANE_FIELDS);
+ else
+ addr = 0;
+ addr += field_id * 4;
+
+ /* read is in 8-byte chunks, hardware will truncate the address down */
+ ret = read_8051_data(dd, addr, 8, &big_data);
+
+ if (ret == 0) {
+ /* extract the 4 bytes we want */
+ if (addr & 0x4)
+ *result = (u32)(big_data >> 32);
+ else
+ *result = (u32)big_data;
+ } else {
+ *result = 0;
+ dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+ __func__, lane_id, field_id);
+ }
+
+ return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+ u8 continuous)
+{
+ u32 frame;
+
+ frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+ | power_management << POWER_MANAGEMENT_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+ GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+ u16 vl15buf, u8 crc_sizes)
+{
+ u32 frame;
+
+ frame = (u32)vau << VAU_SHIFT
+ | (u32)z << Z_SHIFT
+ | (u32)vcu << VCU_SHIFT
+ | (u32)vl15buf << VL15BUF_SHIFT
+ | (u32)crc_sizes << CRC_SIZES_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+ GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+ u8 *flag_bits, u16 *link_widths)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+ &frame);
+ *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+ *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+ *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+ u8 misc_bits,
+ u8 flag_bits,
+ u16 link_widths)
+{
+ u32 frame;
+
+ frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+ | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+ | (u32)link_widths << LINK_WIDTH_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+ frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+ u8 device_rev)
+{
+ u32 frame;
+
+ frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+ | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+ return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+ u8 *device_rev)
+{
+ u32 frame;
+
+ read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+ *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+ *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+ & REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+ u32 frame;
+
+ read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+ *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+ *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+ u8 *continuous)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+ *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+ & POWER_MANAGEMENT_MASK;
+ *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+ & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+ u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+ *vau = (frame >> VAU_SHIFT) & VAU_MASK;
+ *z = (frame >> Z_SHIFT) & Z_MASK;
+ *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+ *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+ *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+ u8 *remote_tx_rate,
+ u16 *link_widths)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+ &frame);
+ *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+ & REMOTE_TX_RATE_MASK;
+ *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+ u32 frame;
+
+ read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+ *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+ u32 frame;
+
+ read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+ *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+ read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+ read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+ u32 frame;
+ int ret;
+
+ *link_quality = 0;
+ if (dd->pport->host_link_state & HLS_UP) {
+ ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+ &frame);
+ if (ret == 0)
+ *link_quality = (frame >> LINK_QUALITY_SHIFT)
+ & LINK_QUALITY_MASK;
+ }
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+ u32 frame;
+
+ read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+ *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+ u8 *enable_lane_tx,
+ u8 *tx_polarity_inversion,
+ u8 *rx_polarity_inversion,
+ u8 *max_rate)
+{
+ u32 frame;
+ int ret;
+
+ ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+ *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+ & ENABLE_LANE_TX_MASK;
+ *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+ & TX_POLARITY_INVERSION_MASK;
+ *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+ & RX_POLARITY_INVERSION_MASK;
+ *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+ return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+ u8 enable_lane_tx,
+ u8 tx_polarity_inversion,
+ u8 rx_polarity_inversion,
+ u8 max_rate)
+{
+ u32 frame;
+
+ /* no need to mask, all variable sizes match field widths */
+ frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+ | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+ | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+ | max_rate << MAX_RATE_SHIFT;
+ return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+ u32 frame, version, prod_id;
+ int ret, lane;
+
+ /* 4 lanes */
+ for (lane = 0; lane < 4; lane++) {
+ ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
+ if (ret) {
+ dd_dev_err(
+ dd,
+ "Unable to read lane %d firmware details\n",
+ lane);
+ continue;
+ }
+ version = (frame >> SPICO_ROM_VERSION_SHIFT)
+ & SPICO_ROM_VERSION_MASK;
+ prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
+ & SPICO_ROM_PROD_ID_MASK;
+ dd_dev_info(dd,
+ "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+ lane, version, prod_id);
+ }
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
+ type, data_out);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "read idle message: type %d, err %d\n",
+ (u32)type, ret);
+ return -EINVAL;
+ }
+ dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+ /* return only the payload as we already know the type */
+ *data_out >>= IDLE_PAYLOAD_SHIFT;
+ return 0;
+}
+
+/*
+ * Read an idle SMA message. To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+ return read_idle_message(dd,
+ (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+ int ret;
+
+ dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+ ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+ data, ret);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+ u64 data;
+
+ data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
+ | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+ return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up. This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ unsigned long timeout;
+ int ret;
+
+ lcb_shutdown(dd, 0);
+
+ if (loopback) {
+ /* LCB_CFG_LOOPBACK.VAL = 2 */
+ /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+ write_csr(dd, DC_LCB_CFG_LOOPBACK,
+ IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+ write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+ }
+
+ /* start the LCBs */
+ /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+ /* simulator only loopback steps */
+ if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ /* LCB_CFG_RUN.EN = 1 */
+ write_csr(dd, DC_LCB_CFG_RUN,
+ 1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+ /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+ timeout = jiffies + msecs_to_jiffies(10);
+ while (1) {
+ reg = read_csr(dd,
+ DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+ if (reg)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+ return -ETIMEDOUT;
+ }
+ udelay(2);
+ }
+
+ write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+ 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+ }
+
+ if (!loopback) {
+ /*
+ * When doing quick linkup and not in loopback, both
+ * sides must be done with LCB set-up before either
+ * starts the quick linkup. Put a delay here so that
+ * both sides can be started and have a chance to be
+ * done with LCB set up before resuming.
+ */
+ dd_dev_err(dd,
+ "Pausing for peer to be finished with LCB set up\n");
+ msleep(5000);
+ dd_dev_err(dd,
+ "Continuing with quick linkup\n");
+ }
+
+ write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+ set_8051_lcb_access(dd);
+
+ /*
+ * State "quick" LinkUp request sets the physical link state to
+ * LinkUp without a verify capability sequence.
+ * This state is in simulator v37 and later.
+ */
+ ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "%s: set physical link state to quick LinkUp failed with return %d\n",
+ __func__, ret);
+
+ set_host_lcb_access(dd);
+ write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+ if (ret >= 0)
+ ret = -EINVAL;
+ return ret;
+ }
+
+ return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+ if (ret == HCMD_SUCCESS)
+ return 0;
+ dd_dev_err(dd,
+ "Set physical link state to SerDes Loopback failed with return %d\n",
+ ret);
+ if (ret >= 0)
+ ret = -EINVAL;
+ return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+ dd_dev_info(dd, "Entering loopback mode\n");
+
+ /* all loopbacks should disable self GUID check */
+ write_csr(dd, DC_DC8051_CFG_MODE,
+ (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+ /*
+ * The simulator has only one loopback option - LCB. Switch
+ * to that option, which includes quick link up.
+ *
+ * Accept all valid loopback values.
+ */
+ if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ && (loopback == LOOPBACK_SERDES
+ || loopback == LOOPBACK_LCB
+ || loopback == LOOPBACK_CABLE)) {
+ loopback = LOOPBACK_LCB;
+ quick_linkup = 1;
+ return 0;
+ }
+
+ /* handle serdes loopback */
+ if (loopback == LOOPBACK_SERDES) {
+ /* internal serdes loopack needs quick linkup on RTL */
+ if (dd->icode == ICODE_RTL_SILICON)
+ quick_linkup = 1;
+ return set_serdes_loopback_mode(dd);
+ }
+
+ /* LCB loopback - handled at poll time */
+ if (loopback == LOOPBACK_LCB) {
+ quick_linkup = 1; /* LCB is always quick linkup */
+
+ /* not supported in emulation due to emulation RTL changes */
+ if (dd->icode == ICODE_FPGA_EMULATION) {
+ dd_dev_err(dd,
+ "LCB loopback not supported in emulation\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ /* external cable loopback requires no extra steps */
+ if (loopback == LOOPBACK_CABLE)
+ return 0;
+
+ dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+ return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+ int i;
+ u16 result = 0;
+
+ static const struct link_bits {
+ u16 from;
+ u16 to;
+ } opa_link_xlate[] = {
+ { OPA_LINK_WIDTH_1X, 1 << (1-1) },
+ { OPA_LINK_WIDTH_2X, 1 << (2-1) },
+ { OPA_LINK_WIDTH_3X, 1 << (3-1) },
+ { OPA_LINK_WIDTH_4X, 1 << (4-1) },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+ if (opa_widths & opa_link_xlate[i].from)
+ result |= opa_link_xlate[i].to;
+ }
+ return result;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u8 enable_lane_tx;
+ u8 tx_polarity_inversion;
+ u8 rx_polarity_inversion;
+ int ret;
+
+ /* reset our fabric serdes to clear any lingering problems */
+ fabric_serdes_reset(dd);
+
+ /* set the local tx rate - need to read-modify-write */
+ ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+ &rx_polarity_inversion, &ppd->local_tx_rate);
+ if (ret)
+ goto set_local_link_attributes_fail;
+
+ if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+ /* set the tx rate to the fastest enabled */
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+ ppd->local_tx_rate = 1;
+ else
+ ppd->local_tx_rate = 0;
+ } else {
+ /* set the tx rate to all enabled */
+ ppd->local_tx_rate = 0;
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+ ppd->local_tx_rate |= 2;
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+ ppd->local_tx_rate |= 1;
+ }
+ ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+ rx_polarity_inversion, ppd->local_tx_rate);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /*
+ * DC supports continuous updates.
+ */
+ ret = write_vc_local_phy(dd, 0 /* no power management */,
+ 1 /* continuous updates */);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /* z=1 in the next call: AU of 0 is not supported by the hardware */
+ ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+ ppd->port_crc_mode_enabled);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ ret = write_vc_local_link_width(dd, 0, 0,
+ opa_to_vc_link_widths(ppd->link_width_enabled));
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /* let peer know who we are */
+ ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+ if (ret == HCMD_SUCCESS)
+ return 0;
+
+set_local_link_attributes_fail:
+ dd_dev_err(dd,
+ "Failed to set local link attributes, return 0x%x\n",
+ ret);
+ return ret;
+}
+
+/*
+ * Call this to start the link. Schedule a retry if the cable is not
+ * present or if unable to start polling. Do not do anything if the
+ * link is disabled. Returns 0 if link is disabled or moved to polling
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+ if (!ppd->link_enabled) {
+ dd_dev_info(ppd->dd,
+ "%s: stopping link start because link is disabled\n",
+ __func__);
+ return 0;
+ }
+ if (!ppd->driver_link_ready) {
+ dd_dev_info(ppd->dd,
+ "%s: stopping link start because driver is not ready\n",
+ __func__);
+ return 0;
+ }
+
+ if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
+ loopback == LOOPBACK_LCB ||
+ ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ return set_link_state(ppd, HLS_DN_POLL);
+
+ dd_dev_info(ppd->dd,
+ "%s: stopping link start because no cable is present\n",
+ __func__);
+ return -EAGAIN;
+}
+
+static void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 mask, qsfp_mask;
+
+ mask = (u64)QSFP_HFI0_RESET_N;
+ qsfp_mask = read_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
+ qsfp_mask |= mask;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
+ qsfp_mask);
+
+ qsfp_mask = read_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+ qsfp_mask &= ~mask;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+ qsfp_mask);
+
+ udelay(10);
+
+ qsfp_mask |= mask;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+ qsfp_mask);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+ u8 *qsfp_interrupt_status)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+ (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+ dd_dev_info(dd,
+ "%s: QSFP cable on fire\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+ (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+ dd_dev_info(dd,
+ "%s: QSFP cable temperature too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+ (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+ dd_dev_info(dd,
+ "%s: QSFP supply voltage too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+ (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+ dd_dev_info(dd,
+ "%s: QSFP supply voltage too low\n",
+ __func__);
+
+ /* Byte 2 is vendor specific */
+
+ if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable RX channel 1/2 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable RX channel 1/2 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable RX channel 3/4 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable RX channel 3/4 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+ (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 1/2 bias too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+ (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 1/2 bias too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+ (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 3/4 bias too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+ (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 3/4 bias too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 1/2 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 1/2 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 3/4 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd,
+ "%s: Cable TX channel 3/4 power too low\n",
+ __func__);
+
+ /* Bytes 9-10 and 11-12 are reserved */
+ /* Bytes 13-15 are vendor specific */
+
+ return 0;
+}
+
+static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
+{
+ refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+ return 0;
+}
+
+static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u8 qsfp_interrupt_status = 0;
+
+ if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
+ != 1) {
+ dd_dev_info(dd,
+ "%s: Failed to read status of QSFP module\n",
+ __func__);
+ return -EIO;
+ }
+
+ /* We don't care about alarms & warnings with a non-functional INT_N */
+ if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
+ do_pre_lni_host_behaviors(ppd);
+
+ return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module is present */
+static void qsfp_event(struct work_struct *work)
+{
+ struct qsfp_data *qd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+
+ qd = container_of(work, struct qsfp_data, qsfp_work);
+ ppd = qd->ppd;
+ dd = ppd->dd;
+
+ /* Sanity check */
+ if (!qsfp_mod_present(ppd))
+ return;
+
+ /*
+ * Turn DC back on after cables has been
+ * re-inserted. Up until now, the DC has been in
+ * reset to save power.
+ */
+ dc_start(dd);
+
+ if (qd->cache_refresh_required) {
+ msleep(3000);
+ reset_qsfp(ppd);
+
+ /* Check for QSFP interrupt after t_init (SFF 8679)
+ * + extra
+ */
+ msleep(3000);
+ if (!qd->qsfp_interrupt_functional) {
+ if (do_qsfp_intr_fallback(ppd) < 0)
+ dd_dev_info(dd, "%s: QSFP fallback failed\n",
+ __func__);
+ ppd->driver_link_ready = 1;
+ start_link(ppd);
+ }
+ }
+
+ if (qd->check_interrupt_flags) {
+ u8 qsfp_interrupt_status[16] = {0,};
+
+ if (qsfp_read(ppd, dd->hfi1_id, 6,
+ &qsfp_interrupt_status[0], 16) != 16) {
+ dd_dev_info(dd,
+ "%s: Failed to read status of QSFP module\n",
+ __func__);
+ } else {
+ unsigned long flags;
+ u8 data_status;
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.check_interrupt_flags = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+
+ if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
+ != 1) {
+ dd_dev_info(dd,
+ "%s: Failed to read status of QSFP module\n",
+ __func__);
+ }
+ if (!(data_status & QSFP_DATA_NOT_READY)) {
+ do_pre_lni_host_behaviors(ppd);
+ start_link(ppd);
+ } else
+ handle_qsfp_error_conditions(ppd,
+ qsfp_interrupt_status);
+ }
+ }
+}
+
+void init_qsfp(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 qsfp_mask;
+
+ if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+ ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+ !HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+ ppd->driver_link_ready = 1;
+ return;
+ }
+
+ ppd->qsfp_info.ppd = ppd;
+ INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+ qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+ /* Clear current status to avoid spurious interrupts */
+ write_csr(dd,
+ dd->hfi1_id ?
+ ASIC_QSFP2_CLEAR :
+ ASIC_QSFP1_CLEAR,
+ qsfp_mask);
+
+ /* Handle active low nature of INT_N and MODPRST_N pins */
+ if (qsfp_mod_present(ppd))
+ qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+ qsfp_mask);
+
+ /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
+ qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+ qsfp_mask);
+
+ if (qsfp_mod_present(ppd)) {
+ msleep(3000);
+ reset_qsfp(ppd);
+
+ /* Check for QSFP interrupt after t_init (SFF 8679)
+ * + extra
+ */
+ msleep(3000);
+ if (!ppd->qsfp_info.qsfp_interrupt_functional) {
+ if (do_qsfp_intr_fallback(ppd) < 0)
+ dd_dev_info(dd,
+ "%s: QSFP fallback failed\n",
+ __func__);
+ ppd->driver_link_ready = 1;
+ }
+ }
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 guid;
+ int ret;
+
+ if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+ add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+ guid = ppd->guid;
+ if (!guid) {
+ if (dd->base_guid)
+ guid = dd->base_guid + ppd->port - 1;
+ ppd->guid = guid;
+ }
+
+ /* the link defaults to enabled */
+ ppd->link_enabled = 1;
+ /* Set linkinit_reason on power up per OPA spec */
+ ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+ if (loopback) {
+ ret = init_loopback(dd);
+ if (ret < 0)
+ return ret;
+ }
+
+ return start_link(ppd);
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /*
+ * Shut down the link and keep it down. First turn off that the
+ * driver wants to allow the link to be up (driver_link_ready).
+ * Then make sure the link is not automatically restarted
+ * (link_enabled). Cancel any pending restart. And finally
+ * go offline.
+ */
+ ppd->driver_link_ready = 0;
+ ppd->link_enabled = 0;
+
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+ OPA_LINKDOWN_REASON_SMA_DISABLED);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+
+ /* disable the port */
+ clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->ibport_data.rc_acks = NULL;
+ ppd->ibport_data.rc_qacks = NULL;
+ ppd->ibport_data.rc_acks = alloc_percpu(u64);
+ ppd->ibport_data.rc_qacks = alloc_percpu(u64);
+ ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
+ if ((ppd->ibport_data.rc_acks == NULL) ||
+ (ppd->ibport_data.rc_delayed_comp == NULL) ||
+ (ppd->ibport_data.rc_qacks == NULL))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static const char * const pt_names[] = {
+ "expected",
+ "eager",
+ "invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+ return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+ u32 type, unsigned long pa, u16 order)
+{
+ u64 reg;
+ void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+ (dd->kregbase + RCV_ARRAY));
+
+ if (!(dd->flags & HFI1_PRESENT))
+ goto done;
+
+ if (type == PT_INVALID) {
+ pa = 0;
+ } else if (type > PT_INVALID) {
+ dd_dev_err(dd,
+ "unexpected receive array type %u for index %u, not handled\n",
+ type, index);
+ goto done;
+ }
+
+ hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+ pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */
+ reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+ | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+ | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+ << RCV_ARRAY_RT_ADDR_SHIFT;
+ writeq(reg, base + (index * 8));
+
+ if (type == PT_EAGER)
+ /*
+ * Eager entries are written one-by-one so we have to push them
+ * after we write the entry.
+ */
+ flush_wc();
+done:
+ return;
+}
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 i;
+
+ /* this could be optimized */
+ for (i = rcd->eager_base; i < rcd->eager_base +
+ rcd->egrbufs.alloced; i++)
+ hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+ for (i = rcd->expected_base;
+ i < rcd->expected_base + rcd->expected_count; i++)
+ hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+ struct hfi1_ctxt_info *kinfo)
+{
+ kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
+ HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
+ return 0;
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+ struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+ u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+ return (struct hfi1_message_header *)
+ (rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+ "HFI1_IB_CFG_LIDLMC",
+ "HFI1_IB_CFG_LWID_DG_ENB",
+ "HFI1_IB_CFG_LWID_ENB",
+ "HFI1_IB_CFG_LWID",
+ "HFI1_IB_CFG_SPD_ENB",
+ "HFI1_IB_CFG_SPD",
+ "HFI1_IB_CFG_RXPOL_ENB",
+ "HFI1_IB_CFG_LREV_ENB",
+ "HFI1_IB_CFG_LINKLATENCY",
+ "HFI1_IB_CFG_HRTBT",
+ "HFI1_IB_CFG_OP_VLS",
+ "HFI1_IB_CFG_VL_HIGH_CAP",
+ "HFI1_IB_CFG_VL_LOW_CAP",
+ "HFI1_IB_CFG_OVERRUN_THRESH",
+ "HFI1_IB_CFG_PHYERR_THRESH",
+ "HFI1_IB_CFG_LINKDEFAULT",
+ "HFI1_IB_CFG_PKEYS",
+ "HFI1_IB_CFG_MTU",
+ "HFI1_IB_CFG_LSTATE",
+ "HFI1_IB_CFG_VL_HIGH_LIMIT",
+ "HFI1_IB_CFG_PMA_TICKS",
+ "HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+ if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+ return "invalid";
+ return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int val = 0;
+
+ switch (which) {
+ case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+ val = ppd->link_width_enabled;
+ break;
+ case HFI1_IB_CFG_LWID: /* currently active Link-width */
+ val = ppd->link_width_active;
+ break;
+ case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+ val = ppd->link_speed_enabled;
+ break;
+ case HFI1_IB_CFG_SPD: /* current Link speed */
+ val = ppd->link_speed_active;
+ break;
+
+ case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+ case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+ case HFI1_IB_CFG_LINKLATENCY:
+ goto unimplemented;
+
+ case HFI1_IB_CFG_OP_VLS:
+ val = ppd->vls_operational;
+ break;
+ case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+ val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+ break;
+ case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+ val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+ break;
+ case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ val = ppd->overrun_threshold;
+ break;
+ case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ val = ppd->phy_error_threshold;
+ break;
+ case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ val = dd->link_default;
+ break;
+
+ case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+ case HFI1_IB_CFG_PMA_TICKS:
+ default:
+unimplemented:
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(
+ dd,
+ "%s: which %s: not implemented\n",
+ __func__,
+ ib_cfg_name(which));
+ break;
+ }
+
+ return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+ /*
+ * The maximum non-payload (MTU) bytes in LRH.PktLen are
+ * the Receive Header Entry Size minus the PBC (or RHF) size
+ * plus one DW for the ICRC appended by HW.
+ *
+ * dd->rcd[0].rcvhdrqentsize is in DW.
+ * We use rcd[0] as all context will have the same value. Also,
+ * the first kernel context would have been allocated by now so
+ * we are guaranteed a valid value.
+ */
+ return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent. The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu;
+ u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+ SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+ int i;
+
+ for (i = 0; i < ppd->vls_supported; i++) {
+ if (dd->vld[i].mtu > maxvlmtu)
+ maxvlmtu = dd->vld[i].mtu;
+ if (i <= 3)
+ len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+ ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+ else
+ len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+ ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+ }
+ write_csr(dd, SEND_LEN_CHECK0, len1);
+ write_csr(dd, SEND_LEN_CHECK1, len2);
+ /* adjust kernel credit return thresholds based on new MTUs */
+ /* all kernel receive contexts have the same hdrqentsize */
+ for (i = 0; i < ppd->vls_supported; i++) {
+ sc_set_cr_threshold(dd->vld[i].sc,
+ sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ }
+ sc_set_cr_threshold(dd->vld[15].sc,
+ sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+
+ /* Adjust maximum MTU for the port in DC */
+ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+ (ilog2(maxvlmtu >> 8) + 1);
+ len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+ len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+ len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+ DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+ write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+ int i;
+ u64 sreg = 0;
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 mask = ~((1U << ppd->lmc) - 1);
+ u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+ if (dd->hfi1_snoop.mode_flag)
+ dd_dev_info(dd, "Set lid/lmc while snooping");
+
+ c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+ | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+ c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+ << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)|
+ ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+ << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+ write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+ /*
+ * Iterate over all the send contexts and set their SLID check
+ */
+ sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+ SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+ (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+ SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+ for (i = 0; i < dd->chip_send_contexts; i++) {
+ hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+ i, (u32)sreg);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+ }
+
+ /* Now we have to do the same thing for the sdma engines */
+ sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+ unsigned long timeout;
+ u32 curr_state;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ curr_state = read_physical_state(dd);
+ if (curr_state == state)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+ state, curr_state);
+ return -ETIMEDOUT;
+ }
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+
+ return 0;
+}
+
+/*
+ * Helper for set_link_state(). Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 pstate, previous_state;
+ u32 last_local_state;
+ u32 last_remote_state;
+ int ret;
+ int do_transition;
+ int do_wait;
+
+ previous_state = ppd->host_link_state;
+ ppd->host_link_state = HLS_GOING_OFFLINE;
+ pstate = read_physical_state(dd);
+ if (pstate == PLS_OFFLINE) {
+ do_transition = 0; /* in right state */
+ do_wait = 0; /* ...no need to wait */
+ } else if ((pstate & 0xff) == PLS_OFFLINE) {
+ do_transition = 0; /* in an offline transient state */
+ do_wait = 1; /* ...wait for it to settle */
+ } else {
+ do_transition = 1; /* need to move to offline */
+ do_wait = 1; /* ...will need to wait */
+ }
+
+ if (do_transition) {
+ ret = set_physical_link_state(dd,
+ PLS_OFFLINE | (rem_reason << 8));
+
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Offline link state, return %d\n",
+ ret);
+ return -EINVAL;
+ }
+ if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
+ ppd->offline_disabled_reason =
+ OPA_LINKDOWN_REASON_TRANSIENT;
+ }
+
+ if (do_wait) {
+ /* it can take a while for the link to go down */
+ ret = wait_phy_linkstate(dd, PLS_OFFLINE, 5000);
+ if (ret < 0)
+ return ret;
+ }
+
+ /* make sure the logical state is also down */
+ wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+ /*
+ * Now in charge of LCB - must be after the physical state is
+ * offline.quiet and before host_link_state is changed.
+ */
+ set_host_lcb_access(dd);
+ write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+ ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+ /*
+ * The LNI has a mandatory wait time after the physical state
+ * moves to Offline.Quiet. The wait time may be different
+ * depending on how the link went down. The 8051 firmware
+ * will observe the needed wait time and only move to ready
+ * when that is completed. The largest of the quiet timeouts
+ * is 2.5s, so wait that long and then a bit more.
+ */
+ ret = wait_fm_ready(dd, 3000);
+ if (ret) {
+ dd_dev_err(dd,
+ "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+ /* state is really offline, so make it so */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ return ret;
+ }
+
+ /*
+ * The state is now offline and the 8051 is ready to accept host
+ * requests.
+ * - change our state
+ * - notify others if we were previously in a linkup state
+ */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ if (previous_state & HLS_UP) {
+ /* went down while link was up */
+ handle_linkup_change(dd, 0);
+ } else if (previous_state
+ & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+ /* went down while attempting link up */
+ /* byte 1 of last_*_state is the failure reason */
+ read_last_local_state(dd, &last_local_state);
+ read_last_remote_state(dd, &last_remote_state);
+ dd_dev_err(dd,
+ "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+ last_local_state, last_remote_state);
+ }
+
+ /* the active link width (downgrade) is 0 on link down */
+ ppd->link_width_active = 0;
+ ppd->link_width_downgrade_tx_active = 0;
+ ppd->link_width_downgrade_rx_active = 0;
+ ppd->current_egress_rate = 0;
+ return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+ const char *name;
+ int n = ilog2(state);
+ static const char * const names[] = {
+ [__HLS_UP_INIT_BP] = "INIT",
+ [__HLS_UP_ARMED_BP] = "ARMED",
+ [__HLS_UP_ACTIVE_BP] = "ACTIVE",
+ [__HLS_DN_DOWNDEF_BP] = "DOWNDEF",
+ [__HLS_DN_POLL_BP] = "POLL",
+ [__HLS_DN_DISABLE_BP] = "DISABLE",
+ [__HLS_DN_OFFLINE_BP] = "OFFLINE",
+ [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP",
+ [__HLS_GOING_UP_BP] = "GOING_UP",
+ [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+ [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+ };
+
+ name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+ return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+ if (state == HLS_UP_INIT) {
+ switch (ppd->linkinit_reason) {
+ case OPA_LINKINIT_REASON_LINKUP:
+ return "(LINKUP)";
+ case OPA_LINKINIT_REASON_FLAPPING:
+ return "(FLAPPING)";
+ case OPA_LINKINIT_OUTSIDE_POLICY:
+ return "(OUTSIDE_POLICY)";
+ case OPA_LINKINIT_QUARANTINED:
+ return "(QUARANTINED)";
+ case OPA_LINKINIT_INSUFIC_CAPABILITY:
+ return "(INSUFIC_CAPABILITY)";
+ default:
+ break;
+ }
+ }
+ return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+ switch (ppd->host_link_state) {
+ case HLS_UP_INIT:
+ case HLS_UP_ARMED:
+ case HLS_UP_ACTIVE:
+ return IB_PORTPHYSSTATE_LINKUP;
+ case HLS_DN_POLL:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_DN_DISABLE:
+ return IB_PORTPHYSSTATE_DISABLED;
+ case HLS_DN_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_VERIFY_CAP:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_GOING_UP:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_GOING_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_LINK_COOLDOWN:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_DN_DOWNDEF:
+ default:
+ dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+ ppd->host_link_state);
+ return -1;
+ }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+ if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+ return IB_PORT_DOWN;
+
+ switch (ppd->host_link_state & HLS_UP) {
+ case HLS_UP_INIT:
+ return IB_PORT_INIT;
+ case HLS_UP_ARMED:
+ return IB_PORT_ARMED;
+ case HLS_UP_ACTIVE:
+ return IB_PORT_ACTIVE;
+ default:
+ dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+ ppd->host_link_state);
+ return -1;
+ }
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+ u8 neigh_reason, u8 rem_reason)
+{
+ if (ppd->local_link_down_reason.latest == 0 &&
+ ppd->neigh_link_down_reason.latest == 0) {
+ ppd->local_link_down_reason.latest = lcl_reason;
+ ppd->neigh_link_down_reason.latest = neigh_reason;
+ ppd->remote_link_down_reason = rem_reason;
+ }
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt. It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct ib_event event = {.device = NULL};
+ int ret1, ret = 0;
+ int was_up, is_down;
+ int orig_new_state, poll_bounce;
+
+ mutex_lock(&ppd->hls_lock);
+
+ orig_new_state = state;
+ if (state == HLS_DN_DOWNDEF)
+ state = dd->link_default;
+
+ /* interpret poll -> poll as a link bounce */
+ poll_bounce = ppd->host_link_state == HLS_DN_POLL
+ && state == HLS_DN_POLL;
+
+ dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+ link_state_name(ppd->host_link_state),
+ link_state_name(orig_new_state),
+ poll_bounce ? "(bounce) " : "",
+ link_state_reason_name(ppd, state));
+
+ was_up = !!(ppd->host_link_state & HLS_UP);
+
+ /*
+ * If we're going to a (HLS_*) link state that implies the logical
+ * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+ * reset is_sm_config_started to 0.
+ */
+ if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+ ppd->is_sm_config_started = 0;
+
+ /*
+ * Do nothing if the states match. Let a poll to poll link bounce
+ * go through.
+ */
+ if (ppd->host_link_state == state && !poll_bounce)
+ goto done;
+
+ switch (state) {
+ case HLS_UP_INIT:
+ if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
+ || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+ /*
+ * Quick link up jumps from polling to here.
+ *
+ * Whether in normal or loopback mode, the
+ * simulator jumps from polling to link up.
+ * Accept that here.
+ */
+ /* OK */;
+ } else if (ppd->host_link_state != HLS_GOING_UP) {
+ goto unexpected;
+ }
+
+ ppd->host_link_state = HLS_UP_INIT;
+ ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at going_up */
+ ppd->host_link_state = HLS_GOING_UP;
+ dd_dev_err(dd,
+ "%s: logical state did not change to INIT\n",
+ __func__);
+ } else {
+ /* clear old transient LINKINIT_REASON code */
+ if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+ ppd->linkinit_reason =
+ OPA_LINKINIT_REASON_LINKUP;
+
+ /* enable the port */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ handle_linkup_change(dd, 1);
+ }
+ break;
+ case HLS_UP_ARMED:
+ if (ppd->host_link_state != HLS_UP_INIT)
+ goto unexpected;
+
+ ppd->host_link_state = HLS_UP_ARMED;
+ set_logical_state(dd, LSTATE_ARMED);
+ ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at init */
+ ppd->host_link_state = HLS_UP_INIT;
+ dd_dev_err(dd,
+ "%s: logical state did not change to ARMED\n",
+ __func__);
+ }
+ /*
+ * The simulator does not currently implement SMA messages,
+ * so neighbor_normal is not set. Set it here when we first
+ * move to Armed.
+ */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ ppd->neighbor_normal = 1;
+ break;
+ case HLS_UP_ACTIVE:
+ if (ppd->host_link_state != HLS_UP_ARMED)
+ goto unexpected;
+
+ ppd->host_link_state = HLS_UP_ACTIVE;
+ set_logical_state(dd, LSTATE_ACTIVE);
+ ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at armed */
+ ppd->host_link_state = HLS_UP_ARMED;
+ dd_dev_err(dd,
+ "%s: logical state did not change to ACTIVE\n",
+ __func__);
+ } else {
+
+ /* tell all engines to go running */
+ sdma_all_running(dd);
+
+ /* Signal the IB layer that the port has went active */
+ event.device = &dd->verbs_dev.ibdev;
+ event.element.port_num = ppd->port;
+ event.event = IB_EVENT_PORT_ACTIVE;
+ }
+ break;
+ case HLS_DN_POLL:
+ if ((ppd->host_link_state == HLS_DN_DISABLE ||
+ ppd->host_link_state == HLS_DN_OFFLINE) &&
+ dd->dc_shutdown)
+ dc_start(dd);
+ /* Hand LED control to the DC */
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+ if (ppd->host_link_state != HLS_DN_OFFLINE) {
+ u8 tmp = ppd->link_enabled;
+
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (ret) {
+ ppd->link_enabled = tmp;
+ break;
+ }
+ ppd->remote_link_down_reason = 0;
+
+ if (ppd->driver_link_ready)
+ ppd->link_enabled = 1;
+ }
+
+ ret = set_local_link_attributes(ppd);
+ if (ret)
+ break;
+
+ ppd->port_error_action = 0;
+ ppd->host_link_state = HLS_DN_POLL;
+
+ if (quick_linkup) {
+ /* quick linkup does not go into polling */
+ ret = do_quick_linkup(dd);
+ } else {
+ ret1 = set_physical_link_state(dd, PLS_POLLING);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Polling link state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ }
+ }
+ ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+ /*
+ * If an error occurred above, go back to offline. The
+ * caller may reschedule another attempt.
+ */
+ if (ret)
+ goto_offline(ppd, 0);
+ break;
+ case HLS_DN_DISABLE:
+ /* link is disabled */
+ ppd->link_enabled = 0;
+
+ /* allow any state to transition to disabled */
+
+ /* must transition to offline first */
+ if (ppd->host_link_state != HLS_DN_OFFLINE) {
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (ret)
+ break;
+ ppd->remote_link_down_reason = 0;
+ }
+
+ ret1 = set_physical_link_state(dd, PLS_DISABLED);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Disabled link state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ break;
+ }
+ ppd->host_link_state = HLS_DN_DISABLE;
+ dc_shutdown(dd);
+ break;
+ case HLS_DN_OFFLINE:
+ if (ppd->host_link_state == HLS_DN_DISABLE)
+ dc_start(dd);
+
+ /* allow any state to transition to offline */
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (!ret)
+ ppd->remote_link_down_reason = 0;
+ break;
+ case HLS_VERIFY_CAP:
+ if (ppd->host_link_state != HLS_DN_POLL)
+ goto unexpected;
+ ppd->host_link_state = HLS_VERIFY_CAP;
+ break;
+ case HLS_GOING_UP:
+ if (ppd->host_link_state != HLS_VERIFY_CAP)
+ goto unexpected;
+
+ ret1 = set_physical_link_state(dd, PLS_LINKUP);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to link up state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ break;
+ }
+ ppd->host_link_state = HLS_GOING_UP;
+ break;
+
+ case HLS_GOING_OFFLINE: /* transient within goto_offline() */
+ case HLS_LINK_COOLDOWN: /* transient within goto_offline() */
+ default:
+ dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+ __func__, state);
+ ret = -EINVAL;
+ break;
+ }
+
+ is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
+ HLS_DN_DISABLE | HLS_DN_OFFLINE));
+
+ if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
+ ppd->neigh_link_down_reason.sma == 0) {
+ ppd->local_link_down_reason.sma =
+ ppd->local_link_down_reason.latest;
+ ppd->neigh_link_down_reason.sma =
+ ppd->neigh_link_down_reason.latest;
+ }
+
+ goto done;
+
+unexpected:
+ dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+ __func__, link_state_name(ppd->host_link_state),
+ link_state_name(state));
+ ret = -EINVAL;
+
+done:
+ mutex_unlock(&ppd->hls_lock);
+
+ if (event.device)
+ ib_dispatch_event(&event);
+
+ return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+ u64 reg;
+ int ret = 0;
+
+ switch (which) {
+ case HFI1_IB_CFG_LIDLMC:
+ set_lidlmc(ppd);
+ break;
+ case HFI1_IB_CFG_VL_HIGH_LIMIT:
+ /*
+ * The VL Arbitrator high limit is sent in units of 4k
+ * bytes, while HFI stores it in units of 64 bytes.
+ */
+ val *= 4096/64;
+ reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+ << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+ write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+ break;
+ case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* HFI only supports POLL as the default link down state */
+ if (val != HLS_DN_POLL)
+ ret = -EINVAL;
+ break;
+ case HFI1_IB_CFG_OP_VLS:
+ if (ppd->vls_operational != val) {
+ ppd->vls_operational = val;
+ if (!ppd->port)
+ ret = -EINVAL;
+ else
+ ret = sdma_map_init(
+ ppd->dd,
+ ppd->port - 1,
+ val,
+ NULL);
+ }
+ break;
+ /*
+ * For link width, link width downgrade, and speed enable, always AND
+ * the setting with what is actually supported. This has two benefits.
+ * First, enabled can't have unsupported values, no matter what the
+ * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean
+ * "fill in with your supported value" have all the bits in the
+ * field set, so simply ANDing with supported has the desired result.
+ */
+ case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+ ppd->link_width_enabled = val & ppd->link_width_supported;
+ break;
+ case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+ ppd->link_width_downgrade_enabled =
+ val & ppd->link_width_downgrade_supported;
+ break;
+ case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+ ppd->link_speed_enabled = val & ppd->link_speed_supported;
+ break;
+ case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ /*
+ * HFI does not follow IB specs, save this value
+ * so we can report it, if asked.
+ */
+ ppd->overrun_threshold = val;
+ break;
+ case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ /*
+ * HFI does not follow IB specs, save this value
+ * so we can report it, if asked.
+ */
+ ppd->phy_error_threshold = val;
+ break;
+
+ case HFI1_IB_CFG_MTU:
+ set_send_length(ppd);
+ break;
+
+ case HFI1_IB_CFG_PKEYS:
+ if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+ set_partition_keys(ppd);
+ break;
+
+ default:
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(ppd->dd,
+ "%s: which %s, val 0x%x: not implemented\n",
+ __func__, ib_cfg_name(which), val);
+ break;
+ }
+ return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+ int i;
+
+ BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+ VL_ARB_LOW_PRIO_TABLE_SIZE);
+ BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+ VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+ /*
+ * Note that we always return values directly from the
+ * 'vl_arb_cache' (and do no CSR reads) in response to a
+ * 'Get(VLArbTable)'. This is obviously correct after a
+ * 'Set(VLArbTable)', since the cache will then be up to
+ * date. But it's also correct prior to any 'Set(VLArbTable)'
+ * since then both the cache, and the relevant h/w registers
+ * will be zeroed.
+ */
+
+ for (i = 0; i < MAX_PRIO_TABLE; i++)
+ spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+ if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+ return NULL;
+ spin_lock(&ppd->vl_arb_cache[idx].lock);
+ return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+ spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+ u32 size, struct ib_vl_weight_elem *vl)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+ unsigned int i, is_up = 0;
+ int drain, ret = 0;
+
+ mutex_lock(&ppd->hls_lock);
+
+ if (ppd->host_link_state & HLS_UP)
+ is_up = 1;
+
+ drain = !is_ax(dd) && is_up;
+
+ if (drain)
+ /*
+ * Before adjusting VL arbitration weights, empty per-VL
+ * FIFOs, otherwise a packet whose VL weight is being
+ * set to 0 could get stuck in a FIFO with no chance to
+ * egress.
+ */
+ ret = stop_drain_data_vls(dd);
+
+ if (ret) {
+ dd_dev_err(
+ dd,
+ "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+ __func__);
+ goto err;
+ }
+
+ for (i = 0; i < size; i++, vl++) {
+ /*
+ * NOTE: The low priority shift and mask are used here, but
+ * they are the same for both the low and high registers.
+ */
+ reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+ << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+ | (((u64)vl->weight
+ & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+ << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+ write_csr(dd, target + (i * 8), reg);
+ }
+ pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+ if (drain)
+ open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+ mutex_unlock(&ppd->hls_lock);
+
+ return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+ struct vl_limit *vll)
+{
+ u64 reg = read_csr(dd, csr);
+
+ vll->dedicated = cpu_to_be16(
+ (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+ & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+ vll->shared = cpu_to_be16(
+ (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+ & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+ struct buffer_control *bc, u16 *overall_limit)
+{
+ u64 reg;
+ int i;
+
+ /* not all entries are filled in */
+ memset(bc, 0, sizeof(*bc));
+
+ /* OPA and HFI have a 1-1 mapping */
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
+
+ /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+ read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ bc->overall_shared_limit = cpu_to_be16(
+ (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+ & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+ if (overall_limit)
+ *overall_limit = (reg
+ >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+ & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+ return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+ u64 reg;
+ int i;
+
+ /* each register contains 16 SC->VLnt mappings, 4 bits each */
+ reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+ for (i = 0; i < sizeof(u64); i++) {
+ u8 byte = *(((u8 *)&reg) + i);
+
+ dp->vlnt[2 * i] = byte & 0xf;
+ dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+ }
+
+ reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+ for (i = 0; i < sizeof(u64); i++) {
+ u8 byte = *(((u8 *)&reg) + i);
+
+ dp->vlnt[16 + (2 * i)] = byte & 0xf;
+ dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+ }
+ return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+ struct ib_vl_weight_elem *vl)
+{
+ unsigned int i;
+
+ for (i = 0; i < nelems; i++, vl++) {
+ vl->vl = 0xf;
+ vl->weight = 0;
+ }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+ DC_SC_VL_VAL(15_0,
+ 0, dp->vlnt[0] & 0xf,
+ 1, dp->vlnt[1] & 0xf,
+ 2, dp->vlnt[2] & 0xf,
+ 3, dp->vlnt[3] & 0xf,
+ 4, dp->vlnt[4] & 0xf,
+ 5, dp->vlnt[5] & 0xf,
+ 6, dp->vlnt[6] & 0xf,
+ 7, dp->vlnt[7] & 0xf,
+ 8, dp->vlnt[8] & 0xf,
+ 9, dp->vlnt[9] & 0xf,
+ 10, dp->vlnt[10] & 0xf,
+ 11, dp->vlnt[11] & 0xf,
+ 12, dp->vlnt[12] & 0xf,
+ 13, dp->vlnt[13] & 0xf,
+ 14, dp->vlnt[14] & 0xf,
+ 15, dp->vlnt[15] & 0xf));
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+ DC_SC_VL_VAL(31_16,
+ 16, dp->vlnt[16] & 0xf,
+ 17, dp->vlnt[17] & 0xf,
+ 18, dp->vlnt[18] & 0xf,
+ 19, dp->vlnt[19] & 0xf,
+ 20, dp->vlnt[20] & 0xf,
+ 21, dp->vlnt[21] & 0xf,
+ 22, dp->vlnt[22] & 0xf,
+ 23, dp->vlnt[23] & 0xf,
+ 24, dp->vlnt[24] & 0xf,
+ 25, dp->vlnt[25] & 0xf,
+ 26, dp->vlnt[26] & 0xf,
+ 27, dp->vlnt[27] & 0xf,
+ 28, dp->vlnt[28] & 0xf,
+ 29, dp->vlnt[29] & 0xf,
+ 30, dp->vlnt[30] & 0xf,
+ 31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+ u16 limit)
+{
+ if (limit != 0)
+ dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+ what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGLobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+ u64 reg;
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+ reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGLobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+ u64 reg;
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+ reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+ u64 reg;
+ u32 addr;
+
+ if (vl < TXE_NUM_DATA_VL)
+ addr = SEND_CM_CREDIT_VL + (8 * vl);
+ else
+ addr = SEND_CM_CREDIT_VL15;
+
+ reg = read_csr(dd, addr);
+ reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+ reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+ write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+ u64 reg;
+ u32 addr;
+
+ if (vl < TXE_NUM_DATA_VL)
+ addr = SEND_CM_CREDIT_VL + (8 * vl);
+ else
+ addr = SEND_CM_CREDIT_VL15;
+
+ reg = read_csr(dd, addr);
+ reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+ reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+ write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+ const char *which)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+ if (reg == 0)
+ return; /* success */
+ if (time_after(jiffies, timeout))
+ break; /* timed out */
+ udelay(1);
+ }
+
+ dd_dev_err(dd,
+ "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+ which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+ /*
+ * If this occurs, it is likely there was a credit loss on the link.
+ * The only recovery from that is a link bounce.
+ */
+ dd_dev_err(dd,
+ "Continuing anyway. A credit loss may occur. Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented. In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ * set Global_Shared_Credit_Limit = 0
+ * use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ * mask1 = all VLs that are lowering dedicated limits
+ * lower Dedicated_Limit[mask1]
+ * spin until Return_Credit_Status[mask1] == 0
+ * raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ * earlier in the algorithm), set the new limit to the new value
+ */
+static int set_buffer_control(struct hfi1_devdata *dd,
+ struct buffer_control *new_bc)
+{
+ u64 changing_mask, ld_mask, stat_mask;
+ int change_count;
+ int i, use_all_mask;
+ int this_shared_changing;
+ /*
+ * A0: add the variable any_shared_limit_changing below and in the
+ * algorithm above. If removing A0 support, it can be removed.
+ */
+ int any_shared_limit_changing;
+ struct buffer_control cur_bc;
+ u8 changing[OPA_MAX_VLS];
+ u8 lowering_dedicated[OPA_MAX_VLS];
+ u16 cur_total;
+ u32 new_total = 0;
+ const u64 all_mask =
+ SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16 /* look at VL15 and less */
+
+
+ /* find the new total credits, do sanity check on unused VLs */
+ for (i = 0; i < OPA_MAX_VLS; i++) {
+ if (valid_vl(i)) {
+ new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+ continue;
+ }
+ nonzero_msg(dd, i, "dedicated",
+ be16_to_cpu(new_bc->vl[i].dedicated));
+ nonzero_msg(dd, i, "shared",
+ be16_to_cpu(new_bc->vl[i].shared));
+ new_bc->vl[i].dedicated = 0;
+ new_bc->vl[i].shared = 0;
+ }
+ new_total += be16_to_cpu(new_bc->overall_shared_limit);
+ if (new_total > (u32)dd->link_credits)
+ return -EINVAL;
+ /* fetch the current values */
+ get_buffer_control(dd, &cur_bc, &cur_total);
+
+ /*
+ * Create the masks we will use.
+ */
+ memset(changing, 0, sizeof(changing));
+ memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+ /* NOTE: Assumes that the individual VL bits are adjacent and in
+ increasing order */
+ stat_mask =
+ SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+ changing_mask = 0;
+ ld_mask = 0;
+ change_count = 0;
+ any_shared_limit_changing = 0;
+ for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+ if (!valid_vl(i))
+ continue;
+ this_shared_changing = new_bc->vl[i].shared
+ != cur_bc.vl[i].shared;
+ if (this_shared_changing)
+ any_shared_limit_changing = 1;
+ if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
+ || this_shared_changing) {
+ changing[i] = 1;
+ changing_mask |= stat_mask;
+ change_count++;
+ }
+ if (be16_to_cpu(new_bc->vl[i].dedicated) <
+ be16_to_cpu(cur_bc.vl[i].dedicated)) {
+ lowering_dedicated[i] = 1;
+ ld_mask |= stat_mask;
+ }
+ }
+
+ /* bracket the credit change with a total adjustment */
+ if (new_total > cur_total)
+ set_global_limit(dd, new_total);
+
+ /*
+ * Start the credit change algorithm.
+ */
+ use_all_mask = 0;
+ if ((be16_to_cpu(new_bc->overall_shared_limit) <
+ be16_to_cpu(cur_bc.overall_shared_limit))
+ || (is_a0(dd) && any_shared_limit_changing)) {
+ set_global_shared(dd, 0);
+ cur_bc.overall_shared_limit = 0;
+ use_all_mask = 1;
+ }
+
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (changing[i]) {
+ set_vl_shared(dd, i, 0);
+ cur_bc.vl[i].shared = 0;
+ }
+ }
+
+ wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+ "shared");
+
+ if (change_count > 0) {
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (lowering_dedicated[i]) {
+ set_vl_dedicated(dd, i,
+ be16_to_cpu(new_bc->vl[i].dedicated));
+ cur_bc.vl[i].dedicated =
+ new_bc->vl[i].dedicated;
+ }
+ }
+
+ wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+ /* now raise all dedicated that are going up */
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (be16_to_cpu(new_bc->vl[i].dedicated) >
+ be16_to_cpu(cur_bc.vl[i].dedicated))
+ set_vl_dedicated(dd, i,
+ be16_to_cpu(new_bc->vl[i].dedicated));
+ }
+ }
+
+ /* next raise all shared that are going up */
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (be16_to_cpu(new_bc->vl[i].shared) >
+ be16_to_cpu(cur_bc.vl[i].shared))
+ set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+ }
+
+ /* finally raise the global shared */
+ if (be16_to_cpu(new_bc->overall_shared_limit) >
+ be16_to_cpu(cur_bc.overall_shared_limit))
+ set_global_shared(dd,
+ be16_to_cpu(new_bc->overall_shared_limit));
+
+ /* bracket the credit change with a total adjustment */
+ if (new_total < cur_total)
+ set_global_limit(dd, new_total);
+ return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+
+{
+ int size;
+ struct vl_arb_cache *vlc;
+
+ switch (which) {
+ case FM_TBL_VL_HIGH_ARB:
+ size = 256;
+ /*
+ * OPA specifies 128 elements (of 2 bytes each), though
+ * HFI supports only 16 elements in h/w.
+ */
+ vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+ vl_arb_get_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ break;
+ case FM_TBL_VL_LOW_ARB:
+ size = 256;
+ /*
+ * OPA specifies 128 elements (of 2 bytes each), though
+ * HFI supports only 16 elements in h/w.
+ */
+ vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+ vl_arb_get_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ break;
+ case FM_TBL_BUFFER_CONTROL:
+ size = get_buffer_control(ppd->dd, t, NULL);
+ break;
+ case FM_TBL_SC2VLNT:
+ size = get_sc2vlnt(ppd->dd, t);
+ break;
+ case FM_TBL_VL_PREEMPT_ELEMS:
+ size = 256;
+ /* OPA specifies 128 elements, of 2 bytes each */
+ get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+ break;
+ case FM_TBL_VL_PREEMPT_MATRIX:
+ size = 256;
+ /*
+ * OPA specifies that this is the same size as the VL
+ * arbitration tables (i.e., 256 bytes).
+ */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+ int ret = 0;
+ struct vl_arb_cache *vlc;
+
+ switch (which) {
+ case FM_TBL_VL_HIGH_ARB:
+ vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+ if (vl_arb_match_cache(vlc, t)) {
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ break;
+ }
+ vl_arb_set_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+ VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+ break;
+ case FM_TBL_VL_LOW_ARB:
+ vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+ if (vl_arb_match_cache(vlc, t)) {
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ break;
+ }
+ vl_arb_set_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+ VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+ break;
+ case FM_TBL_BUFFER_CONTROL:
+ ret = set_buffer_control(ppd->dd, t);
+ break;
+ case FM_TBL_SC2VLNT:
+ set_sc2vlnt(ppd->dd, t);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+ if (is_a0(dd))
+ return 1;
+
+ pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+ return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+ if (is_a0(dd))
+ return 1;
+
+ pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+ return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called,
+ * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
+ * engines to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+ sc_wait(dd);
+ sdma_wait(dd);
+ pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs. This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = disable_data_vls(dd);
+ if (ret == 0)
+ drain_data_vls(dd);
+
+ return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count. No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+ u32 cclocks;
+
+ if (dd->icode == ICODE_FPGA_EMULATION)
+ cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+ else /* simulation pretends to be ASIC */
+ cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+ if (ns && !cclocks) /* if ns nonzero, must be at least 1 */
+ cclocks = 1;
+ return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds. Not matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+ u32 ns;
+
+ if (dd->icode == ICODE_FPGA_EMULATION)
+ ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+ else /* simulation pretends to be ASIC */
+ ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+ if (cclocks && !ns)
+ ns = 1;
+ return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 timeout = rcd->rcvavail_timeout;
+
+ /*
+ * This algorithm doubles or halves the timeout depending on whether
+ * the number of packets received in this interrupt were less than or
+ * greater equal the interrupt count.
+ *
+ * The calculations below do not allow a steady state to be achieved.
+ * Only at the endpoints it is possible to have an unchanging
+ * timeout.
+ */
+ if (npkts < rcv_intr_count) {
+ /*
+ * Not enough packets arrived before the timeout, adjust
+ * timeout downward.
+ */
+ if (timeout < 2) /* already at minimum? */
+ return;
+ timeout >>= 1;
+ } else {
+ /*
+ * More than enough packets arrived before the timeout, adjust
+ * timeout upward.
+ */
+ if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+ return;
+ timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+ }
+
+ rcd->rcvavail_timeout = timeout;
+ /* timeout cannot be larger than rcv_intr_timeout_csr which has already
+ been verified to be in range */
+ write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+ (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+ u32 intr_adjust, u32 npkts)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u64 reg;
+ u32 ctxt = rcd->ctxt;
+
+ /*
+ * Need to write timeout register before updating RcvHdrHead to ensure
+ * that a new value is used when the HW decides to restart counting.
+ */
+ if (intr_adjust)
+ adjust_rcv_timeout(rcd, npkts);
+ if (updegr) {
+ reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+ << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+ write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+ }
+ mmiowb();
+ reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+ (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+ << RCV_HDR_HEAD_HEAD_SHIFT);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+ mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+ & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = get_rcvhdrtail(rcd);
+ else
+ tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+ return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ * 0x0 invalid
+ * 0x1 4 KB
+ * 0x2 8 KB
+ * 0x3 16 KB
+ * 0x4 32 KB
+ * 0x5 64 KB
+ * 0x6 128 KB
+ * 0x7 256 KB
+ * 0x8 512 KB (Receive Array only)
+ * 0x9 1 MB (Receive Array only)
+ * 0xa 2 MB (Receive Array only)
+ *
+ * 0xB-0xF - reserved (Receive Array only)
+ *
+ *
+ * This routine assumes that the value has already been sanity checked.
+ */
+static u32 encoded_size(u32 size)
+{
+ switch (size) {
+ case 4*1024: return 0x1;
+ case 8*1024: return 0x2;
+ case 16*1024: return 0x3;
+ case 32*1024: return 0x4;
+ case 64*1024: return 0x5;
+ case 128*1024: return 0x6;
+ case 256*1024: return 0x7;
+ case 512*1024: return 0x8;
+ case 1*1024*1024: return 0x9;
+ case 2*1024*1024: return 0xa;
+ }
+ return 0x1; /* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+ struct hfi1_ctxtdata *rcd;
+ u64 rcvctrl, reg;
+ int did_enable = 0;
+
+ rcd = dd->rcd[ctxt];
+ if (!rcd)
+ return;
+
+ hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+ rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+ /* if the context already enabled, don't do the extra steps */
+ if ((op & HFI1_RCVCTRL_CTXT_ENB)
+ && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+ /* reset the tail and hdr addresses, and sequence count */
+ write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+ rcd->rcvhdrq_phys);
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+ write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+ rcd->rcvhdrqtailaddr_phys);
+ rcd->seq_cnt = 1;
+
+ /* reset the cached receive header queue head value */
+ rcd->head = 0;
+
+ /*
+ * Zero the receive header queue so we don't get false
+ * positives when checking the sequence number. The
+ * sequence numbers could land exactly on the same spot.
+ * E.g. a rcd restart before the receive header wrapped.
+ */
+ memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+ /* starting timeout */
+ rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+ /* enable the context */
+ rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+ /* clean the egr buffer size first */
+ rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+ rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+ & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+ << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+ /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+ did_enable = 1;
+
+ /* zero RcvEgrIndexHead */
+ write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+ /* set eager count and base index */
+ reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+ & RCV_EGR_CTRL_EGR_CNT_MASK)
+ << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+ (((rcd->eager_base >> RCV_SHIFT)
+ & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+ << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+ /*
+ * Set TID (expected) count and base index.
+ * rcd->expected_count is set to individual RcvArray entries,
+ * not pairs, and the CSR takes a pair-count in groups of
+ * four, so divide by 8.
+ */
+ reg = (((rcd->expected_count >> RCV_SHIFT)
+ & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+ << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+ (((rcd->expected_base >> RCV_SHIFT)
+ & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+ << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+ if (ctxt == VL15CTXT)
+ write_csr(dd, RCV_VL15, VL15CTXT);
+ }
+ if (op & HFI1_RCVCTRL_CTXT_DIS) {
+ write_csr(dd, RCV_VL15, 0);
+ rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+ }
+ if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+ if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+ if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+ rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+ if (op & HFI1_RCVCTRL_TAILUPD_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+ if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+ if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+ if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+ /* In one-packet-per-eager mode, the size comes from
+ the RcvArray entry. */
+ rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+ rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+ }
+ if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+ if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+ rcd->rcvctrl = rcvctrl;
+ hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+ write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+ /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+ if (did_enable
+ && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+ reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+ if (reg != 0) {
+ dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+ ctxt, reg);
+ read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+ read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+ reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+ dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+ ctxt, reg, reg == 0 ? "not" : "still");
+ }
+ }
+
+ if (did_enable) {
+ /*
+ * The interrupt timeout and count must be set after
+ * the context is enabled to take effect.
+ */
+ /* set interrupt timeout */
+ write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+ (u64)rcd->rcvavail_timeout <<
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+ /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+ reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+ }
+
+ if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+ /*
+ * If the context has been disabled and the Tail Update has
+ * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so
+ * it doesn't contain an address that is invalid.
+ */
+ write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+ u64 **cntrp)
+{
+ int ret;
+ u64 val = 0;
+
+ if (namep) {
+ ret = dd->cntrnameslen;
+ if (pos != 0) {
+ dd_dev_err(dd, "read_cntrs does not support indexing");
+ return 0;
+ }
+ *namep = dd->cntrnames;
+ } else {
+ const struct cntr_entry *entry;
+ int i, j;
+
+ ret = (dd->ndevcntrs) * sizeof(u64);
+ if (pos != 0) {
+ dd_dev_err(dd, "read_cntrs does not support indexing");
+ return 0;
+ }
+
+ /* Get the start of the block of counters */
+ *cntrp = dd->cntrs;
+
+ /*
+ * Now go and fill in each counter in the block.
+ */
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ entry = &dev_cntrs[i];
+ hfi1_cdbg(CNTR, "reading %s", entry->name);
+ if (entry->flags & CNTR_DISABLED) {
+ /* Nothing */
+ hfi1_cdbg(CNTR, "\tDisabled\n");
+ } else {
+ if (entry->flags & CNTR_VL) {
+ hfi1_cdbg(CNTR, "\tPer VL\n");
+ for (j = 0; j < C_VL_COUNT; j++) {
+ val = entry->rw_cntr(entry,
+ dd, j,
+ CNTR_MODE_R,
+ 0);
+ hfi1_cdbg(
+ CNTR,
+ "\t\tRead 0x%llx for %d\n",
+ val, j);
+ dd->cntrs[entry->offset + j] =
+ val;
+ }
+ } else {
+ val = entry->rw_cntr(entry, dd,
+ CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+ dd->cntrs[entry->offset] = val;
+ hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+ char **namep, u64 **cntrp)
+{
+ int ret;
+ u64 val = 0;
+
+ if (namep) {
+ ret = dd->portcntrnameslen;
+ if (pos != 0) {
+ dd_dev_err(dd, "index not supported");
+ return 0;
+ }
+ *namep = dd->portcntrnames;
+ } else {
+ const struct cntr_entry *entry;
+ struct hfi1_pportdata *ppd;
+ int i, j;
+
+ ret = (dd->nportcntrs) * sizeof(u64);
+ if (pos != 0) {
+ dd_dev_err(dd, "indexing not supported");
+ return 0;
+ }
+ ppd = (struct hfi1_pportdata *)(dd + 1 + port);
+ *cntrp = ppd->cntrs;
+
+ for (i = 0; i < PORT_CNTR_LAST; i++) {
+ entry = &port_cntrs[i];
+ hfi1_cdbg(CNTR, "reading %s", entry->name);
+ if (entry->flags & CNTR_DISABLED) {
+ /* Nothing */
+ hfi1_cdbg(CNTR, "\tDisabled\n");
+ continue;
+ }
+
+ if (entry->flags & CNTR_VL) {
+ hfi1_cdbg(CNTR, "\tPer VL");
+ for (j = 0; j < C_VL_COUNT; j++) {
+ val = entry->rw_cntr(entry, ppd, j,
+ CNTR_MODE_R,
+ 0);
+ hfi1_cdbg(
+ CNTR,
+ "\t\tRead 0x%llx for %d",
+ val, j);
+ ppd->cntrs[entry->offset + j] = val;
+ }
+ } else {
+ val = entry->rw_cntr(entry, ppd,
+ CNTR_INVALID_VL,
+ CNTR_MODE_R,
+ 0);
+ ppd->cntrs[entry->offset] = val;
+ hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+ }
+ }
+ }
+ return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ if (dd->synth_stats_timer.data)
+ del_timer_sync(&dd->synth_stats_timer);
+ dd->synth_stats_timer.data = 0;
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ kfree(ppd->cntrs);
+ kfree(ppd->scntrs);
+ free_percpu(ppd->ibport_data.rc_acks);
+ free_percpu(ppd->ibport_data.rc_qacks);
+ free_percpu(ppd->ibport_data.rc_delayed_comp);
+ ppd->cntrs = NULL;
+ ppd->scntrs = NULL;
+ ppd->ibport_data.rc_acks = NULL;
+ ppd->ibport_data.rc_qacks = NULL;
+ ppd->ibport_data.rc_delayed_comp = NULL;
+ }
+ kfree(dd->portcntrnames);
+ dd->portcntrnames = NULL;
+ kfree(dd->cntrs);
+ dd->cntrs = NULL;
+ kfree(dd->scntrs);
+ dd->scntrs = NULL;
+ kfree(dd->cntrnames);
+ dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+ u64 *psval, void *context, int vl)
+{
+ u64 val;
+ u64 sval = *psval;
+
+ if (entry->flags & CNTR_DISABLED) {
+ dd_dev_err(dd, "Counter %s not enabled", entry->name);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+ /* If its a synthetic counter there is more work we need to do */
+ if (entry->flags & CNTR_SYNTH) {
+ if (sval == CNTR_MAX) {
+ /* No need to read already saturated */
+ return CNTR_MAX;
+ }
+
+ if (entry->flags & CNTR_32BIT) {
+ /* 32bit counters can wrap multiple times */
+ u64 upper = sval >> 32;
+ u64 lower = (sval << 32) >> 32;
+
+ if (lower > val) { /* hw wrapped */
+ if (upper == CNTR_32BIT_MAX)
+ val = CNTR_MAX;
+ else
+ upper++;
+ }
+
+ if (val != CNTR_MAX)
+ val = (upper << 32) | val;
+
+ } else {
+ /* If we rolled we are saturated */
+ if ((val < sval) || (val > CNTR_MAX))
+ val = CNTR_MAX;
+ }
+ }
+
+ *psval = val;
+
+ hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+ return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+ struct cntr_entry *entry,
+ u64 *psval, void *context, int vl, u64 data)
+{
+ u64 val;
+
+ if (entry->flags & CNTR_DISABLED) {
+ dd_dev_err(dd, "Counter %s not enabled", entry->name);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+ if (entry->flags & CNTR_SYNTH) {
+ *psval = data;
+ if (entry->flags & CNTR_32BIT) {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+ (data << 32) >> 32);
+ val = data; /* return the full 64bit value */
+ } else {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+ data);
+ }
+ } else {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+ }
+
+ *psval = val;
+
+ hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+ return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &dev_cntrs[index];
+ sval = dd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &dev_cntrs[index];
+ sval = dd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &port_cntrs[index];
+ sval = ppd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+ (index <= C_RCV_HDR_OVF_LAST)) {
+ /* We do not want to bother for disabled contexts */
+ return 0;
+ }
+
+ return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &port_cntrs[index];
+ sval = ppd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+ (index <= C_RCV_HDR_OVF_LAST)) {
+ /* We do not want to bother for disabled contexts */
+ return 0;
+ }
+
+ return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+ u64 cur_tx;
+ u64 cur_rx;
+ u64 total_flits;
+ u8 update = 0;
+ int i, j, vl;
+ struct hfi1_pportdata *ppd;
+ struct cntr_entry *entry;
+
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+ /*
+ * Rather than keep beating on the CSRs pick a minimal set that we can
+ * check to watch for potential roll over. We can do this by looking at
+ * the number of flits sent/recv. If the total flits exceeds 32bits then
+ * we have to iterate all the counters and update.
+ */
+ entry = &dev_cntrs[C_DC_RCV_FLITS];
+ cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+ entry = &dev_cntrs[C_DC_XMIT_FLITS];
+ cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+ hfi1_cdbg(
+ CNTR,
+ "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+ dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+ if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+ /*
+ * May not be strictly necessary to update but it won't hurt and
+ * simplifies the logic here.
+ */
+ update = 1;
+ hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+ dd->unit);
+ } else {
+ total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+ hfi1_cdbg(CNTR,
+ "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+ total_flits, (u64)CNTR_32BIT_MAX);
+ if (total_flits >= CNTR_32BIT_MAX) {
+ hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+ dd->unit);
+ update = 1;
+ }
+ }
+
+ if (update) {
+ hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ entry = &dev_cntrs[i];
+ if (entry->flags & CNTR_VL) {
+ for (vl = 0; vl < C_VL_COUNT; vl++)
+ read_dev_cntr(dd, i, vl);
+ } else {
+ read_dev_cntr(dd, i, CNTR_INVALID_VL);
+ }
+ }
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ for (j = 0; j < PORT_CNTR_LAST; j++) {
+ entry = &port_cntrs[j];
+ if (entry->flags & CNTR_VL) {
+ for (vl = 0; vl < C_VL_COUNT; vl++)
+ read_port_cntr(ppd, j, vl);
+ } else {
+ read_port_cntr(ppd, j, CNTR_INVALID_VL);
+ }
+ }
+ }
+
+ /*
+ * We want the value in the register. The goal is to keep track
+ * of the number of "ticks" not the counter value. In other
+ * words if the register rolls we want to notice it and go ahead
+ * and force an update.
+ */
+ entry = &dev_cntrs[C_DC_XMIT_FLITS];
+ dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+
+ entry = &dev_cntrs[C_DC_RCV_FLITS];
+ dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+
+ hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+ dd->unit, dd->last_tx, dd->last_rx);
+
+ } else {
+ hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+ }
+
+mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for /0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+ int i, rcv_ctxts, index, j;
+ size_t sz;
+ char *p;
+ char name[C_MAX_NAME];
+ struct hfi1_pportdata *ppd;
+
+ /* set up the stats timer; the add_timer is done at the end */
+ init_timer(&dd->synth_stats_timer);
+ dd->synth_stats_timer.function = update_synth_timer;
+ dd->synth_stats_timer.data = (unsigned long) dd;
+
+ /***********************/
+ /* per device counters */
+ /***********************/
+
+ /* size names and determine how many we have*/
+ dd->ndevcntrs = 0;
+ sz = 0;
+ index = 0;
+
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
+ if (dev_cntrs[i].flags & CNTR_DISABLED) {
+ hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+ continue;
+ }
+
+ if (dev_cntrs[i].flags & CNTR_VL) {
+ hfi1_dbg_early("\tProcessing VL cntr\n");
+ dev_cntrs[i].offset = index;
+ for (j = 0; j < C_VL_COUNT; j++) {
+ memset(name, '\0', C_MAX_NAME);
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name,
+ vl_from_idx(j));
+ sz += strlen(name);
+ sz++;
+ hfi1_dbg_early("\t\t%s\n", name);
+ dd->ndevcntrs++;
+ index++;
+ }
+ } else {
+ /* +1 for newline */
+ sz += strlen(dev_cntrs[i].name) + 1;
+ dd->ndevcntrs++;
+ dev_cntrs[i].offset = index;
+ index++;
+ hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
+ }
+ }
+
+ /* allocate space for the counter values */
+ dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+ if (!dd->cntrs)
+ goto bail;
+
+ dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+ if (!dd->scntrs)
+ goto bail;
+
+
+ /* allocate space for the counter names */
+ dd->cntrnameslen = sz;
+ dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+ if (!dd->cntrnames)
+ goto bail;
+
+ /* fill in the names */
+ for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
+ if (dev_cntrs[i].flags & CNTR_DISABLED) {
+ /* Nothing */
+ } else {
+ if (dev_cntrs[i].flags & CNTR_VL) {
+ for (j = 0; j < C_VL_COUNT; j++) {
+ memset(name, '\0', C_MAX_NAME);
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name,
+ vl_from_idx(j));
+ memcpy(p, name, strlen(name));
+ p += strlen(name);
+ *p++ = '\n';
+ }
+ } else {
+ memcpy(p, dev_cntrs[i].name,
+ strlen(dev_cntrs[i].name));
+ p += strlen(dev_cntrs[i].name);
+ *p++ = '\n';
+ }
+ index++;
+ }
+ }
+
+ /*********************/
+ /* per port counters */
+ /*********************/
+
+ /*
+ * Go through the counters for the overflows and disable the ones we
+ * don't need. This varies based on platform so we need to do it
+ * dynamically here.
+ */
+ rcv_ctxts = dd->num_rcv_contexts;
+ for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+ i <= C_RCV_HDR_OVF_LAST; i++) {
+ port_cntrs[i].flags |= CNTR_DISABLED;
+ }
+
+ /* size port counter names and determine how many we have*/
+ sz = 0;
+ dd->nportcntrs = 0;
+ for (i = 0; i < PORT_CNTR_LAST; i++) {
+ hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
+ if (port_cntrs[i].flags & CNTR_DISABLED) {
+ hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+ continue;
+ }
+
+ if (port_cntrs[i].flags & CNTR_VL) {
+ hfi1_dbg_early("\tProcessing VL cntr\n");
+ port_cntrs[i].offset = dd->nportcntrs;
+ for (j = 0; j < C_VL_COUNT; j++) {
+ memset(name, '\0', C_MAX_NAME);
+ snprintf(name, C_MAX_NAME, "%s%d",
+ port_cntrs[i].name,
+ vl_from_idx(j));
+ sz += strlen(name);
+ sz++;
+ hfi1_dbg_early("\t\t%s\n", name);
+ dd->nportcntrs++;
+ }
+ } else {
+ /* +1 for newline */
+ sz += strlen(port_cntrs[i].name) + 1;
+ port_cntrs[i].offset = dd->nportcntrs;
+ dd->nportcntrs++;
+ hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
+ }
+ }
+
+ /* allocate space for the counter names */
+ dd->portcntrnameslen = sz;
+ dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+ if (!dd->portcntrnames)
+ goto bail;
+
+ /* fill in port cntr names */
+ for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+ if (port_cntrs[i].flags & CNTR_DISABLED)
+ continue;
+
+ if (port_cntrs[i].flags & CNTR_VL) {
+ for (j = 0; j < C_VL_COUNT; j++) {
+ memset(name, '\0', C_MAX_NAME);
+ snprintf(name, C_MAX_NAME, "%s%d",
+ port_cntrs[i].name,
+ vl_from_idx(j));
+ memcpy(p, name, strlen(name));
+ p += strlen(name);
+ *p++ = '\n';
+ }
+ } else {
+ memcpy(p, port_cntrs[i].name,
+ strlen(port_cntrs[i].name));
+ p += strlen(port_cntrs[i].name);
+ *p++ = '\n';
+ }
+ }
+
+ /* allocate per port storage for counter values */
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+ if (!ppd->cntrs)
+ goto bail;
+
+ ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+ if (!ppd->scntrs)
+ goto bail;
+ }
+
+ /* CPU counters need to be allocated and zeroed */
+ if (init_cpu_counters(dd))
+ goto bail;
+
+ mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+ return 0;
+bail:
+ free_cntrs(dd);
+ return -ENOMEM;
+}
+
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+ switch (chip_lstate) {
+ default:
+ dd_dev_err(dd,
+ "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+ chip_lstate);
+ /* fall through */
+ case LSTATE_DOWN:
+ return IB_PORT_DOWN;
+ case LSTATE_INIT:
+ return IB_PORT_INIT;
+ case LSTATE_ARMED:
+ return IB_PORT_ARMED;
+ case LSTATE_ACTIVE:
+ return IB_PORT_ACTIVE;
+ }
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+ /* look at the HFI meta-states only */
+ switch (chip_pstate & 0xf0) {
+ default:
+ dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+ chip_pstate);
+ /* fall through */
+ case PLS_DISABLED:
+ return IB_PORTPHYSSTATE_DISABLED;
+ case PLS_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case PLS_POLLING:
+ return IB_PORTPHYSSTATE_POLLING;
+ case PLS_CONFIGPHY:
+ return IB_PORTPHYSSTATE_TRAINING;
+ case PLS_LINKUP:
+ return IB_PORTPHYSSTATE_LINKUP;
+ case PLS_PHYTEST:
+ return IB_PORTPHYSSTATE_PHY_TEST;
+ }
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+ static const char * const port_logical_names[] = {
+ "PORT_NOP",
+ "PORT_DOWN",
+ "PORT_INIT",
+ "PORT_ARMED",
+ "PORT_ACTIVE",
+ "PORT_ACTIVE_DEFER",
+ };
+ if (lstate < ARRAY_SIZE(port_logical_names))
+ return port_logical_names[lstate];
+ return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+ static const char * const port_physical_names[] = {
+ "PHYS_NOP",
+ "reserved1",
+ "PHYS_POLL",
+ "PHYS_DISABLED",
+ "PHYS_TRAINING",
+ "PHYS_LINKUP",
+ "PHYS_LINK_ERR_RECOVER",
+ "PHYS_PHY_TEST",
+ "reserved8",
+ "PHYS_OFFLINE",
+ "PHYS_GANGED",
+ "PHYS_TEST",
+ };
+ if (pstate < ARRAY_SIZE(port_physical_names))
+ return port_physical_names[pstate];
+ return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+ u32 new_state;
+
+ new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+ if (new_state != ppd->lstate) {
+ dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+ opa_lstate_name(new_state), new_state);
+ ppd->lstate = new_state;
+ }
+ /*
+ * Set port status flags in the page mapped into userspace
+ * memory. Do it here to ensure a reliable state - this is
+ * the only function called by all state handling code.
+ * Always set the flags due to the fact that the cache value
+ * might have been changed explicitly outside of this
+ * function.
+ */
+ if (ppd->statusp) {
+ switch (ppd->lstate) {
+ case IB_PORT_DOWN:
+ case IB_PORT_INIT:
+ *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+ HFI1_STATUS_IB_READY);
+ break;
+ case IB_PORT_ARMED:
+ *ppd->statusp |= HFI1_STATUS_IB_CONF;
+ break;
+ case IB_PORT_ACTIVE:
+ *ppd->statusp |= HFI1_STATUS_IB_READY;
+ break;
+ }
+ }
+ return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+ int msecs)
+{
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ if (get_logical_state(ppd) == state)
+ return 0;
+ if (time_after(jiffies, timeout))
+ break;
+ msleep(20);
+ }
+ dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+ return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+ static u32 remembered_state = 0xff;
+ u32 pstate;
+ u32 ib_pstate;
+
+ pstate = read_physical_state(ppd->dd);
+ ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+ if (remembered_state != ib_pstate) {
+ dd_dev_info(ppd->dd,
+ "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+ __func__, opa_pstate_name(ib_pstate), ib_pstate,
+ pstate);
+ remembered_state = ib_pstate;
+ }
+ return ib_pstate;
+}
+
+/*
+ * Read/modify/write ASIC_QSFP register bits as selected by mask
+ * data: 0 or 1 in the positions depending on what needs to be written
+ * dir: 0 for read, 1 for write
+ * mask: select by setting
+ * I2CCLK (bit 0)
+ * I2CDATA (bit 1)
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+ u32 mask)
+{
+ u64 qsfp_oe, target_oe;
+
+ target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+ if (mask) {
+ /* We are writing register bits, so lock access */
+ dir &= mask;
+ data &= mask;
+
+ qsfp_oe = read_csr(dd, target_oe);
+ qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
+ write_csr(dd, target_oe, qsfp_oe);
+ }
+ /* We are exclusively reading bits here, but it is unlikely
+ * we'll get valid data when we set the direction of the pin
+ * in the same call, so read should call this function again
+ * to get valid data
+ */
+ return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+ if (sc != NULL) {
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg;
+ u8 set = (sc->type == SC_USER ?
+ HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+ HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+ reg = read_kctxt_csr(dd, sc->hw_context,
+ SEND_CTXT_CHECK_ENABLE);
+ if (set)
+ CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+ else
+ SET_STATIC_RATE_CONTROL_SMASK(reg);
+ write_kctxt_csr(dd, sc->hw_context,
+ SEND_CTXT_CHECK_ENABLE, reg);
+ }
+ return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+ int ret = 0;
+ u64 reg;
+
+ if (dd->icode != ICODE_RTL_SILICON) {
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+ __func__);
+ return -EINVAL;
+ }
+ reg = read_csr(dd, ASIC_STS_THERM);
+ temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+ ASIC_STS_THERM_CURR_TEMP_MASK);
+ temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+ ASIC_STS_THERM_LO_TEMP_MASK);
+ temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+ ASIC_STS_THERM_HI_TEMP_MASK);
+ temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+ ASIC_STS_THERM_CRIT_TEMP_MASK);
+ /* triggers is a 3-bit value - 1 bit per trigger. */
+ temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+ return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+ int i;
+
+ /*
+ * In HFI, the mask needs to be 1 to allow interrupts.
+ */
+ if (enable) {
+ u64 cce_int_mask;
+ const int qsfp1_int_smask = QSFP1_INT % 64;
+ const int qsfp2_int_smask = QSFP2_INT % 64;
+
+ /* enable all interrupts */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
+
+ /*
+ * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+ * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+ * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+ * the index of the appropriate CSR in the CCEIntMask CSR array
+ */
+ cce_int_mask = read_csr(dd, CCE_INT_MASK +
+ (8*(QSFP1_INT/64)));
+ if (dd->hfi1_id) {
+ cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+ write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
+ cce_int_mask);
+ } else {
+ cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+ write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
+ cce_int_mask);
+ }
+ } else {
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+ }
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
+
+ write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+ write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+ write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+ write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+ pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* remove irqs - must happen before disabling/turning off */
+ if (dd->num_msix_entries) {
+ /* MSI-X */
+ struct hfi1_msix_entry *me = dd->msix_entries;
+
+ for (i = 0; i < dd->num_msix_entries; i++, me++) {
+ if (me->arg == NULL) /* => no irq, no affinity */
+ break;
+ irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
+ NULL);
+ free_irq(me->msix.vector, me->arg);
+ }
+ } else {
+ /* INTx */
+ if (dd->requested_intx_irq) {
+ free_irq(dd->pcidev->irq, dd);
+ dd->requested_intx_irq = 0;
+ }
+ }
+
+ /* turn off interrupts */
+ if (dd->num_msix_entries) {
+ /* MSI-X */
+ hfi1_nomsix(dd);
+ } else {
+ /* INTx */
+ disable_intx(dd->pcidev);
+ }
+
+ /* clean structures */
+ for (i = 0; i < dd->num_msix_entries; i++)
+ free_cpumask_var(dd->msix_entries[i].mask);
+ kfree(dd->msix_entries);
+ dd->msix_entries = NULL;
+ dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+ u64 reg;
+ int m, n;
+
+ /* clear from the handled mask of the general interrupt */
+ m = isrc / 64;
+ n = isrc % 64;
+ dd->gi_mask[m] &= ~((u64)1 << n);
+
+ /* direct the chip source to the given MSI-X interrupt */
+ m = isrc / 8;
+ n = isrc % 8;
+ reg = read_csr(dd, CCE_INT_MAP + (8*m));
+ reg &= ~((u64)0xff << (8*n));
+ reg |= ((u64)msix_intr & 0xff) << (8*n);
+ write_csr(dd, CCE_INT_MAP + (8*m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+ int engine, int msix_intr)
+{
+ /*
+ * SDMA engine interrupt sources grouped by type, rather than
+ * engine. Per-engine interrupts are as follows:
+ * SDMA
+ * SDMAProgress
+ * SDMAIdle
+ */
+ remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+ remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+ remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+}
+
+static void remap_receive_available_interrupt(struct hfi1_devdata *dd,
+ int rx, int msix_intr)
+{
+ remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d",
+ dd->unit);
+ ret = request_irq(dd->pcidev->irq, general_interrupt,
+ IRQF_SHARED, dd->intx_name, dd);
+ if (ret)
+ dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+ ret);
+ else
+ dd->requested_intx_irq = 1;
+ return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+ const struct cpumask *local_mask;
+ cpumask_var_t def, rcv;
+ bool def_ret, rcv_ret;
+ int first_general, last_general;
+ int first_sdma, last_sdma;
+ int first_rx, last_rx;
+ int first_cpu, restart_cpu, curr_cpu;
+ int rcv_cpu, sdma_cpu;
+ int i, ret = 0, possible;
+ int ht;
+
+ /* calculate the ranges we are going to use */
+ first_general = 0;
+ first_sdma = last_general = first_general + 1;
+ first_rx = last_sdma = first_sdma + dd->num_sdma;
+ last_rx = first_rx + dd->n_krcv_queues;
+
+ /*
+ * Interrupt affinity.
+ *
+ * non-rcv avail gets a default mask that
+ * starts as possible cpus with threads reset
+ * and each rcv avail reset.
+ *
+ * rcv avail gets node relative 1 wrapping back
+ * to the node relative 1 as necessary.
+ *
+ */
+ local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+ /* if first cpu is invalid, use NUMA 0 */
+ if (cpumask_first(local_mask) >= nr_cpu_ids)
+ local_mask = topology_core_cpumask(0);
+
+ def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
+ rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
+ if (!def_ret || !rcv_ret)
+ goto bail;
+ /* use local mask as default */
+ cpumask_copy(def, local_mask);
+ possible = cpumask_weight(def);
+ /* disarm threads from default */
+ ht = cpumask_weight(
+ topology_sibling_cpumask(cpumask_first(local_mask)));
+ for (i = possible/ht; i < possible; i++)
+ cpumask_clear_cpu(i, def);
+ /* reset possible */
+ possible = cpumask_weight(def);
+ /* def now has full cores on chosen node*/
+ first_cpu = cpumask_first(def);
+ if (nr_cpu_ids >= first_cpu)
+ first_cpu++;
+ restart_cpu = first_cpu;
+ curr_cpu = restart_cpu;
+
+ for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) {
+ cpumask_clear_cpu(curr_cpu, def);
+ cpumask_set_cpu(curr_cpu, rcv);
+ if (curr_cpu >= possible)
+ curr_cpu = restart_cpu;
+ else
+ curr_cpu++;
+ }
+ /* def mask has non-rcv, rcv has recv mask */
+ rcv_cpu = cpumask_first(rcv);
+ sdma_cpu = cpumask_first(def);
+
+ /*
+ * Sanity check - the code expects all SDMA chip source
+ * interrupts to be in the same CSR, starting at bit 0. Verify
+ * that this is true by checking the bit location of the start.
+ */
+ BUILD_BUG_ON(IS_SDMA_START % 64);
+
+ for (i = 0; i < dd->num_msix_entries; i++) {
+ struct hfi1_msix_entry *me = &dd->msix_entries[i];
+ const char *err_info;
+ irq_handler_t handler;
+ void *arg;
+ int idx;
+ struct hfi1_ctxtdata *rcd = NULL;
+ struct sdma_engine *sde = NULL;
+
+ /* obtain the arguments to request_irq */
+ if (first_general <= i && i < last_general) {
+ idx = i - first_general;
+ handler = general_interrupt;
+ arg = dd;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME"_%d", dd->unit);
+ err_info = "general";
+ } else if (first_sdma <= i && i < last_sdma) {
+ idx = i - first_sdma;
+ sde = &dd->per_sdma[idx];
+ handler = sdma_interrupt;
+ arg = sde;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME"_%d sdma%d", dd->unit, idx);
+ err_info = "sdma";
+ remap_sdma_interrupts(dd, idx, i);
+ } else if (first_rx <= i && i < last_rx) {
+ idx = i - first_rx;
+ rcd = dd->rcd[idx];
+ /* no interrupt if no rcd */
+ if (!rcd)
+ continue;
+ /*
+ * Set the interrupt register and mask for this
+ * context's interrupt.
+ */
+ rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
+ rcd->imask = ((u64)1) <<
+ ((IS_RCVAVAIL_START+idx) % 64);
+ handler = receive_context_interrupt;
+ arg = rcd;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME"_%d kctxt%d", dd->unit, idx);
+ err_info = "receive context";
+ remap_receive_available_interrupt(dd, idx, i);
+ } else {
+ /* not in our expected range - complain, then
+ ignore it */
+ dd_dev_err(dd,
+ "Unexpected extra MSI-X interrupt %d\n", i);
+ continue;
+ }
+ /* no argument, no interrupt */
+ if (arg == NULL)
+ continue;
+ /* make sure the name is terminated */
+ me->name[sizeof(me->name)-1] = 0;
+
+ ret = request_irq(me->msix.vector, handler, 0, me->name, arg);
+ if (ret) {
+ dd_dev_err(dd,
+ "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+ err_info, me->msix.vector, idx, ret);
+ return ret;
+ }
+ /*
+ * assign arg after request_irq call, so it will be
+ * cleaned up
+ */
+ me->arg = arg;
+
+ if (!zalloc_cpumask_var(
+ &dd->msix_entries[i].mask,
+ GFP_KERNEL))
+ goto bail;
+ if (handler == sdma_interrupt) {
+ dd_dev_info(dd, "sdma engine %d cpu %d\n",
+ sde->this_idx, sdma_cpu);
+ cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
+ sdma_cpu = cpumask_next(sdma_cpu, def);
+ if (sdma_cpu >= nr_cpu_ids)
+ sdma_cpu = cpumask_first(def);
+ } else if (handler == receive_context_interrupt) {
+ dd_dev_info(dd, "rcv ctxt %d cpu %d\n",
+ rcd->ctxt, rcv_cpu);
+ cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask);
+ rcv_cpu = cpumask_next(rcv_cpu, rcv);
+ if (rcv_cpu >= nr_cpu_ids)
+ rcv_cpu = cpumask_first(rcv);
+ } else {
+ /* otherwise first def */
+ dd_dev_info(dd, "%s cpu %d\n",
+ err_info, cpumask_first(def));
+ cpumask_set_cpu(
+ cpumask_first(def), dd->msix_entries[i].mask);
+ }
+ irq_set_affinity_hint(
+ dd->msix_entries[i].msix.vector,
+ dd->msix_entries[i].mask);
+ }
+
+out:
+ free_cpumask_var(def);
+ free_cpumask_var(rcv);
+ return ret;
+bail:
+ ret = -ENOMEM;
+ goto out;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* all interrupts handled by the general handler */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ dd->gi_mask[i] = ~(u64)0;
+
+ /* all chip interrupts map to MSI-X 0 */
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP + (8*i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+ struct hfi1_msix_entry *entries;
+ u32 total, request;
+ int i, ret;
+ int single_interrupt = 0; /* we expect to have all the interrupts */
+
+ /*
+ * Interrupt count:
+ * 1 general, "slow path" interrupt (includes the SDMA engines
+ * slow source, SDMACleanupDone)
+ * N interrupts - one per used SDMA engine
+ * M interrupt - one per kernel receive context
+ */
+ total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+ entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+ if (!entries) {
+ dd_dev_err(dd, "cannot allocate msix table\n");
+ ret = -ENOMEM;
+ goto fail;
+ }
+ /* 1-1 MSI-X entry assignment */
+ for (i = 0; i < total; i++)
+ entries[i].msix.entry = i;
+
+ /* ask for MSI-X interrupts */
+ request = total;
+ request_msix(dd, &request, entries);
+
+ if (request == 0) {
+ /* using INTx */
+ /* dd->num_msix_entries already zero */
+ kfree(entries);
+ single_interrupt = 1;
+ dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+ } else {
+ /* using MSI-X */
+ dd->num_msix_entries = request;
+ dd->msix_entries = entries;
+
+ if (request != total) {
+ /* using MSI-X, with reduced interrupts */
+ dd_dev_err(
+ dd,
+ "cannot handle reduced interrupt case, want %u, got %u\n",
+ total, request);
+ ret = -EINVAL;
+ goto fail;
+ }
+ dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+ }
+
+ /* mask all interrupts */
+ set_intr_state(dd, 0);
+ /* clear all pending interrupts */
+ clear_all_interrupts(dd);
+
+ /* reset general handler mask, chip MSI-X mappings */
+ reset_interrupts(dd);
+
+ if (single_interrupt)
+ ret = request_intx_irq(dd);
+ else
+ ret = request_msix_irqs(dd);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ clean_up_interrupts(dd);
+ return ret;
+}
+
+/*
+ * Set up context values in dd. Sets:
+ *
+ * num_rcv_contexts - number of contexts being used
+ * n_krcv_queues - number of kernel contexts
+ * first_user_ctxt - first non-kernel context in array of contexts
+ * freectxts - number of free user contexts
+ * num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+ int num_kernel_contexts;
+ int num_user_contexts;
+ int total_contexts;
+ int ret;
+ unsigned ngroups;
+
+ /*
+ * Kernel contexts: (to be fixed later):
+ * - min or 2 or 1 context/numa
+ * - Context 0 - default/errors
+ * - Context 1 - VL15
+ */
+ if (n_krcvqs)
+ num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS;
+ else
+ num_kernel_contexts = num_online_nodes();
+ num_kernel_contexts =
+ max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+ /*
+ * Every kernel receive context needs an ACK send context.
+ * one send context is allocated for each VL{0-7} and VL15
+ */
+ if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+ dd_dev_err(dd,
+ "Reducing # kernel rcv contexts to: %d, from %d\n",
+ (int)(dd->chip_send_contexts - num_vls - 1),
+ (int)num_kernel_contexts);
+ num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+ }
+ /*
+ * User contexts: (to be fixed later)
+ * - set to num_rcv_contexts if non-zero
+ * - default to 1 user context per CPU
+ */
+ if (num_rcv_contexts)
+ num_user_contexts = num_rcv_contexts;
+ else
+ num_user_contexts = num_online_cpus();
+
+ total_contexts = num_kernel_contexts + num_user_contexts;
+
+ /*
+ * Adjust the counts given a global max.
+ */
+ if (total_contexts > dd->chip_rcv_contexts) {
+ dd_dev_err(dd,
+ "Reducing # user receive contexts to: %d, from %d\n",
+ (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+ (int)num_user_contexts);
+ num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+ /* recalculate */
+ total_contexts = num_kernel_contexts + num_user_contexts;
+ }
+
+ /* the first N are kernel contexts, the rest are user contexts */
+ dd->num_rcv_contexts = total_contexts;
+ dd->n_krcv_queues = num_kernel_contexts;
+ dd->first_user_ctxt = num_kernel_contexts;
+ dd->freectxts = num_user_contexts;
+ dd_dev_info(dd,
+ "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+ (int)dd->chip_rcv_contexts,
+ (int)dd->num_rcv_contexts,
+ (int)dd->n_krcv_queues,
+ (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+ /*
+ * Receive array allocation:
+ * All RcvArray entries are divided into groups of 8. This
+ * is required by the hardware and will speed up writes to
+ * consecutive entries by using write-combining of the entire
+ * cacheline.
+ *
+ * The number of groups are evenly divided among all contexts.
+ * any left over groups will be given to the first N user
+ * contexts.
+ */
+ dd->rcv_entries.group_size = RCV_INCREMENT;
+ ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+ dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+ dd->rcv_entries.nctxt_extra = ngroups -
+ (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+ dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+ dd->rcv_entries.ngroups,
+ dd->rcv_entries.nctxt_extra);
+ if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+ MAX_EAGER_ENTRIES * 2) {
+ dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+ dd->rcv_entries.group_size;
+ dd_dev_info(dd,
+ "RcvArray group count too high, change to %u\n",
+ dd->rcv_entries.ngroups);
+ dd->rcv_entries.nctxt_extra = 0;
+ }
+ /*
+ * PIO send contexts
+ */
+ ret = init_sc_pools_and_sizes(dd);
+ if (ret >= 0) { /* success */
+ dd->num_send_contexts = ret;
+ dd_dev_info(
+ dd,
+ "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+ dd->chip_send_contexts,
+ dd->num_send_contexts,
+ dd->sc_sizes[SC_KERNEL].count,
+ dd->sc_sizes[SC_ACK].count,
+ dd->sc_sizes[SC_USER].count);
+ ret = 0; /* success */
+ }
+
+ return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg = 0;
+ int i;
+
+ dd_dev_info(dd, "Setting partition keys\n");
+ for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+ reg |= (ppd->pkeys[i] &
+ RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+ ((i % 4) *
+ RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+ /* Each register holds 4 PKey values. */
+ if ((i % 4) == 3) {
+ write_csr(dd, RCV_PARTITION_KEY +
+ ((i - 3) * 2), reg);
+ reg = 0;
+ }
+ }
+
+ /* Always enable HW pkeys check when pkeys table is set */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmaped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+ int i, j;
+
+ /* CceIntMap */
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP+(8*i), 0);
+
+ /* SendCtxtCreditReturnAddr */
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+ /* PIO Send buffers */
+ /* SDMA Send buffers */
+ /* These are not normally read, and (presently) have no method
+ to be read, so are not pre-initialized */
+
+ /* RcvHdrAddr */
+ /* RcvHdrTailAddr */
+ /* RcvTidFlowTable */
+ for (i = 0; i < dd->chip_rcv_contexts; i++) {
+ write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+ for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+ write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
+ }
+
+ /* RcvArray */
+ for (i = 0; i < dd->chip_rcv_array_count; i++)
+ write_csr(dd, RCV_ARRAY + (8*i),
+ RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+ /* RcvQPMapTable */
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+ u64 ctrl_bits)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ /* is the condition present? */
+ reg = read_csr(dd, CCE_STATUS);
+ if ((reg & status_bits) == 0)
+ return;
+
+ /* clear the condition */
+ write_csr(dd, CCE_CTRL, ctrl_bits);
+
+ /* wait for the condition to clear */
+ timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, CCE_STATUS);
+ if ((reg & status_bits) == 0)
+ return;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+ status_bits, reg & status_bits);
+ return;
+ }
+ udelay(1);
+ }
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* CCE_REVISION read-only */
+ /* CCE_REVISION2 read-only */
+ /* CCE_CTRL - bits clear automatically */
+ /* CCE_STATUS read-only, use CceCtrl to clear */
+ clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+ clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+ for (i = 0; i < CCE_NUM_SCRATCH; i++)
+ write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+ /* CCE_ERR_STATUS read-only */
+ write_csr(dd, CCE_ERR_MASK, 0);
+ write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+ /* CCE_ERR_FORCE leave alone */
+ for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+ write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+ /* CCE_PCIE_CTRL leave alone */
+ for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+ write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+ write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+ CCE_MSIX_TABLE_UPPER_RESETCSR);
+ }
+ for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+ /* CCE_MSIX_PBA read-only */
+ write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+ write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+ }
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP, 0);
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+ /* CCE_INT_STATUS read-only */
+ write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+ write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+ /* CCE_INT_FORCE leave alone */
+ /* CCE_INT_BLOCKED read-only */
+ }
+ for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+ write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set ASIC CSRs to chip reset defaults */
+static void reset_asic_csrs(struct hfi1_devdata *dd)
+{
+ static DEFINE_MUTEX(asic_mutex);
+ static int called;
+ int i;
+
+ /*
+ * If the HFIs are shared between separate nodes or VMs,
+ * then more will need to be done here. One idea is a module
+ * parameter that returns early, letting the first power-on or
+ * a known first load do the reset and blocking all others.
+ */
+
+ /*
+ * These CSRs should only be reset once - the first one here will
+ * do the work. Use a mutex so that a non-first caller waits until
+ * the first is finished before it can proceed.
+ */
+ mutex_lock(&asic_mutex);
+ if (called)
+ goto done;
+ called = 1;
+
+ if (dd->icode != ICODE_FPGA_EMULATION) {
+ /* emulation does not have an SBus - leave these alone */
+ /*
+ * All writes to ASIC_CFG_SBUS_REQUEST do something.
+ * Notes:
+ * o The reset is not zero if aimed at the core. See the
+ * SBus documentation for details.
+ * o If the SBus firmware has been updated (e.g. by the BIOS),
+ * will the reset revert that?
+ */
+ /* ASIC_CFG_SBUS_REQUEST leave alone */
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+ }
+ /* ASIC_SBUS_RESULT read-only */
+ write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
+ for (i = 0; i < ASIC_NUM_SCRATCH; i++)
+ write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
+ write_csr(dd, ASIC_CFG_MUTEX, 0); /* this will clear it */
+ write_csr(dd, ASIC_CFG_DRV_STR, 0);
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0);
+ /* ASIC_STS_THERM read-only */
+ /* ASIC_CFG_RESET leave alone */
+
+ write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
+ /* ASIC_PCIE_SD_HOST_STATUS read-only */
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
+ /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
+ /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
+ /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
+ for (i = 0; i < 16; i++)
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
+
+ /* ASIC_GPIO_IN read-only */
+ write_csr(dd, ASIC_GPIO_OE, 0);
+ write_csr(dd, ASIC_GPIO_INVERT, 0);
+ write_csr(dd, ASIC_GPIO_OUT, 0);
+ write_csr(dd, ASIC_GPIO_MASK, 0);
+ /* ASIC_GPIO_STATUS read-only */
+ write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
+ /* ASIC_GPIO_FORCE leave alone */
+
+ /* ASIC_QSFP1_IN read-only */
+ write_csr(dd, ASIC_QSFP1_OE, 0);
+ write_csr(dd, ASIC_QSFP1_INVERT, 0);
+ write_csr(dd, ASIC_QSFP1_OUT, 0);
+ write_csr(dd, ASIC_QSFP1_MASK, 0);
+ /* ASIC_QSFP1_STATUS read-only */
+ write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
+ /* ASIC_QSFP1_FORCE leave alone */
+
+ /* ASIC_QSFP2_IN read-only */
+ write_csr(dd, ASIC_QSFP2_OE, 0);
+ write_csr(dd, ASIC_QSFP2_INVERT, 0);
+ write_csr(dd, ASIC_QSFP2_OUT, 0);
+ write_csr(dd, ASIC_QSFP2_MASK, 0);
+ /* ASIC_QSFP2_STATUS read-only */
+ write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
+ /* ASIC_QSFP2_FORCE leave alone */
+
+ write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
+ /* this also writes a NOP command, clearing paging mode */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
+ write_csr(dd, ASIC_EEP_DATA, 0);
+
+done:
+ mutex_unlock(&asic_mutex);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+ write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+ write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+ }
+ /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+ only be written 128-byte chunks */
+ /* init RSA engine to clear lingering errors */
+ write_csr(dd, MISC_CFG_RSA_CMD, 1);
+ write_csr(dd, MISC_CFG_RSA_MU, 0);
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+ /* MISC_STS_8051_DIGEST read-only */
+ /* MISC_STS_SBM_DIGEST read-only */
+ /* MISC_STS_PCIE_DIGEST read-only */
+ /* MISC_STS_FAB_DIGEST read-only */
+ /* MISC_ERR_STATUS read-only */
+ write_csr(dd, MISC_ERR_MASK, 0);
+ write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+ /* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * TXE Kernel CSRs
+ */
+ write_csr(dd, SEND_CTRL, 0);
+ __cm_reset(dd, 0); /* reset CM internal state */
+ /* SEND_CONTEXTS read-only */
+ /* SEND_DMA_ENGINES read-only */
+ /* SEND_PIO_MEM_SIZE read-only */
+ /* SEND_DMA_MEM_SIZE read-only */
+ write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+ pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */
+ /* SEND_PIO_ERR_STATUS read-only */
+ write_csr(dd, SEND_PIO_ERR_MASK, 0);
+ write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+ /* SEND_PIO_ERR_FORCE leave alone */
+ /* SEND_DMA_ERR_STATUS read-only */
+ write_csr(dd, SEND_DMA_ERR_MASK, 0);
+ write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+ /* SEND_DMA_ERR_FORCE leave alone */
+ /* SEND_EGRESS_ERR_STATUS read-only */
+ write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+ write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+ /* SEND_EGRESS_ERR_FORCE leave alone */
+ write_csr(dd, SEND_BTH_QP, 0);
+ write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+ write_csr(dd, SEND_SC2VLT0, 0);
+ write_csr(dd, SEND_SC2VLT1, 0);
+ write_csr(dd, SEND_SC2VLT2, 0);
+ write_csr(dd, SEND_SC2VLT3, 0);
+ write_csr(dd, SEND_LEN_CHECK0, 0);
+ write_csr(dd, SEND_LEN_CHECK1, 0);
+ /* SEND_ERR_STATUS read-only */
+ write_csr(dd, SEND_ERR_MASK, 0);
+ write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+ /* SEND_ERR_FORCE read-only */
+ for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+ write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
+ for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+ write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
+ for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
+ write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
+ for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+ write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
+ for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+ write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
+ write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+ SEND_CM_GLOBAL_CREDIT_RESETCSR);
+ /* SEND_CM_CREDIT_USED_STATUS read-only */
+ write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+ write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+ write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+ write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+ write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+ write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+ /* SEND_CM_CREDIT_USED_VL read-only */
+ /* SEND_CM_CREDIT_USED_VL15 read-only */
+ /* SEND_EGRESS_CTXT_STATUS read-only */
+ /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+ write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+ /* SEND_EGRESS_ERR_INFO read-only */
+ /* SEND_EGRESS_ERR_SOURCE read-only */
+
+ /*
+ * TXE Per-Context CSRs
+ */
+ for (i = 0; i < dd->chip_send_contexts; i++) {
+ write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+ }
+
+ /*
+ * TXE Per-SDMA CSRs
+ */
+ for (i = 0; i < dd->chip_sdma_engines; i++) {
+ write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+ /* SEND_DMA_STATUS read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+ /* SEND_DMA_HEAD read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+ /* SEND_DMA_IDLE_CNT read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+ /* SEND_DMA_DESC_FETCHED_CNT read-only */
+ /* SEND_DMA_ENG_ERR_STATUS read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+ /* SEND_DMA_ENG_ERR_FORCE leave alone */
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+ }
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ int count;
+
+ /*
+ * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+ * clear.
+ */
+ count = 0;
+ while (1) {
+ reg = read_csr(dd, RCV_STATUS);
+ if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+ | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+ break;
+ /*
+ * Give up after 1ms - maximum wait time.
+ *
+ * RBuf size is 148KiB. Slowest possible is PCIe Gen1 x1 at
+ * 250MB/s bandwidth. Lower rate to 66% for overhead to get:
+ * 148 KB / (66% * 250MB/s) = 920us
+ */
+ if (count++ > 500) {
+ dd_dev_err(dd,
+ "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+ __func__, reg);
+ break;
+ }
+ udelay(2); /* do not busy-wait the CSR */
+ }
+
+ /* start the init - expect RcvCtrl to be 0 */
+ write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+ /*
+ * Read to force the write of Rcvtrl.RxRbufInit. There is a brief
+ * period after the write before RcvStatus.RxRbufInitDone is valid.
+ * The delay in the first run through the loop below is sufficient and
+ * required before the first read of RcvStatus.RxRbufInintDone.
+ */
+ read_csr(dd, RCV_CTRL);
+
+ /* wait for the init to finish */
+ count = 0;
+ while (1) {
+ /* delay is required first time through - see above */
+ udelay(2); /* do not busy-wait the CSR */
+ reg = read_csr(dd, RCV_STATUS);
+ if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+ break;
+
+ /* give up after 100us - slowest possible at 33MHz is 73us */
+ if (count++ > 50) {
+ dd_dev_err(dd,
+ "%s: RcvStatus.RxRbufInit not set, continuing\n",
+ __func__);
+ break;
+ }
+ }
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+ int i, j;
+
+ /*
+ * RXE Kernel CSRs
+ */
+ write_csr(dd, RCV_CTRL, 0);
+ init_rbufs(dd);
+ /* RCV_STATUS read-only */
+ /* RCV_CONTEXTS read-only */
+ /* RCV_ARRAY_CNT read-only */
+ /* RCV_BUF_SIZE read-only */
+ write_csr(dd, RCV_BTH_QP, 0);
+ write_csr(dd, RCV_MULTICAST, 0);
+ write_csr(dd, RCV_BYPASS, 0);
+ write_csr(dd, RCV_VL15, 0);
+ /* this is a clear-down */
+ write_csr(dd, RCV_ERR_INFO,
+ RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+ /* RCV_ERR_STATUS read-only */
+ write_csr(dd, RCV_ERR_MASK, 0);
+ write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+ /* RCV_ERR_FORCE leave alone */
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+ for (i = 0; i < 4; i++)
+ write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+ write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+ write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+ write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+ write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+ write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+ }
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+ /*
+ * RXE Kernel and User Per-Context CSRs
+ */
+ for (i = 0; i < dd->chip_rcv_contexts; i++) {
+ /* kernel */
+ write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+ /* RCV_CTXT_STATUS read-only */
+ write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+ /* user */
+ /* RCV_HDR_TAIL read-only */
+ write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+ /* RCV_EGR_INDEX_TAIL read-only */
+ write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+ /* RCV_EGR_OFFSET_TAIL read-only */
+ for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+ write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
+ 0);
+ }
+ }
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15 -> VL 15
+ * otherwise
+ * -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+ int i;
+ /* init per architecture spec, constrained by hardware capability */
+
+ /* HFI maps sent packets */
+ write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+ 0,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 4, 4, 5, 5,
+ 6, 6, 7, 7));
+ write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+ 1,
+ 8, 0, 9, 0,
+ 10, 0, 11, 0,
+ 12, 0, 13, 0,
+ 14, 0, 15, 15));
+ write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+ 2,
+ 16, 0, 17, 0,
+ 18, 0, 19, 0,
+ 20, 0, 21, 0,
+ 22, 0, 23, 0));
+ write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+ 3,
+ 24, 0, 25, 0,
+ 26, 0, 27, 0,
+ 28, 0, 29, 0,
+ 30, 0, 31, 0));
+
+ /* DC maps received packets */
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+ 15_0,
+ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+ 31_16,
+ 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+ 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+ /* initialize the cached sc2vl values consistently with h/w */
+ for (i = 0; i < 32; i++) {
+ if (i < 8 || i == 15)
+ *((u8 *)(dd->sc2vl) + i) = (u8)i;
+ else
+ *((u8 *)(dd->sc2vl) + i) = 0;
+ }
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values. We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * Put the HFI CSRs in a known state.
+ * Combine this with a DC reset.
+ *
+ * Stop the device from doing anything while we do a
+ * reset. We know there are no other active users of
+ * the device since we are now in charge. Turn off
+ * off all outbound and inbound traffic and make sure
+ * the device does not generate any interrupts.
+ */
+
+ /* disable send contexts and SDMA engines */
+ write_csr(dd, SEND_CTRL, 0);
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+ /* disable port (turn off RXE inbound traffic) and contexts */
+ write_csr(dd, RCV_CTRL, 0);
+ for (i = 0; i < dd->chip_rcv_contexts; i++)
+ write_csr(dd, RCV_CTXT_CTRL, 0);
+ /* mask all interrupt sources */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+
+ /*
+ * DC Reset: do a full DC reset before the register clear.
+ * A recommended length of time to hold is one CSR read,
+ * so reread the CceDcCtrl. Then, hold the DC in reset
+ * across the clear.
+ */
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+ (void) read_csr(dd, CCE_DC_CTRL);
+
+ if (use_flr) {
+ /*
+ * A FLR will reset the SPC core and part of the PCIe.
+ * The parts that need to be restored have already been
+ * saved.
+ */
+ dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+ /* do the FLR, the DC reset will remain */
+ hfi1_pcie_flr(dd);
+
+ /* restore command and BARs */
+ restore_pci_variables(dd);
+
+ if (is_a0(dd)) {
+ dd_dev_info(dd, "Resetting CSRs with FLR\n");
+ hfi1_pcie_flr(dd);
+ restore_pci_variables(dd);
+ }
+
+ } else {
+ dd_dev_info(dd, "Resetting CSRs with writes\n");
+ reset_cce_csrs(dd);
+ reset_txe_csrs(dd);
+ reset_rxe_csrs(dd);
+ reset_asic_csrs(dd);
+ reset_misc_csrs(dd);
+ }
+ /* clear the DC reset */
+ write_csr(dd, CCE_DC_CTRL, 0);
+ /* Set the LED off */
+ if (is_a0(dd))
+ setextled(dd, 0);
+ /*
+ * Clear the QSFP reset.
+ * A0 leaves the out lines floating on power on, then on an FLR
+ * enforces a 0 on all out pins. The driver does not touch
+ * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and
+ * anything plugged constantly in reset, if it pays attention
+ * to RESET_N.
+ * A prime example of this is SiPh. For now, set all pins high.
+ * I2CCLK and I2CDAT will change per direction, and INT_N and
+ * MODPRS_N are input only and their value is ignored.
+ */
+ if (is_a0(dd)) {
+ write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+ write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+ }
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* assign link credit variables */
+ dd->vau = CM_VAU;
+ dd->link_credits = CM_GLOBAL_CREDITS;
+ if (is_a0(dd))
+ dd->link_credits--;
+ dd->vcu = cu_to_vcu(hfi1_cu);
+ /* enough room for 8 MAD packets plus header - 17K */
+ dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+ if (dd->vl15_init > dd->link_credits)
+ dd->vl15_init = dd->link_credits;
+
+ write_uninitialized_csrs_and_memories(dd);
+
+ if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_pportdata *ppd = &dd->pport[i];
+
+ set_partition_keys(ppd);
+ }
+ init_sc2vl_tables(dd);
+}
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+ /* user changed the KDETH_QP */
+ if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+ /* out of range or illegal value */
+ dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+ kdeth_qp = 0;
+ }
+ if (kdeth_qp == 0) /* not set, or failed range check */
+ kdeth_qp = DEFAULT_KDETH_QP;
+
+ write_csr(dd, SEND_BTH_QP,
+ (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
+ << SEND_BTH_QP_KDETH_QP_SHIFT);
+
+ write_csr(dd, RCV_BTH_QP,
+ (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
+ << RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - first context
+ *
+ * This return sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass. Normal
+ * verbs traffic will assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+ u32 first_ctxt,
+ u32 last_ctxt)
+{
+ u64 reg = 0;
+ u64 regno = RCV_QP_MAP_TABLE;
+ int i;
+ u64 ctxt = first_ctxt;
+
+ for (i = 0; i < 256;) {
+ if (ctxt == VL15CTXT) {
+ ctxt++;
+ if (ctxt > last_ctxt)
+ ctxt = first_ctxt;
+ continue;
+ }
+ reg |= ctxt << (8 * (i % 8));
+ i++;
+ ctxt++;
+ if (ctxt > last_ctxt)
+ ctxt = first_ctxt;
+ if (i % 8 == 0) {
+ write_csr(dd, regno, reg);
+ reg = 0;
+ regno += 8;
+ }
+ }
+ if (i % 8)
+ write_csr(dd, regno, reg);
+
+ add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+ | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @first_context
+ *
+ * This routine initializes Rule 0 and the
+ * RSM map table to implement qos.
+ *
+ * If all of the limit tests succeed,
+ * qos is applied based on the array
+ * interpretation of krcvqs where
+ * entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn
+ * bits (m) are computed to feed both the RSM map table
+ * and the single rule.
+ *
+ */
+static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+{
+ u8 max_by_vl = 0;
+ unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+ u64 *rsmmap;
+ u64 reg;
+ u8 rxcontext = is_a0(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
+
+ /* validate */
+ if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+ num_vls == 1 ||
+ krcvqsset <= 1)
+ goto bail;
+ for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+ if (krcvqs[i] > max_by_vl)
+ max_by_vl = krcvqs[i];
+ if (max_by_vl > 32)
+ goto bail;
+ qpns_per_vl = __roundup_pow_of_two(max_by_vl);
+ /* determine bits vl */
+ n = ilog2(num_vls);
+ /* determine bits for qpn */
+ m = ilog2(qpns_per_vl);
+ if ((m + n) > 7)
+ goto bail;
+ if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+ goto bail;
+ rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
+ memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
+ /* init the local copy of the table */
+ for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+ unsigned tctxt;
+
+ for (qpn = 0, tctxt = ctxt;
+ krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+ unsigned idx, regoff, regidx;
+
+ /* generate index <= 128 */
+ idx = (qpn << n) ^ i;
+ regoff = (idx % 8) * 8;
+ regidx = idx / 8;
+ reg = rsmmap[regidx];
+ /* replace 0xff with context number */
+ reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+ << regoff);
+ reg |= (u64)(tctxt++) << regoff;
+ rsmmap[regidx] = reg;
+ if (tctxt == ctxt + krcvqs[i])
+ tctxt = ctxt;
+ }
+ ctxt += krcvqs[i];
+ }
+ /* flush cached copies to chip */
+ for (i = 0; i < NUM_MAP_REGS; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
+ /* add rule0 */
+ write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
+ RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
+ << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
+ 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+ write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
+ LRH_BTH_MATCH_OFFSET
+ << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+ LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+ LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+ ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+ QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+ ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+ write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
+ LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
+ LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
+ LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
+ LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
+ /* Enable RSM */
+ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+ kfree(rsmmap);
+ /* map everything else (non-VL15) to context 0 */
+ init_qpmap_table(
+ dd,
+ 0,
+ 0);
+ dd->qos_shift = n + 1;
+ return;
+bail:
+ dd->qos_shift = 1;
+ init_qpmap_table(
+ dd,
+ dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0,
+ dd->n_krcv_queues - 1);
+}
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+ /* enable all receive errors */
+ write_csr(dd, RCV_ERR_MASK, ~0ull);
+ /* setup QPN map table - start where VL15 context leaves off */
+ init_qos(
+ dd,
+ dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
+ /*
+ * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+ * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+ * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one
+ * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+ * Max_PayLoad_Size set to its minimum of 128.
+ *
+ * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+ * (64 bytes). Max_Payload_Size is possibly modified upward in
+ * tune_pcie_caps() which is called after this routine.
+ */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+ /* enable all CCE errors */
+ write_csr(dd, CCE_ERR_MASK, ~0ull);
+ /* enable *some* Misc errors */
+ write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+ /* enable all DC errors, except LCB */
+ write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+ write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU. A CU is defined in terms
+ * AUs. The table is a an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+ u32 csr0to3, u32 csr4to7)
+{
+ write_csr(dd, csr0to3,
+ 0ull <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
+ | 1ull <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
+ | 2ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
+ | 4ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+ write_csr(dd, csr4to7,
+ 8ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
+ | 16ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
+ | 32ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
+ | 64ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+
+}
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+ assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+ SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+ assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+ SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* enable all PIO, SDMA, general, and Egress errors */
+ write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+ /* enable all per-context and per-SDMA engine errors */
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+ /* set the local CU to AU mapping */
+ assign_local_cm_au_table(dd, dd->vcu);
+
+ /*
+ * Set reasonable default for Credit Return Timer
+ * Don't set on Simulator - causes it to choke.
+ */
+ if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+ write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+ struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+ ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+ SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+ /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+ if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+ reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+ /*
+ * Enable send-side J_KEY integrity check, unless this is A0 h/w
+ * (due to A0 erratum).
+ */
+ if (!is_a0(dd)) {
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ }
+
+ /* Enable J_KEY check on receive context. */
+ reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+ ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+ RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+ return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+ struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+ /*
+ * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+ * This check would not have been enabled for A0 h/w, see
+ * set_ctxt_jkey().
+ */
+ if (!is_a0(dd)) {
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ }
+ /* Turn off the J_KEY on the receive side */
+ write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+ return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (ctxt < dd->num_rcv_contexts)
+ rcd = dd->rcd[ctxt];
+ else {
+ ret = -EINVAL;
+ goto done;
+ }
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+ SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+ return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (ctxt < dd->num_rcv_contexts)
+ rcd = dd->rcd[ctxt];
+ else {
+ ret = -EINVAL;
+ goto done;
+ }
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+ return ret;
+}
+
+/*
+ * Start doing the clean up the the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+ free_cntrs(dd);
+ free_rcverr(dd);
+ clean_up_interrupts(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+ ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Certain chip functions need to be initialized only once per asic
+ * instead of per-device. This function finds the peer device and
+ * checks whether that chip initialization needs to be done by this
+ * device.
+ */
+static void asic_should_init(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+ struct hfi1_devdata *tmp, *peer = NULL;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ /* Find our peer device */
+ list_for_each_entry(tmp, &hfi1_dev_list, list) {
+ if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+ dd->unit != tmp->unit) {
+ peer = tmp;
+ break;
+ }
+ }
+
+ /*
+ * "Claim" the ASIC for initialization if it hasn't been
+ " "claimed" yet.
+ */
+ if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
+ dd->flags |= HFI1_DO_INIT_ASIC;
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+}
+
+/**
+ * Allocate an initialize the device structure for the hfi.
+ * @dev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ u64 reg;
+ int i, ret;
+ static const char * const inames[] = { /* implementation names */
+ "RTL silicon",
+ "RTL VCS simulation",
+ "RTL FPGA emulation",
+ "Functional simulator"
+ };
+
+ dd = hfi1_alloc_devdata(pdev,
+ NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
+ if (IS_ERR(dd))
+ goto bail;
+ ppd = dd->pport;
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ int vl;
+ /* init common fields */
+ hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+ /* DC supports 4 link widths */
+ ppd->link_width_supported =
+ OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+ OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+ ppd->link_width_downgrade_supported =
+ ppd->link_width_supported;
+ /* start out enabling only 4X */
+ ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+ ppd->link_width_downgrade_enabled =
+ ppd->link_width_downgrade_supported;
+ /* link width active is 0 when link is down */
+ /* link width downgrade active is 0 when link is down */
+
+ if (num_vls < HFI1_MIN_VLS_SUPPORTED
+ || num_vls > HFI1_MAX_VLS_SUPPORTED) {
+ hfi1_early_err(&pdev->dev,
+ "Invalid num_vls %u, using %u VLs\n",
+ num_vls, HFI1_MAX_VLS_SUPPORTED);
+ num_vls = HFI1_MAX_VLS_SUPPORTED;
+ }
+ ppd->vls_supported = num_vls;
+ ppd->vls_operational = ppd->vls_supported;
+ /* Set the default MTU. */
+ for (vl = 0; vl < num_vls; vl++)
+ dd->vld[vl].mtu = hfi1_max_mtu;
+ dd->vld[15].mtu = MAX_MAD_PACKET;
+ /*
+ * Set the initial values to reasonable default, will be set
+ * for real when link is up.
+ */
+ ppd->lstate = IB_PORT_DOWN;
+ ppd->overrun_threshold = 0x4;
+ ppd->phy_error_threshold = 0xf;
+ ppd->port_crc_mode_enabled = link_crc_mask;
+ /* initialize supported LTP CRC mode */
+ ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+ /* initialize enabled LTP CRC mode */
+ ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+ /* start in offline */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ init_vl_arb_caches(ppd);
+ }
+
+ dd->link_default = HLS_DN_POLL;
+
+ /*
+ * Do remaining PCIe setup and save PCIe values in dd.
+ * Any error printing is already done by the init code.
+ * On return, we have the chip mapped.
+ */
+ ret = hfi1_pcie_ddinit(dd, pdev, ent);
+ if (ret < 0)
+ goto bail_free;
+
+ /* verify that reads actually work, save revision for reset check */
+ dd->revision = read_csr(dd, CCE_REVISION);
+ if (dd->revision == ~(u64)0) {
+ dd_dev_err(dd, "cannot read chip CSRs\n");
+ ret = -EINVAL;
+ goto bail_cleanup;
+ }
+ dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+ & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+ dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+ /* obtain the hardware ID - NOT related to unit, which is a
+ software enumeration */
+ reg = read_csr(dd, CCE_REVISION2);
+ dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+ & CCE_REVISION2_HFI_ID_MASK;
+ /* the variable size will remove unwanted bits */
+ dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+ dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+ dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+ dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
+ (int)dd->irev);
+
+ /* speeds the hardware can support */
+ dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+ /* speeds allowed to run at */
+ dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+ /* give a reasonable active value, will be set on link up */
+ dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+ dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+ dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+ dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+ dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+ dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+ /* fix up link widths for emulation _p */
+ ppd = dd->pport;
+ if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+ ppd->link_width_supported =
+ ppd->link_width_enabled =
+ ppd->link_width_downgrade_supported =
+ ppd->link_width_downgrade_enabled =
+ OPA_LINK_WIDTH_1X;
+ }
+ /* insure num_vls isn't larger than number of sdma engines */
+ if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+ dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+ num_vls, HFI1_MAX_VLS_SUPPORTED);
+ ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED;
+ ppd->vls_operational = ppd->vls_supported;
+ }
+
+ /*
+ * Convert the ns parameter to the 64 * cclocks used in the CSR.
+ * Limit the max if larger than the field holds. If timeout is
+ * non-zero, then the calculated field will be at least 1.
+ *
+ * Must be after icode is set up - the cclock rate depends
+ * on knowing the hardware being used.
+ */
+ dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+ if (dd->rcv_intr_timeout_csr >
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+ dd->rcv_intr_timeout_csr =
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+ else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+ dd->rcv_intr_timeout_csr = 1;
+
+ /* obtain chip sizes, reset chip CSRs */
+ init_chip(dd);
+
+ /* read in the PCIe link speed information */
+ ret = pcie_speeds(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* needs to be done before we look for the peer device */
+ read_guid(dd);
+
+ asic_should_init(dd);
+
+ /* read in firmware */
+ ret = hfi1_firmware_init(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /*
+ * In general, the PCIe Gen3 transition must occur after the
+ * chip has been idled (so it won't initiate any PCIe transactions
+ * e.g. an interrupt) and before the driver changes any registers
+ * (the transition will reset the registers).
+ *
+ * In particular, place this call after:
+ * - init_chip() - the chip will not initiate any PCIe transactions
+ * - pcie_speeds() - reads the current link speed
+ * - hfi1_firmware_init() - the needed firmware is ready to be
+ * downloaded
+ */
+ ret = do_pcie_gen3_transition(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* start setting dd values and adjusting CSRs */
+ init_early_variables(dd);
+
+ parse_platform_config(dd);
+
+ /* add board names as they are defined */
+ dd->boardname = kmalloc(64, GFP_KERNEL);
+ if (!dd->boardname)
+ goto bail_cleanup;
+ snprintf(dd->boardname, 64, "Board ID 0x%llx",
+ dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT
+ & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK);
+
+ snprintf(dd->boardversion, BOARD_VERS_MAX,
+ "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n",
+ HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+ dd->boardname,
+ (u32)dd->majrev,
+ (u32)dd->minrev,
+ (dd->revision >> CCE_REVISION_SW_SHIFT)
+ & CCE_REVISION_SW_MASK);
+
+ ret = set_up_context_variables(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* set initial RXE CSRs */
+ init_rxe(dd);
+ /* set initial TXE CSRs */
+ init_txe(dd);
+ /* set initial non-RXE, non-TXE CSRs */
+ init_other(dd);
+ /* set up KDETH QP prefix in both RX and TX CSRs */
+ init_kdeth_qp(dd);
+
+ /* send contexts must be set up before receive contexts */
+ ret = init_send_contexts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ ret = hfi1_create_ctxts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+ /*
+ * rcd[0] is guaranteed to be valid by this point. Also, all
+ * context are using the same value, as per the module parameter.
+ */
+ dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+ ret = init_pervl_scs(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* sdma init */
+ for (i = 0; i < dd->num_pports; ++i) {
+ ret = sdma_init(dd, i);
+ if (ret)
+ goto bail_cleanup;
+ }
+
+ /* use contexts created by hfi1_create_ctxts */
+ ret = set_up_interrupts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* set up LCB access - must be after set_up_interrupts() */
+ init_lcb_access(dd);
+
+ snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+ dd->base_guid & 0xFFFFFF);
+
+ dd->oui1 = dd->base_guid >> 56 & 0xFF;
+ dd->oui2 = dd->base_guid >> 48 & 0xFF;
+ dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+ ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+ if (ret)
+ goto bail_clear_intr;
+ check_fabric_firmware_versions(dd);
+
+ thermal_init(dd);
+
+ ret = init_cntrs(dd);
+ if (ret)
+ goto bail_clear_intr;
+
+ ret = init_rcverr(dd);
+ if (ret)
+ goto bail_free_cntrs;
+
+ ret = eprom_init(dd);
+ if (ret)
+ goto bail_free_rcverr;
+
+ goto bail;
+
+bail_free_rcverr:
+ free_rcverr(dd);
+bail_free_cntrs:
+ free_cntrs(dd);
+bail_clear_intr:
+ clean_up_interrupts(dd);
+bail_cleanup:
+ hfi1_pcie_ddcleanup(dd);
+bail_free:
+ hfi1_free_devdata(dd);
+ dd = ERR_PTR(ret);
+bail:
+ return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+ u32 dw_len)
+{
+ u32 delta_cycles;
+ u32 current_egress_rate = ppd->current_egress_rate;
+ /* rates here are in units of 10^6 bits/sec */
+
+ if (desired_egress_rate == -1)
+ return 0; /* shouldn't happen */
+
+ if (desired_egress_rate >= current_egress_rate)
+ return 0; /* we can't help go faster, only slower */
+
+ delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+ egress_cycles(dw_len * 4, current_egress_rate);
+
+ return (u16)delta_cycles;
+}
+
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature. The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+ u32 dw_len)
+{
+ u64 pbc, delay = 0;
+
+ if (unlikely(srate_mbs))
+ delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+ pbc = flags
+ | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+ | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+ | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+ | (dw_len & PBC_LENGTH_DWS_MASK)
+ << PBC_LENGTH_DWS_SHIFT;
+
+ return pbc;
+}
+
+#define SBUS_THERMAL 0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+ dd_dev_err((dd), \
+ "Thermal sensor initialization failed: %s (%d)\n", \
+ (reason), (ret))
+
+/*
+ * Initialize the Avago Thermal sensor.
+ *
+ * After initialization, enable polling of thermal sensor through
+ * SBus interface. In order for this to work, the SBus Master
+ * firmware has to be loaded due to the fact that the HW polling
+ * logic uses SBus interrupts, which are not supported with
+ * default firmware. Otherwise, no data will be returned through
+ * the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ if (dd->icode != ICODE_RTL_SILICON ||
+ !(dd->flags & HFI1_DO_INIT_ASIC))
+ return ret;
+
+ acquire_hw_mutex(dd);
+ dd_dev_info(dd, "Initializing thermal sensor\n");
+ /* Thermal Sensor Initialization */
+ /* Step 1: Reset the Thermal SBus Receiver */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ RESET_SBUS_RECEIVER, 0);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Bus Reset");
+ goto done;
+ }
+ /* Step 2: Set Reset bit in Thermal block */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ WRITE_SBUS_RECEIVER, 0x1);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Therm Block Reset");
+ goto done;
+ }
+ /* Step 3: Write clock divider value (100MHz -> 2MHz) */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+ WRITE_SBUS_RECEIVER, 0x32);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Clock Div");
+ goto done;
+ }
+ /* Step 4: Select temperature mode */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+ WRITE_SBUS_RECEIVER,
+ SBUS_THERM_MONITOR_MODE);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Mode Sel");
+ goto done;
+ }
+ /* Step 5: De-assert block reset and start conversion */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ WRITE_SBUS_RECEIVER, 0x2);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Reset Deassert");
+ goto done;
+ }
+ /* Step 5.1: Wait for first conversion (21.5ms per spec) */
+ msleep(22);
+
+ /* Enable polling of thermal readings */
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+done:
+ release_hw_mutex(dd);
+ return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd = &dd->pport[0];
+ /*
+ * Thermal Critical Interrupt
+ * Put the device into forced freeze mode, take link down to
+ * offline, and put DC into reset.
+ */
+ dd_dev_emerg(dd,
+ "Critical temperature reached! Forcing device into freeze mode!\n");
+ dd->flags |= HFI1_FORCED_FREEZE;
+ start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
+ /*
+ * Shut DC down as much and as quickly as possible.
+ *
+ * Step 1: Take the link down to OFFLINE. This will cause the
+ * 8051 to put the Serdes in reset. However, we don't want to
+ * go through the entire link state machine since we want to
+ * shutdown ASAP. Furthermore, this is not a graceful shutdown
+ * but rather an attempt to save the chip.
+ * Code below is almost the same as quiet_serdes() but avoids
+ * all the extra work and the sleeps.
+ */
+ ppd->driver_link_ready = 0;
+ ppd->link_enabled = 0;
+ set_physical_link_state(dd, PLS_OFFLINE |
+ (OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
+ /*
+ * Step 2: Shutdown LCB and 8051
+ * After shutdown, do not restore DC_CFG_RESET value.
+ */
+ dc_shutdown(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
new file mode 100644
index 000000000000..f89a432c7334
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip.h
@@ -0,0 +1,1035 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that is specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */
+#define PIO_BLOCK_SIZE 64 /* bytes */
+#define SDMA_BLOCK_SIZE 64 /* bytes */
+#define RCV_BUF_BLOCK_SIZE 64 /* bytes */
+#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */
+/* Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
+ at 64 bytes for all generation one devices */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR (1ull << 31)
+#define PBC_DC_INFO_SHIFT (30)
+#define PBC_DC_INFO (1ull << PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP (1ull << 29)
+#define PBC_PACKET_BYPASS (1ull << 28)
+#define PBC_CREDIT_RETURN (1ull << 25)
+#define PBC_INSERT_BYPASS_ICRC (1ull << 24)
+#define PBC_TEST_BAD_ICRC (1ull << 23)
+#define PBC_FECN (1ull << 22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+ (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+ PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+ (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+ (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START 0
+#define IS_SDMAENG_ERR_START 16
+#define IS_SENDCTXT_ERR_START 32
+#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START 240
+#define IS_DC_START 248
+#define IS_RCVAVAIL_START 256
+#define IS_RCVURGENT_START 416
+#define IS_SENDCREDIT_START 576
+#define IS_RESERVED_START 736
+#define IS_MAX_SOURCES 768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END IS_SDMA_START
+#define IS_SDMA_END IS_VARIOUS_START
+#define IS_VARIOUS_END IS_DC_START
+#define IS_DC_END IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END IS_RCVURGENT_START
+#define IS_RCVURGENT_END IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END IS_RESERVED_START
+#define IS_RESERVED_END IS_MAX_SOURCES
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT 242
+#define QSFP2_INT 243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN 0x1
+#define LSTATE_INIT 0x2
+#define LSTATE_ARMED 0x3
+#define LSTATE_ACTIVE 0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED 0x30
+#define PLS_OFFLINE 0x90
+#define PLS_OFFLINE_QUIET 0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92
+#define PLS_OFFLINE_REPORT_FAILURE 0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94
+#define PLS_POLLING 0x20
+#define PLS_POLLING_QUIET 0x20
+#define PLS_POLLING_ACTIVE 0x21
+#define PLS_CONFIGPHY 0x40
+#define PLS_CONFIGPHY_DEBOUCE 0x40
+#define PLS_CONFIGPHY_ESTCOMM 0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42
+#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE 0x43
+#define PLS_CONFIGPHY_OPTEQ 0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45
+#define PLS_CONFIGPHY_VERIFYCAP 0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT 0x48
+#define PLS_CONFIGLT_CONFIGURE 0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49
+#define PLS_LINKUP 0x50
+#define PLS_PHYTEST 0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1
+#define PLS_QUICK_LINKUP 0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA 0x01
+#define HCMD_READ_CONFIG_DATA 0x02
+#define HCMD_CHANGE_PHY_STATE 0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC 0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR 0x07
+#define HCMD_INTERFACE_TEST 0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED (1 << 0)
+#define UNKNOWN_FRAME (1 << 1)
+#define TARGET_BER_NOT_MET (1 << 2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK (1 << 3)
+#define FAILED_SERDES_INIT (1 << 4)
+#define FAILED_LNI_POLLING (1 << 5)
+#define FAILED_LNI_DEBOUNCE (1 << 6)
+#define FAILED_LNI_ESTBCOMM (1 << 7)
+#define FAILED_LNI_OPTEQ (1 << 8)
+#define FAILED_LNI_VERIFY_CAP1 (1 << 9)
+#define FAILED_LNI_VERIFY_CAP2 (1 << 10)
+#define FAILED_LNI_CONFIGLT (1 << 11)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+ | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+ | FAILED_LNI_VERIFY_CAP1 \
+ | FAILED_LNI_VERIFY_CAP2 \
+ | FAILED_LNI_CONFIGLT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE (1 << 0)
+#define BC_PWR_MGM_MSG (1 << 1)
+#define BC_SMA_MSG (1 << 2)
+#define BC_BCC_UNKOWN_MSG (1 << 3)
+#define BC_IDLE_UNKNOWN_MSG (1 << 4)
+#define EXT_DEVICE_CFG_REQ (1 << 5)
+#define VERIFY_CAP_FRAME (1 << 6)
+#define LINKUP_ACHIEVED (1 << 7)
+#define LINK_GOING_DOWN (1 << 8)
+#define LINK_WIDTH_DOWNGRADED (1 << 9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG 0x01
+#define HREQ_SAVE_CONFIG 0x02
+#define HREQ_READ_CONFIG 0x03
+#define HREQ_SET_TX_EQ_ABS 0x04
+#define HREQ_SET_TX_EQ_REL 0x05
+#define HREQ_ENABLE 0x06
+#define HREQ_CONFIG_DONE 0xfe
+#define HREQ_INTERFACE_TEST 0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID 0x01
+#define HREQ_SUCCESS 0x02
+#define HREQ_NOT_SUPPORTED 0x03
+#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED 0xfe
+#define HREQ_EXECUTION_ONGOING 0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS 0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU 0x2
+#define IDLE_SMA 0x3
+#define IDLE_POWER_MGMT 0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM 1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffer to a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER (4 * 1024)
+#define MAX_EAGER_BUFFER (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT (1 << RCV_SHIFT)
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT)
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT 0x01 /* do not do recovery */
+#define FREEZE_SELF 0x02 /* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04 /* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON 0x00
+#define ICODE_RTL_VCS_SIMULATION 0x01
+#define ICODE_FPGA_EMULATION 0x02
+#define ICODE_FUNCTIONAL_SIMULATOR 0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS 0x8
+
+/* 8051 general register Field IDs */
+#define TX_SETTINGS 0x06
+#define VERIFY_CAP_LOCAL_PHY 0x07
+#define VERIFY_CAP_LOCAL_FABRIC 0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09
+#define LOCAL_DEVICE_ID 0x0a
+#define LOCAL_LNI_INFO 0x0c
+#define REMOTE_LNI_INFO 0x0d
+#define MISC_STATUS 0x0e
+#define VERIFY_CAP_REMOTE_PHY 0x0f
+#define VERIFY_CAP_REMOTE_FABRIC 0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE 0x12
+#define LAST_REMOTE_STATE_COMPLETE 0x13
+#define LINK_QUALITY_INFO 0x14
+#define REMOTE_DEVICE_ID 0x15
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT 0x0
+#define LOAD_DATA_DATA_MASK 0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT 0x0
+#define READ_DATA_DATA_MASK 0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT 0
+#define ENABLE_LANE_TX_MASK 0xff
+#define TX_POLARITY_INVERSION_SHIFT 8
+#define TX_POLARITY_INVERSION_MASK 0xff
+#define RX_POLARITY_INVERSION_SHIFT 16
+#define RX_POLARITY_INVERSION_MASK 0xff
+#define MAX_RATE_SHIFT 24
+#define MAX_RATE_MASK 0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1
+#define POWER_MANAGEMENT_SHIFT 0x0
+#define POWER_MANAGEMENT_MASK 0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT 0
+#define VAU_MASK 0x0007
+#define Z_SHIFT 3
+#define Z_MASK 0x0001
+#define VCU_SHIFT 4
+#define VCU_MASK 0x0007
+#define VL15BUF_SHIFT 8
+#define VL15BUF_MASK 0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK 0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0 /* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff /* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK 0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK 0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK 0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL 0x1
+#define PWRM_BANDWIDTH_CONTROL 0x2
+
+/* verify capability fabric CRC size bits */
+enum {
+ CAP_CRC_14B = (1 << 0), /* 14b CRC */
+ CAP_CRC_48B = (1 << 1), /* 48b CRC */
+ CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK 0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK 0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B 0x0 /* 16b CRC */
+#define LCB_CRC_14B 0x1 /* 14b CRC */
+#define LCB_CRC_48B 0x2 /* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */
+
+/* the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo) */
+enum {
+ PORT_LTP_CRC_MODE_NONE = 0,
+ PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+ PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+ PORT_LTP_CRC_MODE_48 = 4,
+ /* 48-bit overlapping LTP CRC mode (optional) */
+ PORT_LTP_CRC_MODE_PER_LANE = 8
+ /* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000 /* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */
+#define ASIC_CCLOCK_PS 1242 /* 805 MHz */
+#define FPGA_CCLOCK_PS 30300 /* 33 MHz */
+
+/*
+ * Mask of enabled MISC errors. Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+ (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+ | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE 0 /* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB 2
+#define LOOPBACK_CABLE 3 /* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavor of the CSR read/write functions are for
+ * per-context or per-SDMA CSRs that are not mappable to user-space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+ u32 offset0)
+{
+ /* kernel per-context CSRs are separated by 0x100 */
+ return read_csr(dd, offset0 + (0x100 * ctxt));
+}
+
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+ u32 offset0, u64 value)
+{
+ /* kernel per-context CSRs are separated by 0x100 */
+ write_csr(dd, offset0 + (0x100 * ctxt), value);
+}
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+ struct hfi1_devdata *dd,
+ u32 offset);
+
+static inline void __iomem *get_kctxt_csr_addr(
+ struct hfi1_devdata *dd,
+ int ctxt,
+ u32 offset0)
+{
+ return get_csr_addr(dd, offset0 + (0x100 * ctxt));
+}
+
+/*
+ * The *_uctxt_* flavor of the CSR read/write functions are for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+ u32 offset0)
+{
+ /* user per-context CSRs are separated by 0x1000 */
+ return read_csr(dd, offset0 + (0x1000 * ctxt));
+}
+
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+ u32 offset0, u64 value)
+{
+ /* user per-context CSRs are separated by 0x1000 */
+ write_csr(dd, offset0 + (0x1000 * ctxt), value);
+}
+
+u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
+
+/* firmware.c */
+#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+void sbus_request(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+void dispose_firmware(void);
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+ u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+int start_link(struct hfi1_pportdata *ppd);
+void init_qsfp(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+ int refresh_widths);
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+int is_a0(struct hfi1_devdata *dd);
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+#define LCB_START DC_LCB_CSRS
+#define LCB_END DC_8051_CSRS /* next block is 8051 */
+static inline int is_lcb_offset(u32 offset)
+{
+ return (offset >= LCB_START && offset < LCB_END);
+}
+
+extern uint num_vls;
+
+extern uint disable_integrity;
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+
+/* Per VL indexes */
+enum {
+ C_VL_0 = 0,
+ C_VL_1,
+ C_VL_2,
+ C_VL_3,
+ C_VL_4,
+ C_VL_5,
+ C_VL_6,
+ C_VL_7,
+ C_VL_15,
+ C_VL_COUNT
+};
+
+static inline int vl_from_idx(int idx)
+{
+ return (idx == C_VL_15 ? 15 : idx);
+}
+
+static inline int idx_from_vl(int vl)
+{
+ return (vl == 15 ? C_VL_15 : vl);
+}
+
+/* Per device counter indexes */
+enum {
+ C_RCV_OVF = 0,
+ C_RX_TID_FULL,
+ C_RX_TID_INVALID,
+ C_RX_TID_FLGMS,
+ C_RX_CTX_RHQS,
+ C_RX_CTX_EGRS,
+ C_RCV_TID_FLSMS,
+ C_CCE_PCI_CR_ST,
+ C_CCE_PCI_TR_ST,
+ C_CCE_PIO_WR_ST,
+ C_CCE_ERR_INT,
+ C_CCE_SDMA_INT,
+ C_CCE_MISC_INT,
+ C_CCE_RCV_AV_INT,
+ C_CCE_RCV_URG_INT,
+ C_CCE_SEND_CR_INT,
+ C_DC_UNC_ERR,
+ C_DC_RCV_ERR,
+ C_DC_FM_CFG_ERR,
+ C_DC_RMT_PHY_ERR,
+ C_DC_DROPPED_PKT,
+ C_DC_MC_XMIT_PKTS,
+ C_DC_MC_RCV_PKTS,
+ C_DC_XMIT_CERR,
+ C_DC_RCV_CERR,
+ C_DC_RCV_FCC,
+ C_DC_XMIT_FCC,
+ C_DC_XMIT_FLITS,
+ C_DC_RCV_FLITS,
+ C_DC_XMIT_PKTS,
+ C_DC_RCV_PKTS,
+ C_DC_RX_FLIT_VL,
+ C_DC_RX_PKT_VL,
+ C_DC_RCV_FCN,
+ C_DC_RCV_FCN_VL,
+ C_DC_RCV_BCN,
+ C_DC_RCV_BCN_VL,
+ C_DC_RCV_BBL,
+ C_DC_RCV_BBL_VL,
+ C_DC_MARK_FECN,
+ C_DC_MARK_FECN_VL,
+ C_DC_TOTAL_CRC,
+ C_DC_CRC_LN0,
+ C_DC_CRC_LN1,
+ C_DC_CRC_LN2,
+ C_DC_CRC_LN3,
+ C_DC_CRC_MULT_LN,
+ C_DC_TX_REPLAY,
+ C_DC_RX_REPLAY,
+ C_DC_SEQ_CRC_CNT,
+ C_DC_ESC0_ONLY_CNT,
+ C_DC_ESC0_PLUS1_CNT,
+ C_DC_ESC0_PLUS2_CNT,
+ C_DC_REINIT_FROM_PEER_CNT,
+ C_DC_SBE_CNT,
+ C_DC_MISC_FLG_CNT,
+ C_DC_PRF_GOOD_LTP_CNT,
+ C_DC_PRF_ACCEPTED_LTP_CNT,
+ C_DC_PRF_RX_FLIT_CNT,
+ C_DC_PRF_TX_FLIT_CNT,
+ C_DC_PRF_CLK_CNTR,
+ C_DC_PG_DBG_FLIT_CRDTS_CNT,
+ C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+ C_DC_PG_STS_TX_SBE_CNT,
+ C_DC_PG_STS_TX_MBE_CNT,
+ C_SW_CPU_INTR,
+ C_SW_CPU_RCV_LIM,
+ C_SW_VTX_WAIT,
+ C_SW_PIO_WAIT,
+ C_SW_KMEM_WAIT,
+ DEV_CNTR_LAST /* Must be kept last */
+};
+
+/* Per port counter indexes */
+enum {
+ C_TX_UNSUP_VL = 0,
+ C_TX_INVAL_LEN,
+ C_TX_MM_LEN_ERR,
+ C_TX_UNDERRUN,
+ C_TX_FLOW_STALL,
+ C_TX_DROPPED,
+ C_TX_HDR_ERR,
+ C_TX_PKT,
+ C_TX_WORDS,
+ C_TX_WAIT,
+ C_TX_FLIT_VL,
+ C_TX_PKT_VL,
+ C_TX_WAIT_VL,
+ C_RX_PKT,
+ C_RX_WORDS,
+ C_SW_LINK_DOWN,
+ C_SW_LINK_UP,
+ C_SW_XMIT_DSCD,
+ C_SW_XMIT_DSCD_VL,
+ C_SW_XMIT_CSTR_ERR,
+ C_SW_RCV_CSTR_ERR,
+ C_SW_IBP_LOOP_PKTS,
+ C_SW_IBP_RC_RESENDS,
+ C_SW_IBP_RNR_NAKS,
+ C_SW_IBP_OTHER_NAKS,
+ C_SW_IBP_RC_TIMEOUTS,
+ C_SW_IBP_PKT_DROPS,
+ C_SW_IBP_DMA_WAIT,
+ C_SW_IBP_RC_SEQNAK,
+ C_SW_IBP_RC_DUPREQ,
+ C_SW_IBP_RDMA_SEQ,
+ C_SW_IBP_UNALIGNED,
+ C_SW_IBP_SEQ_NAK,
+ C_SW_CPU_RC_ACKS,
+ C_SW_CPU_RC_QACKS,
+ C_SW_CPU_RC_DELAYED_COMP,
+ C_RCV_HDR_OVF_0,
+ C_RCV_HDR_OVF_1,
+ C_RCV_HDR_OVF_2,
+ C_RCV_HDR_OVF_3,
+ C_RCV_HDR_OVF_4,
+ C_RCV_HDR_OVF_5,
+ C_RCV_HDR_OVF_6,
+ C_RCV_HDR_OVF_7,
+ C_RCV_HDR_OVF_8,
+ C_RCV_HDR_OVF_9,
+ C_RCV_HDR_OVF_10,
+ C_RCV_HDR_OVF_11,
+ C_RCV_HDR_OVF_12,
+ C_RCV_HDR_OVF_13,
+ C_RCV_HDR_OVF_14,
+ C_RCV_HDR_OVF_15,
+ C_RCV_HDR_OVF_16,
+ C_RCV_HDR_OVF_17,
+ C_RCV_HDR_OVF_18,
+ C_RCV_HDR_OVF_19,
+ C_RCV_HDR_OVF_20,
+ C_RCV_HDR_OVF_21,
+ C_RCV_HDR_OVF_22,
+ C_RCV_HDR_OVF_23,
+ C_RCV_HDR_OVF_24,
+ C_RCV_HDR_OVF_25,
+ C_RCV_HDR_OVF_26,
+ C_RCV_HDR_OVF_27,
+ C_RCV_HDR_OVF_28,
+ C_RCV_HDR_OVF_29,
+ C_RCV_HDR_OVF_30,
+ C_RCV_HDR_OVF_31,
+ C_RCV_HDR_OVF_32,
+ C_RCV_HDR_OVF_33,
+ C_RCV_HDR_OVF_34,
+ C_RCV_HDR_OVF_35,
+ C_RCV_HDR_OVF_36,
+ C_RCV_HDR_OVF_37,
+ C_RCV_HDR_OVF_38,
+ C_RCV_HDR_OVF_39,
+ C_RCV_HDR_OVF_40,
+ C_RCV_HDR_OVF_41,
+ C_RCV_HDR_OVF_42,
+ C_RCV_HDR_OVF_43,
+ C_RCV_HDR_OVF_44,
+ C_RCV_HDR_OVF_45,
+ C_RCV_HDR_OVF_46,
+ C_RCV_HDR_OVF_47,
+ C_RCV_HDR_OVF_48,
+ C_RCV_HDR_OVF_49,
+ C_RCV_HDR_OVF_50,
+ C_RCV_HDR_OVF_51,
+ C_RCV_HDR_OVF_52,
+ C_RCV_HDR_OVF_53,
+ C_RCV_HDR_OVF_54,
+ C_RCV_HDR_OVF_55,
+ C_RCV_HDR_OVF_56,
+ C_RCV_HDR_OVF_57,
+ C_RCV_HDR_OVF_58,
+ C_RCV_HDR_OVF_59,
+ C_RCV_HDR_OVF_60,
+ C_RCV_HDR_OVF_61,
+ C_RCV_HDR_OVF_62,
+ C_RCV_HDR_OVF_63,
+ C_RCV_HDR_OVF_64,
+ C_RCV_HDR_OVF_65,
+ C_RCV_HDR_OVF_66,
+ C_RCV_HDR_OVF_67,
+ C_RCV_HDR_OVF_68,
+ C_RCV_HDR_OVF_69,
+ C_RCV_HDR_OVF_70,
+ C_RCV_HDR_OVF_71,
+ C_RCV_HDR_OVF_72,
+ C_RCV_HDR_OVF_73,
+ C_RCV_HDR_OVF_74,
+ C_RCV_HDR_OVF_75,
+ C_RCV_HDR_OVF_76,
+ C_RCV_HDR_OVF_77,
+ C_RCV_HDR_OVF_78,
+ C_RCV_HDR_OVF_79,
+ C_RCV_HDR_OVF_80,
+ C_RCV_HDR_OVF_81,
+ C_RCV_HDR_OVF_82,
+ C_RCV_HDR_OVF_83,
+ C_RCV_HDR_OVF_84,
+ C_RCV_HDR_OVF_85,
+ C_RCV_HDR_OVF_86,
+ C_RCV_HDR_OVF_87,
+ C_RCV_HDR_OVF_88,
+ C_RCV_HDR_OVF_89,
+ C_RCV_HDR_OVF_90,
+ C_RCV_HDR_OVF_91,
+ C_RCV_HDR_OVF_92,
+ C_RCV_HDR_OVF_93,
+ C_RCV_HDR_OVF_94,
+ C_RCV_HDR_OVF_95,
+ C_RCV_HDR_OVF_96,
+ C_RCV_HDR_OVF_97,
+ C_RCV_HDR_OVF_98,
+ C_RCV_HDR_OVF_99,
+ C_RCV_HDR_OVF_100,
+ C_RCV_HDR_OVF_101,
+ C_RCV_HDR_OVF_102,
+ C_RCV_HDR_OVF_103,
+ C_RCV_HDR_OVF_104,
+ C_RCV_HDR_OVF_105,
+ C_RCV_HDR_OVF_106,
+ C_RCV_HDR_OVF_107,
+ C_RCV_HDR_OVF_108,
+ C_RCV_HDR_OVF_109,
+ C_RCV_HDR_OVF_110,
+ C_RCV_HDR_OVF_111,
+ C_RCV_HDR_OVF_112,
+ C_RCV_HDR_OVF_113,
+ C_RCV_HDR_OVF_114,
+ C_RCV_HDR_OVF_115,
+ C_RCV_HDR_OVF_116,
+ C_RCV_HDR_OVF_117,
+ C_RCV_HDR_OVF_118,
+ C_RCV_HDR_OVF_119,
+ C_RCV_HDR_OVF_120,
+ C_RCV_HDR_OVF_121,
+ C_RCV_HDR_OVF_122,
+ C_RCV_HDR_OVF_123,
+ C_RCV_HDR_OVF_124,
+ C_RCV_HDR_OVF_125,
+ C_RCV_HDR_OVF_126,
+ C_RCV_HDR_OVF_127,
+ C_RCV_HDR_OVF_128,
+ C_RCV_HDR_OVF_129,
+ C_RCV_HDR_OVF_130,
+ C_RCV_HDR_OVF_131,
+ C_RCV_HDR_OVF_132,
+ C_RCV_HDR_OVF_133,
+ C_RCV_HDR_OVF_134,
+ C_RCV_HDR_OVF_135,
+ C_RCV_HDR_OVF_136,
+ C_RCV_HDR_OVF_137,
+ C_RCV_HDR_OVF_138,
+ C_RCV_HDR_OVF_139,
+ C_RCV_HDR_OVF_140,
+ C_RCV_HDR_OVF_141,
+ C_RCV_HDR_OVF_142,
+ C_RCV_HDR_OVF_143,
+ C_RCV_HDR_OVF_144,
+ C_RCV_HDR_OVF_145,
+ C_RCV_HDR_OVF_146,
+ C_RCV_HDR_OVF_147,
+ C_RCV_HDR_OVF_148,
+ C_RCV_HDR_OVF_149,
+ C_RCV_HDR_OVF_150,
+ C_RCV_HDR_OVF_151,
+ C_RCV_HDR_OVF_152,
+ C_RCV_HDR_OVF_153,
+ C_RCV_HDR_OVF_154,
+ C_RCV_HDR_OVF_155,
+ C_RCV_HDR_OVF_156,
+ C_RCV_HDR_OVF_157,
+ C_RCV_HDR_OVF_158,
+ C_RCV_HDR_OVF_159,
+ PORT_CNTR_LAST /* Must be kept last */
+};
+
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+ struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+ struct hfi1_ctxt_info *kinfo);
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+ u32 mask);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+ u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+ u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+ char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry is an interrupt source "type". It is ordered by increasing
+ * number.
+ */
+struct is_table {
+ int start; /* interrupt source type start */
+ int end; /* interrupt source type end */
+ /* routine that returns the name of the interrupt source */
+ char *(*is_name)(char *name, size_t size, unsigned int source);
+ /* routine to call when receiving an interrupt */
+ void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
+
+#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
new file mode 100644
index 000000000000..bf45de29d8bd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/chip_registers.h
@@ -0,0 +1,1292 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE 0x000000000000
+#define CCE (CORE + 0x000000000000)
+#define ASIC (CORE + 0x000000400000)
+#define MISC (CORE + 0x000000500000)
+#define DC_TOP_CSRS (CORE + 0x000000600000)
+#define CHIP_DEBUG (CORE + 0x000000700000)
+#define RXE (CORE + 0x000001000000)
+#define TXE (CORE + 0x000001800000)
+#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000)
+#define PCIE 0
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_CONTEXT_RHQ_STALL 21
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+ 0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+ 0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+ 0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000)
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000)
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070)
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060)
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+ 0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+ 0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+ 0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+ 0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+ 0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+ 0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+ 0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+ 0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+ 0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+ 0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028)
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0)
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+ 0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+ 0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+ 0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+ 0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+ 0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+ 0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+ 0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+ 0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+ 0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010)
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+ 0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+ 0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+ 0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+ 0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+ 0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+ 0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+ 0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+ 0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+ 0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+ 0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+ 0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+ 0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+ 0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+ 0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+ 0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+ 0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+ 0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+ 0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+ 0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+ 0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+ 0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+ 0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+ 0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+ 0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+ 0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+ 0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+ 0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+ 0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+ 0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+ 0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+ 0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+ 0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+ 0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+ 0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+ 0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+ 0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+ 0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+ 0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+ 0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+ 0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+ 0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+ 0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+ 0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+ 0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+ 0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+ 0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+ 0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+ 0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+ 0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+ 0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+ 0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+ 0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+ 0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+ 0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+ 0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+ 0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+ 0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+ 0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+ 0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+ 0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+ 0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+ 0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+ 0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+ 0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+ 0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+ 0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+ 0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+ 0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+ 0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+ 0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
+
+#endif /* DEF_CHIP_REG */
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
new file mode 100644
index 000000000000..5f2293729cf9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/common.h
@@ -0,0 +1,415 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fast path code
+ * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fast path code
+ * _HFI1_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT 24
+#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT 63
+#define HFI1_CAP_LOCKED_MASK 0x1ULL
+#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+ HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~HFI1_CAP_##cap; \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_USET(cap) \
+ ({ \
+ hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_UCLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_SET(cap) \
+ ({ \
+ hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \
+ HFI1_CAP_USER_SHIFT)); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_CLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~(HFI1_CAP_##cap | \
+ (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_LOCK() \
+ ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
+/*
+ * The set of capability bits that can be changed after initial load
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \
+ HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_NODROP_RHQ_FULL | \
+ HFI1_CAP_NODROP_EGR_FULL | \
+ HFI1_CAP_ALLOW_PERM_JKEY | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_PRINT_UNIMPL)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
+ HFI1_CAP_USE_SDMA_HEAD | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_QSFP_ENABLED | \
+ HFI1_CAP_NO_INTEGRITY | \
+ HFI1_CAP_PKEY_CHECK) << \
+ HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for kernel context in
+ * order to be allowed for user contexts, as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_NODROP_RHQ_FULL | \
+ HFI1_CAP_NODROP_EGR_FULL | \
+ HFI1_CAP_SDMA | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_QSFP_ENABLED | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_EXTENDED_PSN | \
+ ((HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_EARLY_CREDIT_RETURN) << \
+ HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a Intel release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
+
+/*
+ * Define the driver version number. This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-248"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+ __u16 version; /* structure version */
+ __u16 unit; /* which device */
+ __u16 sw_index; /* send sw index to use */
+ __u16 len; /* data length, in bytes */
+ __u16 port; /* port number */
+ __u16 unused;
+ __u32 flags; /* call flags */
+ __u64 data; /* user data pointer */
+ __u64 pbc; /* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT 0
+#define RHF_PKT_LEN_MASK 0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT 12
+#define RHF_RCV_TYPE_MASK 0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT 15
+#define RHF_USE_EGR_BFR_MASK 0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT 16
+#define RHF_EGR_INDEX_MASK 0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT 27
+#define RHF_DC_INFO_MASK 0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT 28
+#define RHF_RCV_SEQ_MASK 0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT 32
+#define RHF_EGR_OFFSET_MASK 0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT 44
+#define RHF_HDRQ_OFFSET_MASK 0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR (0x1ull << 53)
+#define RHF_DC_UNC_ERR (0x1ull << 54)
+#define RHF_DC_ERR (0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT 56
+#define RHF_RCV_TYPE_ERR_MASK 0x7ul
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR (0x1ull << 59)
+#define RHF_LEN_ERR (0x1ull << 60)
+#define RHF_ECC_ERR (0x1ull << 61)
+#define RHF_VCRC_ERR (0x1ull << 62)
+#define RHF_ICRC_ERR (0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER 1
+#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR 3
+#define RHF_RCV_TYPE_BYPASS 4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR 0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR 0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR 0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR 0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR 0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR 0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+ __be16 lrh[4];
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY 0x7FFF
+#define FULL_MGMT_P_KEY 0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_PERMISSIVE_LID 0xFFFF
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_QPN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT)
+#define HFI1_MULTICAST_LID_BASE 0xC000
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf)
+{
+ return __le64_to_cpu(*((__le64 *)rbuf));
+}
+
+static inline u64 rhf_err_flags(u64 rhf)
+{
+ return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf)
+{
+ return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf)
+{
+ return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf)
+{
+ return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf)
+{
+ return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf)
+{
+ return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf)
+{
+ return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf)
+{
+ return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf)
+{
+ return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf)
+{
+ return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
+}
+#endif /* _COMMON_H */
diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/staging/rdma/hfi1/cq.c
new file mode 100644
index 000000000000..4f046ffe7e60
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/cq.c
@@ -0,0 +1,558 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+
+#include "verbs.h"
+#include "hfi.h"
+
+/**
+ * hfi1_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited)
+{
+ struct hfi1_cq_wc *wc;
+ unsigned long flags;
+ u32 head;
+ u32 next;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ /*
+ * Note that the head pointer might be writable by user processes.
+ * Take care to verify it is a sane value.
+ */
+ wc = cq->queue;
+ head = wc->head;
+ if (head >= (unsigned) cq->ibcq.cqe) {
+ head = cq->ibcq.cqe;
+ next = 0;
+ } else
+ next = head + 1;
+ if (unlikely(next == wc->tail)) {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return;
+ }
+ if (cq->ip) {
+ wc->uqueue[head].wr_id = entry->wr_id;
+ wc->uqueue[head].status = entry->status;
+ wc->uqueue[head].opcode = entry->opcode;
+ wc->uqueue[head].vendor_err = entry->vendor_err;
+ wc->uqueue[head].byte_len = entry->byte_len;
+ wc->uqueue[head].ex.imm_data =
+ (__u32 __force)entry->ex.imm_data;
+ wc->uqueue[head].qp_num = entry->qp->qp_num;
+ wc->uqueue[head].src_qp = entry->src_qp;
+ wc->uqueue[head].wc_flags = entry->wc_flags;
+ wc->uqueue[head].pkey_index = entry->pkey_index;
+ wc->uqueue[head].slid = entry->slid;
+ wc->uqueue[head].sl = entry->sl;
+ wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ wc->uqueue[head].port_num = entry->port_num;
+ /* Make sure entry is written before the head index. */
+ smp_wmb();
+ } else
+ wc->kqueue[head] = *entry;
+ wc->head = next;
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED &&
+ (solicited || entry->status != IB_WC_SUCCESS))) {
+ struct kthread_worker *worker;
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ smp_read_barrier_depends(); /* see hfi1_cq_exit */
+ worker = cq->dd->worker;
+ if (likely(worker)) {
+ cq->notify = IB_CQ_NONE;
+ cq->triggered++;
+ queue_kthread_work(worker, &cq->comptask);
+ }
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+/**
+ * hfi1_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context. Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct hfi1_cq *cq = to_icq(ibcq);
+ struct hfi1_cq_wc *wc;
+ unsigned long flags;
+ int npolled;
+ u32 tail;
+
+ /* The kernel can only poll a kernel completion queue */
+ if (cq->ip) {
+ npolled = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ wc = cq->queue;
+ tail = wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (tail == wc->head)
+ break;
+ /* The kernel doesn't need a RMB since it has the lock. */
+ *entry = wc->kqueue[tail];
+ if (tail >= cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ wc->tail = tail;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+ return npolled;
+}
+
+static void send_complete(struct kthread_work *work)
+{
+ struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask);
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, queue_work()
+ * won't call us again until we return so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ /*
+ * IPoIB connected mode assumes the callback is from a
+ * soft IRQ. We simulate this by blocking "bottom halves".
+ * See the implementation for ipoib_cm_handle_tx_wc(),
+ * netif_tx_lock_bh() and netif_tx_lock().
+ */
+ local_bh_disable();
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+ local_bh_enable();
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/**
+ * hfi1_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: unused by the driver
+ * @udata: user data for libibverbs.so
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *hfi1_create_cq(
+ struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+ struct hfi1_cq *cq;
+ struct hfi1_cq_wc *wc;
+ struct ib_cq *ret;
+ u32 sz;
+ unsigned int entries = attr->cqe;
+
+ if (attr->flags)
+ return ERR_PTR(-EINVAL);
+
+ if (entries < 1 || entries > hfi1_max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ /* Allocate the completion queue structure. */
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Allocate the completion queue entries and head/tail pointers.
+ * This is allocated separately so that it can be resized and
+ * also mapped into user space.
+ * We need to use vmalloc() in order to support mmap and large
+ * numbers of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+ else
+ sz += sizeof(struct ib_wc) * (entries + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_cq;
+ }
+
+ /*
+ * Return the address of the WC as the offset to mmap.
+ * See hfi1_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+
+ cq->ip = hfi1_create_mmap_info(dev, sz, context, wc);
+ if (!cq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wc;
+ }
+
+ err = ib_copy_to_udata(udata, &cq->ip->offset,
+ sizeof(cq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ cq->ip = NULL;
+
+ spin_lock(&dev->n_cqs_lock);
+ if (dev->n_cqs_allocated == hfi1_max_cqs) {
+ spin_unlock(&dev->n_cqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_cqs_allocated++;
+ spin_unlock(&dev->n_cqs_lock);
+
+ if (cq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested or return
+ * an error.
+ */
+ cq->dd = dd_from_dev(dev);
+ cq->ibcq.cqe = entries;
+ cq->notify = IB_CQ_NONE;
+ cq->triggered = 0;
+ spin_lock_init(&cq->lock);
+ init_kthread_work(&cq->comptask, send_complete);
+ wc->head = 0;
+ wc->tail = 0;
+ cq->queue = wc;
+
+ ret = &cq->ibcq;
+
+ goto done;
+
+bail_ip:
+ kfree(cq->ip);
+bail_wc:
+ vfree(wc);
+bail_cq:
+ kfree(cq);
+done:
+ return ret;
+}
+
+/**
+ * hfi1_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int hfi1_destroy_cq(struct ib_cq *ibcq)
+{
+ struct hfi1_ibdev *dev = to_idev(ibcq->device);
+ struct hfi1_cq *cq = to_icq(ibcq);
+
+ flush_kthread_work(&cq->comptask);
+ spin_lock(&dev->n_cqs_lock);
+ dev->n_cqs_allocated--;
+ spin_unlock(&dev->n_cqs_lock);
+ if (cq->ip)
+ kref_put(&cq->ip->ref, hfi1_release_mmap_info);
+ else
+ vfree(cq->queue);
+ kfree(cq);
+
+ return 0;
+}
+
+/**
+ * hfi1_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context. Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct hfi1_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+ if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+ cq->queue->head != cq->queue->tail)
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return ret;
+}
+
+/**
+ * hfi1_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct hfi1_cq *cq = to_icq(ibcq);
+ struct hfi1_cq_wc *old_wc;
+ struct hfi1_cq_wc *wc;
+ u32 head, tail, n;
+ int ret;
+ u32 sz;
+
+ if (cqe < 1 || cqe > hfi1_max_cqes) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ else
+ sz += sizeof(struct ib_wc) * (cqe + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&cq->lock);
+ /*
+ * Make sure head and tail are sane since they
+ * might be user writable.
+ */
+ old_wc = cq->queue;
+ head = old_wc->head;
+ if (head > (u32) cq->ibcq.cqe)
+ head = (u32) cq->ibcq.cqe;
+ tail = old_wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ if (head < tail)
+ n = cq->ibcq.cqe + 1 + head - tail;
+ else
+ n = head - tail;
+ if (unlikely((u32)cqe < n)) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ for (n = 0; tail != head; n++) {
+ if (cq->ip)
+ wc->uqueue[n] = old_wc->uqueue[tail];
+ else
+ wc->kqueue[n] = old_wc->kqueue[tail];
+ if (tail == (u32) cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ cq->ibcq.cqe = cqe;
+ wc->head = n;
+ wc->tail = 0;
+ cq->queue = wc;
+ spin_unlock_irq(&cq->lock);
+
+ vfree(old_wc);
+
+ if (cq->ip) {
+ struct hfi1_ibdev *dev = to_idev(ibcq->device);
+ struct hfi1_mmap_info *ip = cq->ip;
+
+ hfi1_update_mmap_info(dev, ip, sz, wc);
+
+ /*
+ * Return the offset to mmap.
+ * See hfi1_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&cq->lock);
+bail_free:
+ vfree(wc);
+bail:
+ return ret;
+}
+
+int hfi1_cq_init(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+ int cpu;
+ struct task_struct *task;
+
+ if (dd->worker)
+ return 0;
+ dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
+ if (!dd->worker)
+ return -ENOMEM;
+ init_kthread_worker(dd->worker);
+ task = kthread_create_on_node(
+ kthread_worker_fn,
+ dd->worker,
+ dd->assigned_node_id,
+ "hfi1_cq%d", dd->unit);
+ if (IS_ERR(task))
+ goto task_fail;
+ cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
+ kthread_bind(task, cpu);
+ wake_up_process(task);
+out:
+ return ret;
+task_fail:
+ ret = PTR_ERR(task);
+ kfree(dd->worker);
+ dd->worker = NULL;
+ goto out;
+}
+
+void hfi1_cq_exit(struct hfi1_devdata *dd)
+{
+ struct kthread_worker *worker;
+
+ worker = dd->worker;
+ if (!worker)
+ return;
+ /* blocks future queuing from send_complete() */
+ dd->worker = NULL;
+ smp_wmb(); /* See hfi1_cq_enter */
+ flush_kthread_worker(worker);
+ kthread_stop(worker->task);
+ kfree(worker);
+}
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
new file mode 100644
index 000000000000..acd2269e9f14
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.c
@@ -0,0 +1,899 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+ .start = _##name##_seq_start, \
+ .next = _##name##_seq_next, \
+ .stop = _##name##_seq_stop, \
+ .show = _##name##_seq_show \
+}
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+ struct seq_file *seq; \
+ int ret; \
+ ret = seq_open(s, &_##name##_seq_ops); \
+ if (ret) \
+ return ret; \
+ seq = s->private_data; \
+ seq->private = inode->i_private; \
+ return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+ .owner = THIS_MODULE, \
+ .open = _##name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
+do { \
+ struct dentry *ent; \
+ ent = debugfs_create_file(name, mode, parent, \
+ data, ops); \
+ if (!ent) \
+ pr_warn("create of %s failed\n", name); \
+} while (0)
+
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+ DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+ struct hfi1_opcode_stats_perctx *opstats;
+
+ rcu_read_lock();
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_opcode_stats_perctx *opstats;
+
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+ loff_t i = *spos, j;
+ u64 n_packets = 0, n_bytes = 0;
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ for (j = 0; j < dd->first_user_ctxt; j++) {
+ if (!dd->rcd[j])
+ continue;
+ n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+ n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+ }
+ if (!n_packets && !n_bytes)
+ return SEQ_SKIP;
+ seq_printf(s, "%02llx %llu/%llu\n", i,
+ (unsigned long long) n_packets,
+ (unsigned long long) n_bytes);
+
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN)
+ return pos;
+
+ ++*pos;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+ /* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos;
+ loff_t i, j;
+ u64 n_packets = 0;
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "Ctx:npkts\n");
+ return 0;
+ }
+
+ spos = v;
+ i = *spos;
+
+ if (!dd->rcd[i])
+ return SEQ_SKIP;
+
+ for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+ n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+ if (!n_packets)
+ return SEQ_SKIP;
+
+ seq_printf(s, " %llu:%llu\n", i, n_packets);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+ struct qp_iter *iter;
+ loff_t n = *pos;
+
+ rcu_read_lock();
+ iter = qp_iter_init(s->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+ loff_t *pos)
+{
+ struct qp_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+__releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+ struct qp_iter *iter = iter_ptr;
+
+ if (!iter)
+ return 0;
+
+ qp_iter_print(s, iter);
+
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+ struct hfi1_ibdev *ibd;
+ struct hfi1_devdata *dd;
+
+ rcu_read_lock();
+ ibd = (struct hfi1_ibdev *)s->private;
+ dd = dd_from_dev(ibd);
+ if (!dd->per_sdma || *pos >= dd->num_sdma)
+ return NULL;
+ return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ ++*pos;
+ if (!dd->per_sdma || *pos >= dd->num_sdma)
+ return NULL;
+ return pos;
+}
+
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ loff_t *spos = v;
+ loff_t i = *spos;
+
+ sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ rcu_read_lock();
+ dd = private2dd(file);
+ avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters);
+ rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+ rcu_read_unlock();
+ return rval;
+}
+
+/* read the per-device counters */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ rcu_read_lock();
+ dd = private2dd(file);
+ avail = hfi1_read_cntrs(dd, *ppos, &names, NULL);
+ rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+ rcu_read_unlock();
+ return rval;
+}
+
+struct counter_info {
+ char *name;
+ const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ rcu_read_lock();
+ dd = private2dd(file);
+ /* port number n/a here since names are constant */
+ avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL);
+ rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+ rcu_read_unlock();
+ return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ ssize_t rval;
+
+ rcu_read_lock();
+ ppd = private2ppd(file);
+ dd = ppd->dd;
+ avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters);
+ rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+ rcu_read_unlock();
+ return rval;
+}
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hfi1_pportdata *ppd;
+ char *tmp;
+ int ret;
+
+ rcu_read_lock();
+ ppd = private2ppd(file);
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tmp) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
+
+ ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+ if (ret > 0)
+ ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+ rcu_read_unlock();
+ kfree(tmp);
+ return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int i2c_addr;
+ int offset;
+ int total_written;
+
+ rcu_read_lock();
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff) {
+ ret = -ENOMEM;
+ goto _return;
+ }
+
+ ret = copy_from_user(buff, buf, count);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ i2c_addr = (*ppos >> 16) & 0xff;
+ offset = *ppos & 0xffff;
+
+ total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+ if (total_written < 0) {
+ ret = total_written;
+ goto _free;
+ }
+
+ *ppos += total_written;
+
+ ret = total_written;
+
+ _free:
+ kfree(buff);
+ _return:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int i2c_addr;
+ int offset;
+ int total_read;
+
+ rcu_read_lock();
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff) {
+ ret = -ENOMEM;
+ goto _return;
+ }
+
+ i2c_addr = (*ppos >> 16) & 0xff;
+ offset = *ppos & 0xffff;
+
+ total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+ if (total_read < 0) {
+ ret = total_read;
+ goto _free;
+ }
+
+ *ppos += total_read;
+
+ ret = copy_to_user(buf, buff, total_read);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ ret = total_read;
+
+ _free:
+ kfree(buff);
+ _return:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int total_written;
+
+ rcu_read_lock();
+ if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+ ret = -EINVAL;
+ goto _return;
+ }
+
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff) {
+ ret = -ENOMEM;
+ goto _return;
+ }
+
+ ret = copy_from_user(buff, buf, count);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ total_written = qsfp_write(ppd, target, *ppos, buff, count);
+ if (total_written < 0) {
+ ret = total_written;
+ goto _free;
+ }
+
+ *ppos += total_written;
+
+ ret = total_written;
+
+ _free:
+ kfree(buff);
+ _return:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int total_read;
+
+ rcu_read_lock();
+ if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+ ret = -EINVAL;
+ goto _return;
+ }
+
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff) {
+ ret = -ENOMEM;
+ goto _return;
+ }
+
+ total_read = qsfp_read(ppd, target, *ppos, buff, count);
+ if (total_read < 0) {
+ ret = total_read;
+ goto _free;
+ }
+
+ *ppos += total_read;
+
+ ret = copy_to_user(buf, buff, total_read);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ ret = total_read;
+
+ _free:
+ kfree(buff);
+ _return:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine) \
+{ \
+ .name = nm, \
+ .ops = { \
+ .read = readroutine, \
+ .write = writeroutine, \
+ .llseek = generic_file_llseek, \
+ }, \
+}
+
+static const struct counter_info cntr_ops[] = {
+ DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+ DEBUGFS_OPS("counters", dev_counters_read, NULL),
+ DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+ DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+ DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write),
+ DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write),
+ DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+ DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write),
+ DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write),
+};
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+ char name[sizeof("port0counters") + 1];
+ char link[10];
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ struct hfi1_pportdata *ppd;
+ int unit = dd->unit;
+ int i, j;
+
+ if (!hfi1_dbg_root)
+ return;
+ snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+ snprintf(link, sizeof(link), "%d", unit);
+ ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+ if (!ibd->hfi1_ibdev_dbg) {
+ pr_warn("create of %s failed\n", name);
+ return;
+ }
+ ibd->hfi1_ibdev_link =
+ debugfs_create_symlink(link, hfi1_dbg_root, name);
+ if (!ibd->hfi1_ibdev_link) {
+ pr_warn("create of %s symlink failed\n", name);
+ return;
+ }
+ DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+ /* dev counter files */
+ for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+ DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+ ibd->hfi1_ibdev_dbg,
+ dd,
+ &cntr_ops[i].ops, S_IRUGO);
+ /* per port files */
+ for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+ for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+ snprintf(name,
+ sizeof(name),
+ port_cntr_ops[i].name,
+ j + 1);
+ DEBUGFS_FILE_CREATE(name,
+ ibd->hfi1_ibdev_dbg,
+ ppd,
+ &port_cntr_ops[i].ops,
+ port_cntr_ops[i].ops.write == NULL ?
+ S_IRUGO : S_IRUGO|S_IWUSR);
+ }
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+ if (!hfi1_dbg_root)
+ goto out;
+ debugfs_remove(ibd->hfi1_ibdev_link);
+ debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+ ibd->hfi1_ibdev_dbg = NULL;
+ synchronize_rcu();
+}
+
+/*
+ * driver stats field names, one line per stat, single string. Used by
+ * programs like hfistats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * if hfi1_ib_stats changes, this needs to change. Names need to be
+ * 12 chars or less (w/o newline), for proper display by hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+ /* must be element 0*/
+ "KernIntr",
+ "ErrorIntr",
+ "Tx_Errs",
+ "Rcv_Errs",
+ "H/W_Errs",
+ "NoPIOBufs",
+ "CtxtsOpen",
+ "RcvLen_Errs",
+ "EgrBufFull",
+ "EgrHdrFull"
+};
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+ rcu_read_lock();
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+ struct seq_file *s,
+ void *v,
+ loff_t *pos)
+{
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+
+ seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+ rcu_read_lock();
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static u64 hfi1_sps_ints(void)
+{
+ unsigned long flags;
+ struct hfi1_devdata *dd;
+ u64 sps_ints = 0;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ sps_ints += get_all_cpu_total(dd->int_counter);
+ }
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+ char *buffer;
+ u64 *stats = (u64 *)&hfi1_stats;
+ size_t sz = seq_get_buf(s, &buffer);
+
+ if (sz < sizeof(u64))
+ return SEQ_SKIP;
+ /* special case for interrupts */
+ if (*spos == 0)
+ *(u64 *)buffer = hfi1_sps_ints();
+ else
+ *(u64 *)buffer = stats[*spos];
+ seq_commit(s, sizeof(u64));
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+ hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
+ if (!hfi1_dbg_root)
+ pr_warn("init of debugfs failed\n");
+ DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+ DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+ debugfs_remove_recursive(hfi1_dbg_root);
+ hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
new file mode 100644
index 000000000000..92d6fe146714
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/debugfs.h
@@ -0,0 +1,78 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_init(void)
+{
+}
+
+void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
new file mode 100644
index 000000000000..bc26a5392712
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.c
@@ -0,0 +1,184 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static struct class *user_class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev *cdev, struct device **devp,
+ bool user_accessible)
+{
+ const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+ struct device *device = NULL;
+ int ret;
+
+ cdev_init(cdev, fops);
+ cdev->owner = THIS_MODULE;
+ kobject_set_name(&cdev->kobj, name);
+
+ ret = cdev_add(cdev, dev, 1);
+ if (ret < 0) {
+ pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto done;
+ }
+
+ if (user_accessible)
+ device = device_create(user_class, NULL, dev, NULL, "%s", name);
+ else
+ device = device_create(class, NULL, dev, NULL, "%s", name);
+
+ if (!IS_ERR(device))
+ goto done;
+ ret = PTR_ERR(device);
+ device = NULL;
+ pr_err("Could not create device for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ cdev_del(cdev);
+done:
+ *devp = device;
+ return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+ struct device *device = *devp;
+
+ if (device) {
+ device_unregister(device);
+ *devp = NULL;
+
+ cdev_del(cdev);
+ }
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+ return hfi1_class_name;
+}
+
+static char *hfi1_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0600;
+ return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+static const char *hfi1_class_name_user = "hfi1_user";
+const char *class_name_user(void)
+{
+ return hfi1_class_name_user;
+}
+
+static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+int __init dev_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+ if (ret < 0) {
+ pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+ goto done;
+ }
+
+ class = class_create(THIS_MODULE, class_name());
+ if (IS_ERR(class)) {
+ ret = PTR_ERR(class);
+ pr_err("Could not create device class (err %d)\n", -ret);
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+ goto done;
+ }
+ class->devnode = hfi1_devnode;
+
+ user_class = class_create(THIS_MODULE, class_name_user());
+ if (IS_ERR(user_class)) {
+ ret = PTR_ERR(user_class);
+ pr_err("Could not create device class for user accessible files (err %d)\n",
+ -ret);
+ class_destroy(class);
+ class = NULL;
+ user_class = NULL;
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+ goto done;
+ }
+ user_class->devnode = hfi1_user_devnode;
+
+done:
+ return ret;
+}
+
+void dev_cleanup(void)
+{
+ class_destroy(class);
+ class = NULL;
+
+ class_destroy(user_class);
+ user_class = NULL;
+
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
new file mode 100644
index 000000000000..2850ff739d81
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/device.h
@@ -0,0 +1,62 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+int hfi1_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev *cdev, struct device **devp,
+ bool user_accessible);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+const char *class_name(void);
+int __init dev_init(void);
+void dev_cleanup(void);
+
+#endif /* _HFI1_DEVICE_H */
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
new file mode 100644
index 000000000000..3e8d5ac4c626
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/diag.c
@@ -0,0 +1,1873 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains support for diagnostic functions. It is accessed by
+ * opening the hfi1_diag device, normally minor number 129. Diagnostic use
+ * of the chip may render the chip or board unusable until the driver
+ * is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <rdma/ib_smi.h>
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+#define snoop_dbg(fmt, ...) \
+ hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
+
+/* Snoop option mask */
+#define SNOOP_DROP_SEND (1 << 0)
+#define SNOOP_USE_METADATA (1 << 1)
+
+static u8 snoop_flags;
+
+/*
+ * Extract packet length from LRH header.
+ * Why & 0x7FF? Because len is only 11 bits in case it wasn't 0'd we throw the
+ * bogus bits away. This is in Dwords so multiply by 4 to get size in bytes
+ */
+#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2)
+
+enum hfi1_filter_status {
+ HFI1_FILTER_HIT,
+ HFI1_FILTER_ERR,
+ HFI1_FILTER_MISS
+};
+
+/* snoop processing functions */
+rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
+ [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
+ [RHF_RCV_TYPE_EAGER] = snoop_recv_handler,
+ [RHF_RCV_TYPE_IB] = snoop_recv_handler,
+ [RHF_RCV_TYPE_ERROR] = snoop_recv_handler,
+ [RHF_RCV_TYPE_BYPASS] = snoop_recv_handler,
+ [RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
+ [RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
+ [RHF_RCV_TYPE_INVALID7] = process_receive_invalid
+};
+
+/* Snoop packet structure */
+struct snoop_packet {
+ struct list_head list;
+ u32 total_len;
+ u8 data[];
+};
+
+/* Do not make these an enum or it will blow up the capture_md */
+#define PKT_DIR_EGRESS 0x0
+#define PKT_DIR_INGRESS 0x1
+
+/* Packet capture metadata returned to the user with the packet. */
+struct capture_md {
+ u8 port;
+ u8 dir;
+ u8 reserved[6];
+ union {
+ u64 pbc;
+ u64 rhf;
+ } u;
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev diagpkt_cdev;
+static struct device *diagpkt_device;
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+ .owner = THIS_MODULE,
+ .write = diagpkt_write,
+ .llseek = noop_llseek,
+};
+
+/*
+ * This is used for communication with user space for snoop extended IOCTLs
+ */
+struct hfi1_link_info {
+ __be64 node_guid;
+ u8 port_mode;
+ u8 port_state;
+ u16 link_speed_active;
+ u16 link_width_active;
+ u16 vl15_init;
+ u8 port_number;
+ /*
+ * Add padding to make this a full IB SMP payload. Note: changing the
+ * size of this structure will make the IOCTLs created with _IOWR
+ * change.
+ * Be sure to run tests on all IOCTLs when making changes to this
+ * structure.
+ */
+ u8 res[47];
+};
+
+/*
+ * This starts our ioctl sequence numbers *way* off from the ones
+ * defined in ib_core.
+ */
+#define SNOOP_CAPTURE_VERSION 0x1
+
+#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl-number.txt */
+#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
+#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
+
+#define HFI1_SNOOP_IOCGETLINKSTATE \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
+#define HFI1_SNOOP_IOCSETLINKSTATE \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+1)
+#define HFI1_SNOOP_IOCCLEARQUEUE \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+2)
+#define HFI1_SNOOP_IOCCLEARFILTER \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+3)
+#define HFI1_SNOOP_IOCSETFILTER \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+4)
+#define HFI1_SNOOP_IOCGETVERSION \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+5)
+#define HFI1_SNOOP_IOCSET_OPTS \
+ _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6)
+
+/*
+ * These offsets +6/+7 could change, but these are already known and used
+ * IOCTL numbers so don't change them without a good reason.
+ */
+#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
+ _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6, \
+ struct hfi1_link_info)
+#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
+ _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+7, \
+ struct hfi1_link_info)
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp);
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+ size_t pkt_len, loff_t *off);
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
+static unsigned int hfi1_snoop_poll(struct file *fp,
+ struct poll_table_struct *wait);
+static int hfi1_snoop_release(struct inode *in, struct file *fp);
+
+struct hfi1_packet_filter_command {
+ int opcode;
+ int length;
+ void *value_ptr;
+};
+
+/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
+#define HFI1_SNOOP_INGRESS 0x1
+#define HFI1_SNOOP_EGRESS 0x2
+
+enum hfi1_packet_filter_opcodes {
+ FILTER_BY_LID,
+ FILTER_BY_DLID,
+ FILTER_BY_MAD_MGMT_CLASS,
+ FILTER_BY_QP_NUMBER,
+ FILTER_BY_PKT_TYPE,
+ FILTER_BY_SERVICE_LEVEL,
+ FILTER_BY_PKEY,
+ FILTER_BY_DIRECTION,
+};
+
+static const struct file_operations snoop_file_ops = {
+ .owner = THIS_MODULE,
+ .open = hfi1_snoop_open,
+ .read = hfi1_snoop_read,
+ .unlocked_ioctl = hfi1_ioctl,
+ .poll = hfi1_snoop_poll,
+ .write = hfi1_snoop_write,
+ .release = hfi1_snoop_release
+};
+
+struct hfi1_filter_array {
+ int (*filter)(void *, void *, void *);
+};
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+ void *value);
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+ void *value);
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+ void *value);
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
+
+static struct hfi1_filter_array hfi1_filters[] = {
+ { hfi1_filter_lid },
+ { hfi1_filter_dlid },
+ { hfi1_filter_mad_mgmt_class },
+ { hfi1_filter_qp_number },
+ { hfi1_filter_ibpacket_type },
+ { hfi1_filter_ib_service_level },
+ { hfi1_filter_ib_pkey },
+ { hfi1_filter_direction },
+};
+
+#define HFI1_MAX_FILTERS ARRAY_SIZE(hfi1_filters)
+#define HFI1_DIAG_MINOR_BASE 129
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
+
+int hfi1_diag_add(struct hfi1_devdata *dd)
+{
+ char name[16];
+ int ret = 0;
+
+ snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
+ dd->unit);
+ /*
+ * Do this for each device as opposed to the normal diagpkt
+ * interface which is one per host
+ */
+ ret = hfi1_snoop_add(dd, name);
+ if (ret)
+ dd_dev_err(dd, "Unable to init snoop/capture device");
+
+ snprintf(name, sizeof(name), "%s_diagpkt", class_name());
+ if (atomic_inc_return(&diagpkt_count) == 1) {
+ ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
+ &diagpkt_file_ops, &diagpkt_cdev,
+ &diagpkt_device, false);
+ }
+
+ return ret;
+}
+
+/* this must be called w/ dd->snoop_in_lock held */
+static void drain_snoop_list(struct list_head *queue)
+{
+ struct list_head *pos, *q;
+ struct snoop_packet *packet;
+
+ list_for_each_safe(pos, q, queue) {
+ packet = list_entry(pos, struct snoop_packet, list);
+ list_del(pos);
+ kfree(packet);
+ }
+}
+
+static void hfi1_snoop_remove(struct hfi1_devdata *dd)
+{
+ unsigned long flags = 0;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+ drain_snoop_list(&dd->hfi1_snoop.queue);
+ hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+}
+
+void hfi1_diag_remove(struct hfi1_devdata *dd)
+{
+
+ hfi1_snoop_remove(dd);
+ if (atomic_dec_and_test(&diagpkt_count))
+ hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
+ hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
+}
+
+
+/*
+ * Allocated structure shared between the credit return mechanism and
+ * diagpkt_send().
+ */
+struct diagpkt_wait {
+ struct completion credits_returned;
+ int code;
+ atomic_t count;
+};
+
+/*
+ * When each side is finished with the structure, they call this.
+ * The last user frees the structure.
+ */
+static void put_diagpkt_wait(struct diagpkt_wait *wait)
+{
+ if (atomic_dec_and_test(&wait->count))
+ kfree(wait);
+}
+
+/*
+ * Callback from the credit return code. Set the complete, which
+ * will let diapkt_send() continue.
+ */
+static void diagpkt_complete(void *arg, int code)
+{
+ struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
+
+ wait->code = code;
+ complete(&wait->credits_returned);
+ put_diagpkt_wait(wait); /* finished with the structure */
+}
+
+/**
+ * diagpkt_send - send a packet
+ * @dp: diag packet descriptor
+ */
+static ssize_t diagpkt_send(struct diag_pkt *dp)
+{
+ struct hfi1_devdata *dd;
+ struct send_context *sc;
+ struct pio_buf *pbuf;
+ u32 *tmpbuf = NULL;
+ ssize_t ret = 0;
+ u32 pkt_len, total_len;
+ pio_release_cb credit_cb = NULL;
+ void *credit_arg = NULL;
+ struct diagpkt_wait *wait = NULL;
+
+ dd = hfi1_lookup(dp->unit);
+ if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
+ ret = -ENODEV;
+ goto bail;
+ }
+ if (!(dd->flags & HFI1_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (dp->version != _DIAG_PKT_VERS) {
+ dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
+ dp->version);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* send count must be an exact number of dwords */
+ if (dp->len & 3) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* there is only port 1 */
+ if (dp->port != 1) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* need a valid context */
+ if (dp->sw_index >= dd->num_send_contexts) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* can only use kernel contexts */
+ if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* must be allocated */
+ sc = dd->send_contexts[dp->sw_index].sc;
+ if (!sc) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* must be enabled */
+ if (!(sc->flags & SCF_ENABLED)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* allocate a buffer and copy the data in */
+ tmpbuf = vmalloc(dp->len);
+ if (!tmpbuf) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmpbuf,
+ (const void __user *) (unsigned long) dp->data,
+ dp->len)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ /*
+ * pkt_len is how much data we have to write, includes header and data.
+ * total_len is length of the packet in Dwords plus the PBC should not
+ * include the CRC.
+ */
+ pkt_len = dp->len >> 2;
+ total_len = pkt_len + 2; /* PBC + packet */
+
+ /* if 0, fill in a default */
+ if (dp->pbc == 0) {
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ hfi1_cdbg(PKT, "Generating PBC");
+ dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
+ } else {
+ hfi1_cdbg(PKT, "Using passed in PBC");
+ }
+
+ hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
+
+ /*
+ * The caller wants to wait until the packet is sent and to
+ * check for errors. The best we can do is wait until
+ * the buffer credits are returned and check if any packet
+ * error has occurred. If there are any late errors, this
+ * could miss it. If there are other senders who generate
+ * an error, this may find it. However, in general, it
+ * should catch most.
+ */
+ if (dp->flags & F_DIAGPKT_WAIT) {
+ /* always force a credit return */
+ dp->pbc |= PBC_CREDIT_RETURN;
+ /* turn on credit return interrupts */
+ sc_add_credit_return_intr(sc);
+ wait = kmalloc(sizeof(*wait), GFP_KERNEL);
+ if (!wait) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ init_completion(&wait->credits_returned);
+ atomic_set(&wait->count, 2);
+ wait->code = PRC_OK;
+
+ credit_cb = diagpkt_complete;
+ credit_arg = wait;
+ }
+
+ pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
+ if (!pbuf) {
+ /*
+ * No send buffer means no credit callback. Undo
+ * the wait set-up that was done above. We free wait
+ * because the callback will never be called.
+ */
+ if (dp->flags & F_DIAGPKT_WAIT) {
+ sc_del_credit_return_intr(sc);
+ kfree(wait);
+ wait = NULL;
+ }
+ ret = -ENOSPC;
+ goto bail;
+ }
+
+ pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
+ /* no flush needed as the HW knows the packet size */
+
+ ret = sizeof(*dp);
+
+ if (dp->flags & F_DIAGPKT_WAIT) {
+ /* wait for credit return */
+ ret = wait_for_completion_interruptible(
+ &wait->credits_returned);
+ /*
+ * If the wait returns an error, the wait was interrupted,
+ * e.g. with a ^C in the user program. The callback is
+ * still pending. This is OK as the wait structure is
+ * kmalloc'ed and the structure will free itself when
+ * all users are done with it.
+ *
+ * A context disable occurs on a send context restart, so
+ * include that in the list of errors below to check for.
+ * NOTE: PRC_FILL_ERR is at best informational and cannot
+ * be depended on.
+ */
+ if (!ret && (((wait->code & PRC_STATUS_ERR)
+ || (wait->code & PRC_FILL_ERR)
+ || (wait->code & PRC_SC_DISABLE))))
+ ret = -EIO;
+
+ put_diagpkt_wait(wait); /* finished with the structure */
+ sc_del_credit_return_intr(sc);
+ }
+
+bail:
+ vfree(tmpbuf);
+ return ret;
+}
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ struct hfi1_devdata *dd;
+ struct send_context *sc;
+ u8 vl;
+
+ struct diag_pkt dp;
+
+ if (count != sizeof(dp))
+ return -EINVAL;
+
+ if (copy_from_user(&dp, data, sizeof(dp)))
+ return -EFAULT;
+
+ /*
+ * The Send Context is derived from the PbcVL value
+ * if PBC is populated
+ */
+ if (dp.pbc) {
+ dd = hfi1_lookup(dp.unit);
+ if (dd == NULL)
+ return -ENODEV;
+ vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
+ sc = dd->vld[vl].sc;
+ if (sc) {
+ dp.sw_index = sc->sw_index;
+ hfi1_cdbg(
+ PKT,
+ "Packet sent over VL %d via Send Context %u(%u)",
+ vl, sc->sw_index, sc->hw_context);
+ }
+ }
+
+ return diagpkt_send(&dp);
+}
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
+{
+ int ret = 0;
+
+ dd->hfi1_snoop.mode_flag = 0;
+ spin_lock_init(&dd->hfi1_snoop.snoop_lock);
+ INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
+ init_waitqueue_head(&dd->hfi1_snoop.waitq);
+
+ ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
+ &snoop_file_ops,
+ &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev,
+ false);
+
+ if (ret) {
+ dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
+ hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
+ &dd->hfi1_snoop.class_dev);
+ }
+
+ return ret;
+}
+
+static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
+{
+ int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
+ struct hfi1_devdata *dd = NULL;
+
+ dd = hfi1_lookup(unit);
+ return dd;
+
+}
+
+/* clear or restore send context integrity checks */
+static void adjust_integrity_checks(struct hfi1_devdata *dd)
+{
+ struct send_context *sc;
+ unsigned long sc_flags;
+ int i;
+
+ spin_lock_irqsave(&dd->sc_lock, sc_flags);
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ int enable;
+
+ sc = dd->send_contexts[i].sc;
+
+ if (!sc)
+ continue; /* not allocated */
+
+ enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+ dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
+
+ set_pio_integrity(sc);
+
+ if (enable) /* take HFI_CAP_* flags into account */
+ hfi1_init_ctxt(sc);
+ }
+ spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
+}
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp)
+{
+ int ret;
+ int mode_flag = 0;
+ unsigned long flags = 0;
+ struct hfi1_devdata *dd;
+ struct list_head *queue;
+
+ mutex_lock(&hfi1_mutex);
+
+ dd = hfi1_dd_from_sc_inode(in);
+ if (dd == NULL) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ /*
+ * File mode determines snoop or capture. Some existing user
+ * applications expect the capture device to be able to be opened RDWR
+ * because they expect a dedicated capture device. For this reason we
+ * support a module param to force capture mode even if the file open
+ * mode matches snoop.
+ */
+ if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
+ snoop_dbg("Capture Enabled");
+ mode_flag = HFI1_PORT_CAPTURE_MODE;
+ } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
+ snoop_dbg("Snoop Enabled");
+ mode_flag = HFI1_PORT_SNOOP_MODE;
+ } else {
+ snoop_dbg("Invalid");
+ ret = -EINVAL;
+ goto bail;
+ }
+ queue = &dd->hfi1_snoop.queue;
+
+ /*
+ * We are not supporting snoop and capture at the same time.
+ */
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+ if (dd->hfi1_snoop.mode_flag) {
+ ret = -EBUSY;
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ goto bail;
+ }
+
+ dd->hfi1_snoop.mode_flag = mode_flag;
+ drain_snoop_list(queue);
+
+ dd->hfi1_snoop.filter_callback = NULL;
+ dd->hfi1_snoop.filter_value = NULL;
+
+ /*
+ * Send side packet integrity checks are not helpful when snooping so
+ * disable and re-enable when we stop snooping.
+ */
+ if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+ /* clear after snoop mode is on */
+ adjust_integrity_checks(dd); /* clear */
+
+ /*
+ * We also do not want to be doing the DLID LMC check for
+ * ingressed packets.
+ */
+ dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
+ write_csr(dd, DCC_CFG_PORT_CONFIG1,
+ (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
+ }
+
+ /*
+ * As soon as we set these function pointers the recv and send handlers
+ * are active. This is a race condition so we must make sure to drain
+ * the queue and init filter values above. Technically we should add
+ * locking here but all that will happen is on recv a packet will get
+ * allocated and get stuck on the snoop_lock before getting added to the
+ * queue. Same goes for send.
+ */
+ dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
+ dd->process_pio_send = snoop_send_pio_handler;
+ dd->process_dma_send = snoop_send_pio_handler;
+ dd->pio_inline_send = snoop_inline_pio_send;
+
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ ret = 0;
+
+bail:
+ mutex_unlock(&hfi1_mutex);
+
+ return ret;
+}
+
+static int hfi1_snoop_release(struct inode *in, struct file *fp)
+{
+ unsigned long flags = 0;
+ struct hfi1_devdata *dd;
+ int mode_flag;
+
+ dd = hfi1_dd_from_sc_inode(in);
+ if (dd == NULL)
+ return -ENODEV;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+ /* clear the snoop mode before re-adjusting send context CSRs */
+ mode_flag = dd->hfi1_snoop.mode_flag;
+ dd->hfi1_snoop.mode_flag = 0;
+
+ /*
+ * Drain the queue and clear the filters we are done with it. Don't
+ * forget to restore the packet integrity checks
+ */
+ drain_snoop_list(&dd->hfi1_snoop.queue);
+ if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+ /* restore after snoop mode is clear */
+ adjust_integrity_checks(dd); /* restore */
+
+ /*
+ * Also should probably reset the DCC_CONFIG1 register for DLID
+ * checking on incoming packets again. Use the value saved when
+ * opening the snoop device.
+ */
+ write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
+ }
+
+ dd->hfi1_snoop.filter_callback = NULL;
+ kfree(dd->hfi1_snoop.filter_value);
+ dd->hfi1_snoop.filter_value = NULL;
+
+ /*
+ * User is done snooping and capturing, return control to the normal
+ * handler. Re-enable SDMA handling.
+ */
+ dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+ dd->process_pio_send = hfi1_verbs_send_pio;
+ dd->process_dma_send = hfi1_verbs_send_dma;
+ dd->pio_inline_send = pio_copy;
+
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+ snoop_dbg("snoop/capture device released");
+
+ return 0;
+}
+
+static unsigned int hfi1_snoop_poll(struct file *fp,
+ struct poll_table_struct *wait)
+{
+ int ret = 0;
+ unsigned long flags = 0;
+
+ struct hfi1_devdata *dd;
+
+ dd = hfi1_dd_from_sc_inode(fp->f_inode);
+ if (dd == NULL)
+ return -ENODEV;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+ poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
+ if (!list_empty(&dd->hfi1_snoop.queue))
+ ret |= POLLIN | POLLRDNORM;
+
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ return ret;
+
+}
+
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ struct diag_pkt dpkt;
+ struct hfi1_devdata *dd;
+ size_t ret;
+ u8 byte_two, sl, sc5, sc4, vl, byte_one;
+ struct send_context *sc;
+ u32 len;
+ u64 pbc;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+
+ dd = hfi1_dd_from_sc_inode(fp->f_inode);
+ if (dd == NULL)
+ return -ENODEV;
+
+ ppd = dd->pport;
+ snoop_dbg("received %lu bytes from user", count);
+
+ memset(&dpkt, 0, sizeof(struct diag_pkt));
+ dpkt.version = _DIAG_PKT_VERS;
+ dpkt.unit = dd->unit;
+ dpkt.port = 1;
+
+ if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
+ /*
+ * We need to generate the PBC and not let diagpkt_send do it,
+ * to do this we need the VL and the length in dwords.
+ * The VL can be determined by using the SL and looking up the
+ * SC. Then the SC can be converted into VL. The exception to
+ * this is those packets which are from an SMI queue pair.
+ * Since we can't detect anything about the QP here we have to
+ * rely on the SC. If its 0xF then we assume its SMI and
+ * do not look at the SL.
+ */
+ if (copy_from_user(&byte_one, data, 1))
+ return -EINVAL;
+
+ if (copy_from_user(&byte_two, data+1, 1))
+ return -EINVAL;
+
+ sc4 = (byte_one >> 4) & 0xf;
+ if (sc4 == 0xF) {
+ snoop_dbg("Detected VL15 packet ignoring SL in packet");
+ vl = sc4;
+ } else {
+ sl = (byte_two >> 4) & 0xf;
+ ibp = to_iport(&dd->verbs_dev.ibdev, 1);
+ sc5 = ibp->sl_to_sc[sl];
+ vl = sc_to_vlt(dd, sc5);
+ if (vl != sc4) {
+ snoop_dbg("VL %d does not match SC %d of packet",
+ vl, sc4);
+ return -EINVAL;
+ }
+ }
+
+ sc = dd->vld[vl].sc; /* Look up the context based on VL */
+ if (sc) {
+ dpkt.sw_index = sc->sw_index;
+ snoop_dbg("Sending on context %u(%u)", sc->sw_index,
+ sc->hw_context);
+ } else {
+ snoop_dbg("Could not find context for vl %d", vl);
+ return -EINVAL;
+ }
+
+ len = (count >> 2) + 2; /* Add in PBC */
+ pbc = create_pbc(ppd, 0, 0, vl, len);
+ } else {
+ if (copy_from_user(&pbc, data, sizeof(pbc)))
+ return -EINVAL;
+ vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
+ sc = dd->vld[vl].sc; /* Look up the context based on VL */
+ if (sc) {
+ dpkt.sw_index = sc->sw_index;
+ } else {
+ snoop_dbg("Could not find context for vl %d", vl);
+ return -EINVAL;
+ }
+ data += sizeof(pbc);
+ count -= sizeof(pbc);
+ }
+ dpkt.len = count;
+ dpkt.data = (unsigned long)data;
+
+ snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
+ (pbc >> 12) & 0xf,
+ (pbc & 0xfff));
+
+ dpkt.pbc = pbc;
+ ret = diagpkt_send(&dpkt);
+ /*
+ * diagpkt_send only returns number of bytes in the diagpkt so patch
+ * that up here before returning.
+ */
+ if (ret == sizeof(dpkt))
+ return count;
+
+ return ret;
+}
+
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+ size_t pkt_len, loff_t *off)
+{
+ ssize_t ret = 0;
+ unsigned long flags = 0;
+ struct snoop_packet *packet = NULL;
+ struct hfi1_devdata *dd;
+
+ dd = hfi1_dd_from_sc_inode(fp->f_inode);
+ if (dd == NULL)
+ return -ENODEV;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+ while (list_empty(&dd->hfi1_snoop.queue)) {
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+ if (fp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(
+ dd->hfi1_snoop.waitq,
+ !list_empty(&dd->hfi1_snoop.queue)))
+ return -EINTR;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+ }
+
+ if (!list_empty(&dd->hfi1_snoop.queue)) {
+ packet = list_entry(dd->hfi1_snoop.queue.next,
+ struct snoop_packet, list);
+ list_del(&packet->list);
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ if (pkt_len >= packet->total_len) {
+ if (copy_to_user(data, packet->data,
+ packet->total_len))
+ ret = -EFAULT;
+ else
+ ret = packet->total_len;
+ } else
+ ret = -EINVAL;
+
+ kfree(packet);
+ } else
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+ return ret;
+}
+
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+ struct hfi1_devdata *dd;
+ void *filter_value = NULL;
+ long ret = 0;
+ int value = 0;
+ u8 physState = 0;
+ u8 linkState = 0;
+ u16 devState = 0;
+ unsigned long flags = 0;
+ unsigned long *argp = NULL;
+ struct hfi1_packet_filter_command filter_cmd = {0};
+ int mode_flag = 0;
+ struct hfi1_pportdata *ppd = NULL;
+ unsigned int index;
+ struct hfi1_link_info link_info;
+
+ dd = hfi1_dd_from_sc_inode(fp->f_inode);
+ if (dd == NULL)
+ return -ENODEV;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+ mode_flag = dd->hfi1_snoop.mode_flag;
+
+ if (((_IOC_DIR(cmd) & _IOC_READ)
+ && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)))
+ || ((_IOC_DIR(cmd) & _IOC_WRITE)
+ && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) {
+ ret = -EFAULT;
+ } else if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ } else if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
+ (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
+ (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
+ (cmd != HFI1_SNOOP_IOCSETFILTER)) {
+ /* Capture devices are allowed only 3 operations
+ * 1.Clear capture queue
+ * 2.Clear capture filter
+ * 3.Set capture filter
+ * Other are invalid.
+ */
+ ret = -EINVAL;
+ } else {
+ switch (cmd) {
+ case HFI1_SNOOP_IOCSETLINKSTATE:
+ snoop_dbg("HFI1_SNOOP_IOCSETLINKSTATE is not valid");
+ ret = -EINVAL;
+ break;
+
+ case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
+ memset(&link_info, 0, sizeof(link_info));
+
+ if (copy_from_user(&link_info,
+ (struct hfi1_link_info __user *)arg,
+ sizeof(link_info)))
+ ret = -EFAULT;
+
+ value = link_info.port_state;
+ index = link_info.port_number;
+ if (index > dd->num_pports - 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ppd = &dd->pport[index];
+ if (!ppd) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* What we want to transition to */
+ physState = (value >> 4) & 0xF;
+ linkState = value & 0xF;
+ snoop_dbg("Setting link state 0x%x", value);
+
+ switch (linkState) {
+ case IB_PORT_NOP:
+ if (physState == 0)
+ break;
+ /* fall through */
+ case IB_PORT_DOWN:
+ switch (physState) {
+ case 0:
+ devState = HLS_DN_DOWNDEF;
+ break;
+ case 2:
+ devState = HLS_DN_POLL;
+ break;
+ case 3:
+ devState = HLS_DN_DISABLE;
+ break;
+ default:
+ ret = -EINVAL;
+ goto done;
+ }
+ ret = set_link_state(ppd, devState);
+ break;
+ case IB_PORT_ARMED:
+ ret = set_link_state(ppd, HLS_UP_ARMED);
+ if (!ret)
+ send_idle_sma(dd, SMA_IDLE_ARM);
+ break;
+ case IB_PORT_ACTIVE:
+ ret = set_link_state(ppd, HLS_UP_ACTIVE);
+ if (!ret)
+ send_idle_sma(dd, SMA_IDLE_ACTIVE);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ break;
+ /* fall through */
+ case HFI1_SNOOP_IOCGETLINKSTATE:
+ case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
+ if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
+ memset(&link_info, 0, sizeof(link_info));
+ if (copy_from_user(&link_info,
+ (struct hfi1_link_info __user *)arg,
+ sizeof(link_info)))
+ ret = -EFAULT;
+ index = link_info.port_number;
+ } else {
+ ret = __get_user(index, (int __user *) arg);
+ if (ret != 0)
+ break;
+ }
+
+ if (index > dd->num_pports - 1) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ppd = &dd->pport[index];
+ if (!ppd) {
+ ret = -EINVAL;
+ break;
+ }
+ value = hfi1_ibphys_portstate(ppd);
+ value <<= 4;
+ value |= driver_lstate(ppd);
+
+ snoop_dbg("Link port | Link State: %d", value);
+
+ if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
+ (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
+ link_info.port_state = value;
+ link_info.node_guid = cpu_to_be64(ppd->guid);
+ link_info.link_speed_active =
+ ppd->link_speed_active;
+ link_info.link_width_active =
+ ppd->link_width_active;
+ if (copy_to_user(
+ (struct hfi1_link_info __user *)arg,
+ &link_info, sizeof(link_info)))
+ ret = -EFAULT;
+ } else {
+ ret = __put_user(value, (int __user *)arg);
+ }
+ break;
+
+ case HFI1_SNOOP_IOCCLEARQUEUE:
+ snoop_dbg("Clearing snoop queue");
+ drain_snoop_list(&dd->hfi1_snoop.queue);
+ break;
+
+ case HFI1_SNOOP_IOCCLEARFILTER:
+ snoop_dbg("Clearing filter");
+ if (dd->hfi1_snoop.filter_callback) {
+ /* Drain packets first */
+ drain_snoop_list(&dd->hfi1_snoop.queue);
+ dd->hfi1_snoop.filter_callback = NULL;
+ }
+ kfree(dd->hfi1_snoop.filter_value);
+ dd->hfi1_snoop.filter_value = NULL;
+ break;
+
+ case HFI1_SNOOP_IOCSETFILTER:
+ snoop_dbg("Setting filter");
+ /* just copy command structure */
+ argp = (unsigned long *)arg;
+ if (copy_from_user(&filter_cmd, (void __user *)argp,
+ sizeof(filter_cmd))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
+ pr_alert("Invalid opcode in request\n");
+ ret = -EINVAL;
+ break;
+ }
+
+ snoop_dbg("Opcode %d Len %d Ptr %p",
+ filter_cmd.opcode, filter_cmd.length,
+ filter_cmd.value_ptr);
+
+ filter_value = kzalloc(
+ filter_cmd.length * sizeof(u8),
+ GFP_KERNEL);
+ if (!filter_value) {
+ pr_alert("Not enough memory\n");
+ ret = -ENOMEM;
+ break;
+ }
+ /* copy remaining data from userspace */
+ if (copy_from_user((u8 *)filter_value,
+ (void __user *)filter_cmd.value_ptr,
+ filter_cmd.length)) {
+ kfree(filter_value);
+ ret = -EFAULT;
+ break;
+ }
+ /* Drain packets first */
+ drain_snoop_list(&dd->hfi1_snoop.queue);
+ dd->hfi1_snoop.filter_callback =
+ hfi1_filters[filter_cmd.opcode].filter;
+ /* just in case we see back to back sets */
+ kfree(dd->hfi1_snoop.filter_value);
+ dd->hfi1_snoop.filter_value = filter_value;
+
+ break;
+ case HFI1_SNOOP_IOCGETVERSION:
+ value = SNOOP_CAPTURE_VERSION;
+ snoop_dbg("Getting version: %d", value);
+ ret = __put_user(value, (int __user *)arg);
+ break;
+ case HFI1_SNOOP_IOCSET_OPTS:
+ snoop_flags = 0;
+ ret = __get_user(value, (int __user *) arg);
+ if (ret != 0)
+ break;
+
+ snoop_dbg("Setting snoop option %d", value);
+ if (value & SNOOP_DROP_SEND)
+ snoop_flags |= SNOOP_DROP_SEND;
+ if (value & SNOOP_USE_METADATA)
+ snoop_flags |= SNOOP_USE_METADATA;
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+ }
+done:
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ return ret;
+}
+
+static void snoop_list_add_tail(struct snoop_packet *packet,
+ struct hfi1_devdata *dd)
+{
+ unsigned long flags = 0;
+
+ spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+ if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
+ (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
+ list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
+ snoop_dbg("Added packet to list");
+ }
+
+ /*
+ * Technically we can could have closed the snoop device while waiting
+ * on the above lock and it is gone now. The snoop mode_flag will
+ * prevent us from adding the packet to the queue though.
+ */
+
+ spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+ wake_up_interruptible(&dd->hfi1_snoop.waitq);
+}
+
+static inline int hfi1_filter_check(void *val, const char *msg)
+{
+ if (!val) {
+ snoop_dbg("Error invalid %s value for filter", msg);
+ return HFI1_FILTER_ERR;
+ }
+ return 0;
+}
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
+{
+ struct hfi1_ib_header *hdr;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
+ return HFI1_FILTER_HIT; /* matched */
+
+ return HFI1_FILTER_MISS; /* Not matched */
+}
+
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
+{
+ struct hfi1_ib_header *hdr;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
+ return HFI1_FILTER_HIT;
+
+ return HFI1_FILTER_MISS;
+}
+
+/* Not valid for outgoing packets, send handler passes null for data*/
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+ void *value)
+{
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ struct ib_smp *smp = NULL;
+ u32 qpn = 0;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(packet_data, "packet_data");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ /* Check for GRH */
+ if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+ else
+ ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+
+ qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
+ if (qpn <= 1) {
+ smp = (struct ib_smp *)packet_data;
+ if (*((u8 *)value) == smp->mgmt_class)
+ return HFI1_FILTER_HIT;
+ else
+ return HFI1_FILTER_MISS;
+ }
+ return HFI1_FILTER_ERR;
+}
+
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
+{
+
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ /* Check for GRH */
+ if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+ else
+ ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+ if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
+ return HFI1_FILTER_HIT;
+
+ return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+ void *value)
+{
+ u32 lnh = 0;
+ u8 opcode = 0;
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ return HFI1_FILTER_ERR;
+
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+ if (*((u8 *)value) == ((opcode >> 5) & 0x7))
+ return HFI1_FILTER_HIT;
+
+ return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+ void *value)
+{
+ struct hfi1_ib_header *hdr;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
+ return HFI1_FILTER_HIT;
+
+ return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
+{
+
+ u32 lnh = 0;
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ int ret;
+
+ ret = hfi1_filter_check(ibhdr, "header");
+ if (ret)
+ return ret;
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ hdr = (struct hfi1_ib_header *)ibhdr;
+
+ lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ return HFI1_FILTER_ERR;
+
+ /* P_key is 16-bit entity, however top most bit indicates
+ * type of membership. 0 for limited and 1 for Full.
+ * Limited members cannot accept information from other
+ * Limited members, but communication is allowed between
+ * every other combination of membership.
+ * Hence we'll omit comparing top-most bit while filtering
+ */
+
+ if ((*(u16 *)value & 0x7FFF) ==
+ ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
+ return HFI1_FILTER_HIT;
+
+ return HFI1_FILTER_MISS;
+}
+
+/*
+ * If packet_data is NULL then this is coming from one of the send functions.
+ * Thus we know if its an ingressed or egressed packet.
+ */
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
+{
+ u8 user_dir = *(u8 *)value;
+ int ret;
+
+ ret = hfi1_filter_check(value, "user");
+ if (ret)
+ return ret;
+
+ if (packet_data) {
+ /* Incoming packet */
+ if (user_dir & HFI1_SNOOP_INGRESS)
+ return HFI1_FILTER_HIT;
+ } else {
+ /* Outgoing packet */
+ if (user_dir & HFI1_SNOOP_EGRESS)
+ return HFI1_FILTER_HIT;
+ }
+
+ return HFI1_FILTER_MISS;
+}
+
+/*
+ * Allocate a snoop packet. The structure that is stored in the ring buffer, not
+ * to be confused with an hfi packet type.
+ */
+static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
+ u32 data_len,
+ u32 md_len)
+{
+
+ struct snoop_packet *packet = NULL;
+
+ packet = kzalloc(sizeof(struct snoop_packet) + hdr_len + data_len
+ + md_len,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (likely(packet))
+ INIT_LIST_HEAD(&packet->list);
+
+
+ return packet;
+}
+
+/*
+ * Instead of having snoop and capture code intermixed with the recv functions,
+ * both the interrupt handler and hfi1_ib_rcv() we are going to hijack the call
+ * and land in here for snoop/capture but if not enabled the call will go
+ * through as before. This gives us a single point to constrain all of the snoop
+ * snoop recv logic. There is nothing special that needs to happen for bypass
+ * packets. This routine should not try to look into the packet. It just copied
+ * it. There is no guarantee for filters when it comes to bypass packets as
+ * there is no specific support. Bottom line is this routine does now even know
+ * what a bypass packet is.
+ */
+int snoop_recv_handler(struct hfi1_packet *packet)
+{
+ struct hfi1_pportdata *ppd = packet->rcd->ppd;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ int header_size = packet->hlen;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct snoop_packet *s_packet = NULL;
+ int ret;
+ int snoop_mode = 0;
+ u32 md_len = 0;
+ struct capture_md md;
+
+ snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
+ data);
+
+ trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
+ data);
+
+ if (!ppd->dd->hfi1_snoop.filter_callback) {
+ snoop_dbg("filter not set");
+ ret = HFI1_FILTER_HIT;
+ } else {
+ ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
+ ppd->dd->hfi1_snoop.filter_value);
+ }
+
+ switch (ret) {
+ case HFI1_FILTER_ERR:
+ snoop_dbg("Error in filter call");
+ break;
+ case HFI1_FILTER_MISS:
+ snoop_dbg("Filter Miss");
+ break;
+ case HFI1_FILTER_HIT:
+
+ if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+ snoop_mode = 1;
+ if ((snoop_mode == 0) ||
+ unlikely(snoop_flags & SNOOP_USE_METADATA))
+ md_len = sizeof(struct capture_md);
+
+
+ s_packet = allocate_snoop_packet(header_size,
+ tlen - header_size,
+ md_len);
+
+ if (unlikely(s_packet == NULL)) {
+ dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+ break;
+ }
+
+ if (md_len > 0) {
+ memset(&md, 0, sizeof(struct capture_md));
+ md.port = 1;
+ md.dir = PKT_DIR_INGRESS;
+ md.u.rhf = packet->rhf;
+ memcpy(s_packet->data, &md, md_len);
+ }
+
+ /* We should always have a header */
+ if (hdr) {
+ memcpy(s_packet->data + md_len, hdr, header_size);
+ } else {
+ dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
+ kfree(s_packet);
+ break;
+ }
+
+ /*
+ * Packets with no data are possible. If there is no data needed
+ * to take care of the last 4 bytes which are normally included
+ * with data buffers and are included in tlen. Since we kzalloc
+ * the buffer we do not need to set any values but if we decide
+ * not to use kzalloc we should zero them.
+ */
+ if (data)
+ memcpy(s_packet->data + header_size + md_len, data,
+ tlen - header_size);
+
+ s_packet->total_len = tlen + md_len;
+ snoop_list_add_tail(s_packet, ppd->dd);
+
+ /*
+ * If we are snooping the packet not capturing then throw away
+ * after adding to the list.
+ */
+ snoop_dbg("Capturing packet");
+ if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
+ snoop_dbg("Throwing packet away");
+ /*
+ * If we are dropping the packet we still may need to
+ * handle the case where error flags are set, this is
+ * normally done by the type specific handler but that
+ * won't be called in this case.
+ */
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ /* throw the packet on the floor */
+ return RHF_RCV_CONTINUE;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * We do not care what type of packet came in here - just pass it off
+ * to the normal handler.
+ */
+ return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
+ (packet);
+}
+
+/*
+ * Handle snooping and capturing packets when sdma is being used.
+ */
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc)
+{
+ pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n");
+ snoop_dbg("Unsupported Operation");
+ return hfi1_verbs_send_dma(qp, ibhdr, hdrwords, ss, len, plen, dwords,
+ 0);
+}
+
+/*
+ * Handle snooping and capturing packets when pio is being used. Does not handle
+ * bypass packets. The only way to send a bypass packet currently is to use the
+ * diagpkt interface. When that interface is enable snoop/capture is not.
+ */
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct snoop_packet *s_packet = NULL;
+ u32 *hdr = (u32 *)&ahdr->ibh;
+ u32 length = 0;
+ struct hfi1_sge_state temp_ss;
+ void *data = NULL;
+ void *data_start = NULL;
+ int ret;
+ int snoop_mode = 0;
+ int md_len = 0;
+ struct capture_md md;
+ u32 vl;
+ u32 hdr_len = hdrwords << 2;
+ u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh);
+
+ md.u.pbc = 0;
+
+ snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
+ hdrwords, len, plen, dwords, tlen);
+ if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+ snoop_mode = 1;
+ if ((snoop_mode == 0) ||
+ unlikely(snoop_flags & SNOOP_USE_METADATA))
+ md_len = sizeof(struct capture_md);
+
+ /* not using ss->total_len as arg 2 b/c that does not count CRC */
+ s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
+
+ if (unlikely(s_packet == NULL)) {
+ dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+ goto out;
+ }
+
+ s_packet->total_len = tlen + md_len;
+
+ if (md_len > 0) {
+ memset(&md, 0, sizeof(struct capture_md));
+ md.port = 1;
+ md.dir = PKT_DIR_EGRESS;
+ if (likely(pbc == 0)) {
+ vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12;
+ md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
+ } else {
+ md.u.pbc = 0;
+ }
+ memcpy(s_packet->data, &md, md_len);
+ } else {
+ md.u.pbc = pbc;
+ }
+
+ /* Copy header */
+ if (likely(hdr)) {
+ memcpy(s_packet->data + md_len, hdr, hdr_len);
+ } else {
+ dd_dev_err(ppd->dd,
+ "Unable to copy header to snoop/capture packet\n");
+ kfree(s_packet);
+ goto out;
+ }
+
+ if (ss) {
+ data = s_packet->data + hdr_len + md_len;
+ data_start = data;
+
+ /*
+ * Copy SGE State
+ * The update_sge() function below will not modify the
+ * individual SGEs in the array. It will make a copy each time
+ * and operate on that. So we only need to copy this instance
+ * and it won't impact PIO.
+ */
+ temp_ss = *ss;
+ length = len;
+
+ snoop_dbg("Need to copy %d bytes", length);
+ while (length) {
+ void *addr = temp_ss.sge.vaddr;
+ u32 slen = temp_ss.sge.length;
+
+ if (slen > length) {
+ slen = length;
+ snoop_dbg("slen %d > len %d", slen, length);
+ }
+ snoop_dbg("copy %d to %p", slen, addr);
+ memcpy(data, addr, slen);
+ update_sge(&temp_ss, slen);
+ length -= slen;
+ data += slen;
+ snoop_dbg("data is now %p bytes left %d", data, length);
+ }
+ snoop_dbg("Completed SGE copy");
+ }
+
+ /*
+ * Why do the filter check down here? Because the event tracing has its
+ * own filtering and we need to have the walked the SGE list.
+ */
+ if (!ppd->dd->hfi1_snoop.filter_callback) {
+ snoop_dbg("filter not set\n");
+ ret = HFI1_FILTER_HIT;
+ } else {
+ ret = ppd->dd->hfi1_snoop.filter_callback(
+ &ahdr->ibh,
+ NULL,
+ ppd->dd->hfi1_snoop.filter_value);
+ }
+
+ switch (ret) {
+ case HFI1_FILTER_ERR:
+ snoop_dbg("Error in filter call");
+ /* fall through */
+ case HFI1_FILTER_MISS:
+ snoop_dbg("Filter Miss");
+ kfree(s_packet);
+ break;
+ case HFI1_FILTER_HIT:
+ snoop_dbg("Capturing packet");
+ snoop_list_add_tail(s_packet, ppd->dd);
+
+ if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
+ (ppd->dd->hfi1_snoop.mode_flag &
+ HFI1_PORT_SNOOP_MODE))) {
+ unsigned long flags;
+
+ snoop_dbg("Dropping packet");
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(
+ qp,
+ qp->s_wqe,
+ IB_WC_SUCCESS);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_rc_send_complete(qp, &ahdr->ibh);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ return 0;
+ }
+ break;
+ default:
+ kfree(s_packet);
+ break;
+ }
+out:
+ return hfi1_verbs_send_pio(qp, ahdr, hdrwords, ss, len, plen, dwords,
+ md.u.pbc);
+}
+
+/*
+ * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently
+ * this can be used anywhere, but the intention is for inline ACKs for RC and
+ * CCA packets. We don't restrict this usage though.
+ */
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+ u64 pbc, const void *from, size_t count)
+{
+ int snoop_mode = 0;
+ int md_len = 0;
+ struct capture_md md;
+ struct snoop_packet *s_packet = NULL;
+
+ /*
+ * count is in dwords so we need to convert to bytes.
+ * We also need to account for CRC which would be tacked on by hardware.
+ */
+ int packet_len = (count << 2) + 4;
+ int ret;
+
+ snoop_dbg("ACK OUT: len %d", packet_len);
+
+ if (!dd->hfi1_snoop.filter_callback) {
+ snoop_dbg("filter not set");
+ ret = HFI1_FILTER_HIT;
+ } else {
+ ret = dd->hfi1_snoop.filter_callback(
+ (struct hfi1_ib_header *)from,
+ NULL,
+ dd->hfi1_snoop.filter_value);
+ }
+
+ switch (ret) {
+ case HFI1_FILTER_ERR:
+ snoop_dbg("Error in filter call");
+ /* fall through */
+ case HFI1_FILTER_MISS:
+ snoop_dbg("Filter Miss");
+ break;
+ case HFI1_FILTER_HIT:
+ snoop_dbg("Capturing packet");
+ if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+ snoop_mode = 1;
+ if ((snoop_mode == 0) ||
+ unlikely(snoop_flags & SNOOP_USE_METADATA))
+ md_len = sizeof(struct capture_md);
+
+ s_packet = allocate_snoop_packet(packet_len, 0, md_len);
+
+ if (unlikely(s_packet == NULL)) {
+ dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
+ goto inline_pio_out;
+ }
+
+ s_packet->total_len = packet_len + md_len;
+
+ /* Fill in the metadata for the packet */
+ if (md_len > 0) {
+ memset(&md, 0, sizeof(struct capture_md));
+ md.port = 1;
+ md.dir = PKT_DIR_EGRESS;
+ md.u.pbc = pbc;
+ memcpy(s_packet->data, &md, md_len);
+ }
+
+ /* Add the packet data which is a single buffer */
+ memcpy(s_packet->data + md_len, from, packet_len);
+
+ snoop_list_add_tail(s_packet, dd);
+
+ if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
+ snoop_dbg("Dropping packet");
+ return;
+ }
+ break;
+ default:
+ break;
+ }
+
+inline_pio_out:
+ pio_copy(dd, pbuf, pbc, from, count);
+
+}
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
new file mode 100644
index 000000000000..e03bd735173c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/dma.c
@@ -0,0 +1,186 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+ size_t size, enum dma_data_direction direction)
+{
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ return (u64) cpu_addr;
+}
+
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ if (offset + size > PAGE_SIZE)
+ return BAD_DMA_ADDRESS;
+
+ addr = (u64) page_address(page);
+ if (addr)
+ addr += offset;
+
+ return addr;
+}
+
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (u64) page_address(sg_page(sg));
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+ return ret;
+}
+
+static void hfi1_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+ if (dma_handle)
+ *dma_handle = (u64) addr;
+ return addr;
+}
+
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+ .mapping_error = hfi1_mapping_error,
+ .map_single = hfi1_dma_map_single,
+ .unmap_single = hfi1_dma_unmap_single,
+ .map_page = hfi1_dma_map_page,
+ .unmap_page = hfi1_dma_unmap_page,
+ .map_sg = hfi1_map_sg,
+ .unmap_sg = hfi1_unmap_sg,
+ .sync_single_for_cpu = hfi1_sync_single_for_cpu,
+ .sync_single_for_device = hfi1_sync_single_for_device,
+ .alloc_coherent = hfi1_dma_alloc_coherent,
+ .free_coherent = hfi1_dma_free_coherent
+};
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
new file mode 100644
index 000000000000..c0a59001e5cd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/driver.c
@@ -0,0 +1,1241 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex); /* general driver use */
+
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+ .set = hfi1_caps_set,
+ .get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RCV is the max # if packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+ int ret = 0;
+ unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
+ cap_mask = *cap_mask_ptr, value, diff,
+ write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+ HFI1_CAP_WRITABLE_MASK);
+
+ ret = kstrtoul(val, 0, &value);
+ if (ret) {
+ pr_warn("Invalid module parameter value for 'cap_mask'\n");
+ goto done;
+ }
+ /* Get the changed bits (except the locked bit) */
+ diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+ /* Remove any bits that are not allowed to change after driver load */
+ if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+ pr_warn("Ignoring non-writable capability bits %#lx\n",
+ diff & ~write_mask);
+ diff &= write_mask;
+ }
+
+ /* Mask off any reserved bits */
+ diff &= ~HFI1_CAP_RESERVED_MASK;
+ /* Clear any previously set and changing bits */
+ cap_mask &= ~diff;
+ /* Update the bits with the new capability */
+ cap_mask |= (value & diff);
+ /* Check for any kernel/user restrictions */
+ diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+ ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+ cap_mask &= ~diff;
+ /* Set the bitmask to the final set */
+ *cap_mask_ptr = cap_mask;
+done:
+ return ret;
+}
+
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+ unsigned long cap_mask = *(unsigned long *)kp->arg;
+
+ cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
+ cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+ return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
+}
+
+const char *get_unit_name(int unit)
+{
+ static char iname[16];
+
+ snprintf(iname, sizeof(iname), DRIVER_NAME"_%u", unit);
+ return iname;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int hfi1_count_active_units(void)
+{
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ unsigned long flags;
+ int pidx, nunits_active = 0;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+ continue;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && ppd->linkup) {
+ nunits_active++;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ return nunits_active;
+}
+
+/*
+ * Return count of all units, optionally return in arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+ int nunits = 0, npresent = 0, nup = 0;
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ int pidx;
+ struct hfi1_pportdata *ppd;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ nunits++;
+ if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+ npresent++;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && ppd->linkup)
+ nup++;
+ }
+ }
+
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+ if (npresentp)
+ *npresentp = npresent;
+ if (nupp)
+ *nupp = nup;
+
+ return nunits;
+}
+
+/*
+ * Get address of eager buffer from it's index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+ u8 *update)
+{
+ u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
+
+ *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
+ return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
+ (offset * RCV_BUF_BLOCK_SIZE));
+}
+
+/*
+ * Validate and encode the a given RcvArray Buffer size.
+ * The function will check whether the given size falls within
+ * allowed size ranges for the respective type and, optionally,
+ * return the proper encoding.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+ if (unlikely(!IS_ALIGNED(size, PAGE_SIZE)))
+ return 0;
+ if (unlikely(size < MIN_EAGER_BUFFER))
+ return 0;
+ if (size >
+ (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+ return 0;
+ if (encoded)
+ *encoded = ilog2(size / PAGE_SIZE) + 1;
+ return 1;
+}
+
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_message_header *rhdr = packet->hdr;
+ u32 rte = rhf_rcv_type_err(packet->rhf);
+ int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+ if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ return;
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u32 qp_num;
+ u32 rcv_flags = 0;
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto drop;
+
+ /* Check for GRH */
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH) {
+ u32 vtf;
+
+ ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ rcv_flags |= HFI1_HAS_GRH;
+ } else
+ goto drop;
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ if (lid < HFI1_MULTICAST_LID_BASE) {
+ struct hfi1_qp *qp;
+
+ rcu_read_lock();
+ qp = hfi1_lookup_qpn(ibp, qp_num);
+ if (!qp) {
+ rcu_read_unlock();
+ goto drop;
+ }
+
+ /*
+ * Handle only RC QPs - for other QP types drop error
+ * packet.
+ */
+ spin_lock(&qp->r_lock);
+
+ /* Check for valid receive state. */
+ if (!(ib_hfi1_state_ops[qp->state] &
+ HFI1_PROCESS_RECV_OK)) {
+ ibp->n_pkt_drops++;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ hfi1_rc_hdrerr(
+ rcd,
+ hdr,
+ rcv_flags,
+ qp);
+ break;
+ default:
+ /* For now don't handle any other QP types */
+ break;
+ }
+
+ spin_unlock(&qp->r_lock);
+ rcu_read_unlock();
+ } /* Unicast QP */
+ } /* Valid packet with TIDErr */
+
+ /* handle "RcvTypeErr" flags */
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ {
+ u32 opcode;
+ void *ebuf = NULL;
+ __be32 *bth = NULL;
+
+ if (rhf_use_egr_bfr(packet->rhf))
+ ebuf = packet->ebuf;
+
+ if (ebuf == NULL)
+ goto drop; /* this should never happen */
+
+ if (lnh == HFI1_LRH_BTH)
+ bth = (__be32 *)ebuf;
+ else if (lnh == HFI1_LRH_GRH)
+ bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+ else
+ goto drop;
+
+ opcode = be32_to_cpu(bth[0]) >> 24;
+ opcode &= 0xff;
+
+ if (opcode == IB_OPCODE_CNP) {
+ /*
+ * Only in pre-B0 h/w is the CNP_OPCODE handled
+ * via this code path (errata 291394).
+ */
+ struct hfi1_qp *qp = NULL;
+ u32 lqpn, rqpn;
+ u16 rlid;
+ u8 svc_type, sl, sc5;
+
+ sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
+ if (rhf_dc_info(packet->rhf))
+ sc5 |= 0x10;
+ sl = ibp->sc_to_sl[sc5];
+
+ lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK;
+ rcu_read_lock();
+ qp = hfi1_lookup_qpn(ibp, lqpn);
+ if (qp == NULL) {
+ rcu_read_unlock();
+ goto drop;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_UD:
+ rlid = 0;
+ rqpn = 0;
+ svc_type = IB_CC_SVCTYPE_UD;
+ break;
+ case IB_QPT_UC:
+ rlid = be16_to_cpu(rhdr->lrh[3]);
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_UC;
+ break;
+ default:
+ goto drop;
+ }
+
+ process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+ rcu_read_unlock();
+ }
+
+ packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+ break;
+ }
+ default:
+ break;
+ }
+
+drop:
+ return;
+}
+
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet)
+{
+
+ packet->rsize = rcd->rcvhdrqentsize; /* words */
+ packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+ packet->rcd = rcd;
+ packet->updegr = 0;
+ packet->etail = -1;
+ packet->rhf_addr = (__le32 *) rcd->rcvhdrq + rcd->head +
+ rcd->dd->rhf_offset;
+ packet->rhf = rhf_to_cpu(packet->rhf_addr);
+ packet->rhqoff = rcd->head;
+ packet->numpkt = 0;
+ packet->rcv_flags = 0;
+}
+
+#ifndef CONFIG_PRESCAN_RXQ
+static void prescan_rxq(struct hfi1_packet *packet) {}
+#else /* CONFIG_PRESCAN_RXQ */
+static int prescan_receive_queue;
+
+static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr,
+ struct hfi1_other_headers *ohdr,
+ u64 rhf, struct ib_grh *grh)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ u32 bth1;
+ u8 sc5, svc_type;
+ int is_fecn, is_becn;
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_UD:
+ svc_type = IB_CC_SVCTYPE_UD;
+ break;
+ case IB_QPT_UC: /* LATER */
+ case IB_QPT_RC: /* LATER */
+ default:
+ return;
+ }
+
+ is_fecn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+ HFI1_FECN_MASK;
+ is_becn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+ HFI1_BECN_MASK;
+
+ sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ if (rhf_dc_info(rhf))
+ sc5 |= 0x10;
+
+ if (is_fecn) {
+ u32 src_qpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+ u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+ u16 dlid = be16_to_cpu(hdr->lrh[1]);
+ u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+ return_cnp(ibp, qp, src_qpn, pkey, dlid, slid, sc5, grh);
+ }
+
+ if (is_becn) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ u8 sl = ibp->sc_to_sl[sc5];
+
+ process_becn(ppd, sl, 0, lqpn, 0, svc_type);
+ }
+
+ /* turn off BECN, or FECN */
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ bth1 &= ~(HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+ bth1 &= ~(HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+ ohdr->bth[1] = cpu_to_be32(bth1);
+}
+
+struct ps_mdata {
+ struct hfi1_ctxtdata *rcd;
+ u32 rsize;
+ u32 maxcnt;
+ u32 ps_head;
+ u32 ps_tail;
+ u32 ps_seq;
+};
+
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ mdata->rcd = rcd;
+ mdata->rsize = packet->rsize;
+ mdata->maxcnt = packet->maxcnt;
+
+ if (rcd->ps_state.initialized == 0) {
+ mdata->ps_head = packet->rhqoff;
+ rcd->ps_state.initialized++;
+ } else
+ mdata->ps_head = rcd->ps_state.ps_head;
+
+ if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+ mdata->ps_tail = packet->hdrqtail;
+ mdata->ps_seq = 0; /* not used with DMA_RTAIL */
+ } else {
+ mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
+ mdata->ps_seq = rcd->seq_cnt;
+ }
+}
+
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf)
+{
+ if (HFI1_CAP_IS_KSET(DMA_RTAIL))
+ return mdata->ps_head == mdata->ps_tail;
+ return mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+static inline void update_ps_mdata(struct ps_mdata *mdata)
+{
+ struct hfi1_ctxtdata *rcd = mdata->rcd;
+
+ mdata->ps_head += mdata->rsize;
+ if (mdata->ps_head > mdata->maxcnt)
+ mdata->ps_head = 0;
+ rcd->ps_state.ps_head = mdata->ps_head;
+ if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+ if (++mdata->ps_seq > 13)
+ mdata->ps_seq = 1;
+ }
+}
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Excplicit Congestion Notifications (FECNs, or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ */
+static void prescan_rxq(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ps_mdata mdata;
+
+ if (!prescan_receive_queue)
+ return;
+
+ init_ps_mdata(&mdata, packet);
+
+ while (1) {
+ struct hfi1_devdata *dd = rcd->dd;
+ struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+ __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head +
+ dd->rhf_offset;
+ struct hfi1_qp *qp;
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr;
+ struct ib_grh *grh = NULL;
+ u64 rhf = rhf_to_cpu(rhf_addr);
+ u32 etype = rhf_rcv_type(rhf), qpn;
+ int is_ecn = 0;
+ u8 lnh;
+
+ if (ps_done(&mdata, rhf))
+ break;
+
+ if (etype != RHF_RCV_TYPE_IB)
+ goto next;
+
+ hdr = (struct hfi1_ib_header *)
+ hfi1_get_msgheader(dd, rhf_addr);
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH) {
+ ohdr = &hdr->u.l.oth;
+ grh = &hdr->u.l.grh;
+ } else
+ goto next; /* just in case */
+
+ is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+ (HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+ is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+ (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+
+ if (!is_ecn)
+ goto next;
+
+ qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ rcu_read_lock();
+ qp = hfi1_lookup_qpn(ibp, qpn);
+
+ if (qp == NULL) {
+ rcu_read_unlock();
+ goto next;
+ }
+
+ process_ecn(qp, hdr, ohdr, rhf, grh);
+ rcu_read_unlock();
+next:
+ update_ps_mdata(&mdata);
+ }
+}
+#endif /* CONFIG_PRESCAN_RXQ */
+
+#define RCV_PKT_OK 0x0
+#define RCV_PKT_MAX 0x1
+
+static inline int process_rcv_packet(struct hfi1_packet *packet)
+{
+ int ret = RCV_PKT_OK;
+
+ packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
+ packet->rhf_addr);
+ packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+ packet->etype = rhf_rcv_type(packet->rhf);
+ /* total length */
+ packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+ /* retrieve eager buffer details */
+ packet->ebuf = NULL;
+ if (rhf_use_egr_bfr(packet->rhf)) {
+ packet->etail = rhf_egr_index(packet->rhf);
+ packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+ &packet->updegr);
+ /*
+ * Prefetch the contents of the eager buffer. It is
+ * OK to send a negative length to prefetch_range().
+ * The +2 is the size of the RHF.
+ */
+ prefetch_range(packet->ebuf,
+ packet->tlen - ((packet->rcd->rcvhdrqentsize -
+ (rhf_hdrq_offset(packet->rhf)+2)) * 4));
+ }
+
+ /*
+ * Call a type specific handler for the packet. We
+ * should be able to trust that etype won't be beyond
+ * the range of valid indexes. If so something is really
+ * wrong and we can probably just let things come
+ * crashing down. There is no need to eat another
+ * comparison in this performance critical code.
+ */
+ packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+ packet->numpkt++;
+
+ /* Set up for the next packet */
+ packet->rhqoff += packet->rsize;
+ if (packet->rhqoff >= packet->maxcnt)
+ packet->rhqoff = 0;
+
+ if (packet->numpkt == MAX_PKT_RECV) {
+ ret = RCV_PKT_MAX;
+ this_cpu_inc(*packet->rcd->dd->rcv_limit);
+ }
+
+ packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff +
+ packet->rcd->dd->rhf_offset;
+ packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+ return ret;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+ /*
+ * Update head regs etc., every 16 packets, if not last pkt,
+ * to help prevent rcvhdrq overflows, when many packets
+ * are processed and queue is nearly full.
+ * Don't request an interrupt for intermediate updates.
+ */
+ if (!last && !(packet->numpkt & 0xf)) {
+ update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+ packet->etail, 0, 0);
+ packet->updegr = 0;
+ }
+ packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+
+ /*
+ * Nothing we need to free for the packet.
+ *
+ * The only thing we need to do is a final update and call for an
+ * interrupt
+ */
+ update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+ packet->etail, rcv_intr_dynamic, packet->numpkt);
+
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+
+ struct hfi1_ctxtdata *rcd;
+ struct hfi1_qp *qp, *nqp;
+
+ rcd = packet->rcd;
+ rcd->head = packet->rhqoff;
+
+ /*
+ * Iterate over all QPs waiting to respond.
+ * The list won't change since the IRQ is only run on one CPU.
+ */
+ list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+ list_del_init(&qp->rspwait);
+ if (qp->r_flags & HFI1_R_RSP_NAK) {
+ qp->r_flags &= ~HFI1_R_RSP_NAK;
+ hfi1_send_rc_ack(rcd, qp, 0);
+ }
+ if (qp->r_flags & HFI1_R_RSP_SEND) {
+ unsigned long flags;
+
+ qp->r_flags &= ~HFI1_R_RSP_SEND;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_hfi1_state_ops[qp->state] &
+ HFI1_PROCESS_OR_FLUSH_SEND)
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+/*
+ * Handle receive interrupts when using the no dma rtail option.
+ */
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd)
+{
+ u32 seq;
+ int last = 0;
+ struct hfi1_packet packet;
+
+ init_packet(rcd, &packet);
+ seq = rhf_rcv_seq(packet.rhf);
+ if (seq != rcd->seq_cnt)
+ goto bail;
+
+ prescan_rxq(&packet);
+
+ while (!last) {
+ last = process_rcv_packet(&packet);
+ seq = rhf_rcv_seq(packet.rhf);
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (seq != rcd->seq_cnt)
+ last = 1;
+ process_rcv_update(last, &packet);
+ }
+ process_rcv_qp_work(&packet);
+bail:
+ finish_packet(&packet);
+}
+
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd)
+{
+ u32 hdrqtail;
+ int last = 0;
+ struct hfi1_packet packet;
+
+ init_packet(rcd, &packet);
+ hdrqtail = get_rcvhdrtail(rcd);
+ if (packet.rhqoff == hdrqtail)
+ goto bail;
+ smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
+
+ prescan_rxq(&packet);
+
+ while (!last) {
+ last = process_rcv_packet(&packet);
+ if (packet.rhqoff == hdrqtail)
+ last = 1;
+ process_rcv_update(last, &packet);
+ }
+ process_rcv_qp_work(&packet);
+bail:
+ finish_packet(&packet);
+
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->first_user_ctxt; i++)
+ dd->rcd[i]->do_interrupt =
+ &handle_receive_interrupt_nodma_rtail;
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->first_user_ctxt; i++)
+ dd->rcd[i]->do_interrupt =
+ &handle_receive_interrupt_dma_rtail;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ */
+void handle_receive_interrupt(struct hfi1_ctxtdata *rcd)
+{
+
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 hdrqtail;
+ int last = 0, needset = 1;
+ struct hfi1_packet packet;
+
+ init_packet(rcd, &packet);
+
+ if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (seq != rcd->seq_cnt)
+ goto bail;
+ hdrqtail = 0;
+ } else {
+ hdrqtail = get_rcvhdrtail(rcd);
+ if (packet.rhqoff == hdrqtail)
+ goto bail;
+ smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
+ }
+
+ prescan_rxq(&packet);
+
+ while (!last) {
+
+ if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet,
+ DROP_PACKET_OFF) == DROP_PACKET_ON)) {
+ dd->do_drop = 0;
+
+ /* On to the next packet */
+ packet.rhqoff += packet.rsize;
+ packet.rhf_addr = (__le32 *) rcd->rcvhdrq +
+ packet.rhqoff +
+ dd->rhf_offset;
+ packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+ } else {
+ last = process_rcv_packet(&packet);
+ }
+
+ if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (seq != rcd->seq_cnt)
+ last = 1;
+ if (needset) {
+ dd_dev_info(dd,
+ "Switching to NO_DMA_RTAIL\n");
+ set_all_nodma_rtail(dd);
+ needset = 0;
+ }
+ } else {
+ if (packet.rhqoff == hdrqtail)
+ last = 1;
+ if (needset) {
+ dd_dev_info(dd,
+ "Switching to DMA_RTAIL\n");
+ set_all_dma_rtail(dd);
+ needset = 0;
+ }
+ }
+
+ process_rcv_update(last, &packet);
+ }
+
+ process_rcv_qp_work(&packet);
+
+bail:
+ /*
+ * Always write head at end, and setup rcv interrupt, even
+ * if no packets were processed.
+ */
+ finish_packet(&packet);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return -1 if the size is invalid.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+ switch (mtu) {
+ case 0: return OPA_MTU_0;
+ case 256: return OPA_MTU_256;
+ case 512: return OPA_MTU_512;
+ case 1024: return OPA_MTU_1024;
+ case 2048: return OPA_MTU_2048;
+ case 4096: return OPA_MTU_4096;
+ case 8192: return OPA_MTU_8192;
+ case 10240: return OPA_MTU_10240;
+ }
+ return default_if_bad;
+}
+
+u16 enum_to_mtu(int mtu)
+{
+ switch (mtu) {
+ case OPA_MTU_0: return 0;
+ case OPA_MTU_256: return 256;
+ case OPA_MTU_512: return 512;
+ case OPA_MTU_1024: return 1024;
+ case OPA_MTU_2048: return 2048;
+ case OPA_MTU_4096: return 4096;
+ case OPA_MTU_8192: return 8192;
+ case OPA_MTU_10240: return 10240;
+ default: return 0xffff;
+ }
+}
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * We can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size. We do not deal with what happens
+ * to programs that are already running when the size changes.
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int i, drain, ret = 0, is_up = 0;
+
+ ppd->ibmtu = 0;
+ for (i = 0; i < ppd->vls_supported; i++)
+ if (ppd->ibmtu < dd->vld[i].mtu)
+ ppd->ibmtu = dd->vld[i].mtu;
+ ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+ mutex_lock(&ppd->hls_lock);
+ if (ppd->host_link_state == HLS_UP_INIT
+ || ppd->host_link_state == HLS_UP_ARMED
+ || ppd->host_link_state == HLS_UP_ACTIVE)
+ is_up = 1;
+
+ drain = !is_ax(dd) && is_up;
+
+ if (drain)
+ /*
+ * MTU is specified per-VL. To ensure that no packet gets
+ * stuck (due, e.g., to the MTU for the packet's VL being
+ * reduced), empty the per-VL FIFOs before adjusting MTU.
+ */
+ ret = stop_drain_data_vls(dd);
+
+ if (ret) {
+ dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+ __func__);
+ goto err;
+ }
+
+ hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+ if (drain)
+ open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+ mutex_unlock(&ppd->hls_lock);
+
+ return ret;
+}
+
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ ppd->lid = lid;
+ ppd->lmc = lmc;
+ hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+ dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
+
+ return 0;
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDs, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void run_led_override(unsigned long opaque)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+ struct hfi1_devdata *dd = ppd->dd;
+ int timeoff;
+ int ph_idx;
+
+ if (!(dd->flags & HFI1_INITTED))
+ return;
+
+ ph_idx = ppd->led_override_phase++ & 1;
+ ppd->led_override = ppd->led_override_vals[ph_idx];
+ timeoff = ppd->led_override_timeoff;
+
+ /*
+ * don't re-fire the timer if user asked for it to be off; we let
+ * it fire one more time after they turn it off to simplify
+ */
+ if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+ mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int timeoff, freq;
+
+ if (!(dd->flags & HFI1_INITTED))
+ return;
+
+ /* First check if we are blinking. If not, use 1HZ polling */
+ timeoff = HZ;
+ freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+ if (freq) {
+ /* For blink, set each phase from one nybble of val */
+ ppd->led_override_vals[0] = val & 0xF;
+ ppd->led_override_vals[1] = (val >> 4) & 0xF;
+ timeoff = (HZ << 4)/freq;
+ } else {
+ /* Non-blink set both phases the same. */
+ ppd->led_override_vals[0] = val & 0xF;
+ ppd->led_override_vals[1] = val & 0xF;
+ }
+ ppd->led_override_timeoff = timeoff;
+
+ /*
+ * If the timer has not already been started, do so. Use a "quick"
+ * timeout so the function will be called soon, to look at our request.
+ */
+ if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+ /* Need to start timer */
+ init_timer(&ppd->led_override_timer);
+ ppd->led_override_timer.function = run_led_override;
+ ppd->led_override_timer.data = (unsigned long) ppd;
+ ppd->led_override_timer.expires = jiffies + 1;
+ add_timer(&ppd->led_override_timer);
+ } else {
+ if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+ mod_timer(&ppd->led_override_timer, jiffies + 1);
+ atomic_dec(&ppd->led_override_timer_active);
+ }
+}
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize. For
+ * now, we only allow this if no user contexts are open that use chip resources
+ */
+int hfi1_reset_device(int unit)
+{
+ int ret, i;
+ struct hfi1_devdata *dd = hfi1_lookup(unit);
+ struct hfi1_pportdata *ppd;
+ unsigned long flags;
+ int pidx;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+ if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+ dd_dev_info(dd,
+ "Invalid unit number %u or not initialized or not present\n",
+ unit);
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ if (dd->rcd)
+ for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+ if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+ continue;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ ret = -EBUSY;
+ goto bail;
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (atomic_read(&ppd->led_override_timer_active)) {
+ /* Need to stop LED timer, _then_ shut off LEDs */
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ }
+
+ /* Shut off LEDs after we are sure timer is not running */
+ ppd->led_override = LED_OVER_BOTH_OFF;
+ }
+ if (dd->flags & HFI1_HAS_SEND_DMA)
+ sdma_exit(dd);
+
+ hfi1_reset_cpu_counters(dd);
+
+ ret = hfi1_init(dd, 1);
+
+ if (ret)
+ dd_dev_err(dd,
+ "Reinitialize unit %u after reset failed with %d\n",
+ unit, ret);
+ else
+ dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+ unit);
+
+bail:
+ return ret;
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ u32 rte = rhf_rcv_type_err(packet->rhf);
+
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+
+ rcv_hdrerr(rcd, rcd->ppd, packet);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+ trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
+ packet->rcd->ctxt,
+ rhf_err_flags(packet->rhf),
+ RHF_RCV_TYPE_IB,
+ packet->hlen,
+ packet->tlen,
+ packet->updegr,
+ rhf_egr_index(packet->rhf));
+
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ handle_eflags(packet);
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_ib_rcv(packet);
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Bypass packets are not supported in normal operation. Dropping\n");
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+ handle_eflags(packet);
+
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled error packet received. Dropping.\n");
+
+ return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled expected packet received. Dropping.\n");
+ return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled eager packet received. Dropping.\n");
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+ dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
+ rhf_rcv_type(packet->rhf));
+ return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
new file mode 100644
index 000000000000..b61d3ae93ed1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.c
@@ -0,0 +1,475 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+/*
+ * The EPROM is logically divided into two partitions:
+ * partition 0: the first 128K, visible from PCI ROM BAR
+ * partition 1: the rest
+ */
+#define P0_SIZE (128 * 1024)
+#define P1_START P0_SIZE
+
+/* largest erase size supported by the controller */
+#define SIZE_32KB (32 * 1024)
+#define MASK_32KB (SIZE_32KB - 1)
+
+/* controller page size, in bytes */
+#define EP_PAGE_SIZE 256
+#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
+
+/* controller commands */
+#define CMD_SHIFT 24
+#define CMD_NOP (0)
+#define CMD_PAGE_PROGRAM(addr) ((0x02 << CMD_SHIFT) | addr)
+#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr)
+#define CMD_READ_SR1 ((0x05 << CMD_SHIFT))
+#define CMD_WRITE_ENABLE ((0x06 << CMD_SHIFT))
+#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
+#define CMD_CHIP_ERASE ((0x60 << CMD_SHIFT))
+#define CMD_READ_MANUF_DEV_ID ((0x90 << CMD_SHIFT))
+#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT))
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2 /* full speed */
+
+/* controller status register 1 bits */
+#define SR1_BUSY 0x1ull /* the BUSY bit in SR1 */
+
+/* sleep length while waiting for controller */
+#define WAIT_SLEEP_US 100 /* must be larger than 5 (see usage) */
+#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US))
+
+/* GPIO pins */
+#define EPROM_WP_N (1ull << 14) /* EPROM write line */
+
+/*
+ * Use the EP mutex to guard against other callers from within the driver.
+ * Also covers usage of eprom_available.
+ */
+static DEFINE_MUTEX(eprom_mutex);
+static int eprom_available; /* default: not available */
+
+/*
+ * Turn on external enable line that allows writing on the flash.
+ */
+static void write_enable(struct hfi1_devdata *dd)
+{
+ /* raise signal */
+ write_csr(dd, ASIC_GPIO_OUT,
+ read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
+ /* raise enable */
+ write_csr(dd, ASIC_GPIO_OE,
+ read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
+}
+
+/*
+ * Turn off external enable line that allows writing on the flash.
+ */
+static void write_disable(struct hfi1_devdata *dd)
+{
+ /* lower signal */
+ write_csr(dd, ASIC_GPIO_OUT,
+ read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
+ /* lower enable */
+ write_csr(dd, ASIC_GPIO_OE,
+ read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
+}
+
+/*
+ * Wait for the device to become not busy. Must be called after all
+ * write or erase operations.
+ */
+static int wait_for_not_busy(struct hfi1_devdata *dd)
+{
+ unsigned long count = 0;
+ u64 reg;
+ int ret = 0;
+
+ /* starts page mode */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
+ while (1) {
+ udelay(WAIT_SLEEP_US);
+ usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
+ count++;
+ reg = read_csr(dd, ASIC_EEP_DATA);
+ if ((reg & SR1_BUSY) == 0)
+ break;
+ /* 200s is the largest time for a 128Mb device */
+ if (count > COUNT_DELAY_SEC(200)) {
+ dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
+ ret = -ETIMEDOUT;
+ break; /* break, not goto - must stop page mode */
+ }
+ }
+
+ /* stop page mode with a NOP */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
+
+ return ret;
+}
+
+/*
+ * Read the device ID from the SPI controller.
+ */
+static u32 read_device_id(struct hfi1_devdata *dd)
+{
+ /* read the Manufacture Device ID */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
+ return (u32)read_csr(dd, ASIC_EEP_DATA);
+}
+
+/*
+ * Erase the whole flash.
+ */
+static int erase_chip(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ write_enable(dd);
+
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
+ ret = wait_for_not_busy(dd);
+
+ write_disable(dd);
+
+ return ret;
+}
+
+/*
+ * Erase a range using the 32KB erase command.
+ */
+static int erase_32kb_range(struct hfi1_devdata *dd, u32 start, u32 end)
+{
+ int ret = 0;
+
+ if (end < start)
+ return -EINVAL;
+
+ if ((start & MASK_32KB) || (end & MASK_32KB)) {
+ dd_dev_err(dd,
+ "%s: non-aligned range (0x%x,0x%x) for a 32KB erase\n",
+ __func__, start, end);
+ return -EINVAL;
+ }
+
+ write_enable(dd);
+
+ for (; start < end; start += SIZE_32KB) {
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+ write_csr(dd, ASIC_EEP_ADDR_CMD,
+ CMD_SECTOR_ERASE_32KB(start));
+ ret = wait_for_not_busy(dd);
+ if (ret)
+ goto done;
+ }
+
+done:
+ write_disable(dd);
+
+ return ret;
+}
+
+/*
+ * Read a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
+{
+ int i;
+
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
+ for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++)
+ result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
+}
+
+/*
+ * Read length bytes starting at offset. Copy to user address addr.
+ */
+static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+ u32 offset;
+ u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+ int ret = 0;
+
+ /* reject anything not on an EPROM page boundary */
+ if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+ return -EINVAL;
+
+ for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+ read_page(dd, start + offset, buffer);
+ if (copy_to_user((void __user *)(addr + offset),
+ buffer, EP_PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+/*
+ * Write a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
+{
+ int i;
+
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+ write_csr(dd, ASIC_EEP_DATA, data[0]);
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
+ for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++)
+ write_csr(dd, ASIC_EEP_DATA, data[i]);
+ /* will close the open page */
+ return wait_for_not_busy(dd);
+}
+
+/*
+ * Write length bytes starting at offset. Read from user address addr.
+ */
+static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+ u32 offset;
+ u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+ int ret = 0;
+
+ /* reject anything not on an EPROM page boundary */
+ if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+ return -EINVAL;
+
+ write_enable(dd);
+
+ for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+ if (copy_from_user(buffer, (void __user *)(addr + offset),
+ EP_PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ ret = write_page(dd, start + offset, buffer);
+ if (ret)
+ goto done;
+ }
+
+done:
+ write_disable(dd);
+ return ret;
+}
+
+/*
+ * Perform the given operation on the EPROM. Called from user space. The
+ * user credentials have already been checked.
+ *
+ * Return 0 on success, -ERRNO on error
+ */
+int handle_eprom_command(const struct hfi1_cmd *cmd)
+{
+ struct hfi1_devdata *dd;
+ u32 dev_id;
+ int ret = 0;
+
+ /*
+ * The EPROM is per-device, so use unit 0 as that will always
+ * exist.
+ */
+ dd = hfi1_lookup(0);
+ if (!dd) {
+ pr_err("%s: cannot find unit 0!\n", __func__);
+ return -EINVAL;
+ }
+
+ /* lock against other callers touching the ASIC block */
+ mutex_lock(&eprom_mutex);
+
+ /* some platforms do not have an EPROM */
+ if (!eprom_available) {
+ ret = -ENOSYS;
+ goto done_asic;
+ }
+
+ /* lock against the other HFI on another OS */
+ ret = acquire_hw_mutex(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to acquire hw mutex, no EPROM support\n",
+ __func__);
+ goto done_asic;
+ }
+
+ dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
+ __func__, cmd->type, cmd->len, cmd->addr);
+
+ switch (cmd->type) {
+ case HFI1_CMD_EP_INFO:
+ if (cmd->len != sizeof(u32)) {
+ ret = -ERANGE;
+ break;
+ }
+ dev_id = read_device_id(dd);
+ /* addr points to a u32 user buffer */
+ if (copy_to_user((void __user *)cmd->addr, &dev_id,
+ sizeof(u32)))
+ ret = -EFAULT;
+ break;
+ case HFI1_CMD_EP_ERASE_CHIP:
+ ret = erase_chip(dd);
+ break;
+ case HFI1_CMD_EP_ERASE_P0:
+ if (cmd->len != P0_SIZE) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = erase_32kb_range(dd, 0, cmd->len);
+ break;
+ case HFI1_CMD_EP_ERASE_P1:
+ /* check for overflow */
+ if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = erase_32kb_range(dd, P1_START, P1_START + cmd->len);
+ break;
+ case HFI1_CMD_EP_READ_P0:
+ if (cmd->len != P0_SIZE) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = read_length(dd, 0, cmd->len, cmd->addr);
+ break;
+ case HFI1_CMD_EP_READ_P1:
+ /* check for overflow */
+ if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = read_length(dd, P1_START, cmd->len, cmd->addr);
+ break;
+ case HFI1_CMD_EP_WRITE_P0:
+ if (cmd->len > P0_SIZE) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = write_length(dd, 0, cmd->len, cmd->addr);
+ break;
+ case HFI1_CMD_EP_WRITE_P1:
+ /* check for overflow */
+ if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+ ret = -ERANGE;
+ break;
+ }
+ ret = write_length(dd, P1_START, cmd->len, cmd->addr);
+ break;
+ default:
+ dd_dev_err(dd, "%s: unexpected command %d\n",
+ __func__, cmd->type);
+ ret = -EINVAL;
+ break;
+ }
+
+ release_hw_mutex(dd);
+done_asic:
+ mutex_unlock(&eprom_mutex);
+ return ret;
+}
+
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ /* only the discrete chip has an EPROM, nothing to do */
+ if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+ return 0;
+
+ /* lock against other callers */
+ mutex_lock(&eprom_mutex);
+ if (eprom_available) /* already initialized */
+ goto done_asic;
+
+ /*
+ * Lock against the other HFI on another OS - the mutex above
+ * would have caught anything in this driver. It is OK if
+ * both OSes reset the EPROM - as long as they don't do it at
+ * the same time.
+ */
+ ret = acquire_hw_mutex(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to acquire hw mutex, no EPROM support\n",
+ __func__);
+ goto done_asic;
+ }
+
+ /* reset EPROM to be sure it is in a good state */
+
+ /* set reset */
+ write_csr(dd, ASIC_EEP_CTL_STAT,
+ ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+ /* clear reset, set speed */
+ write_csr(dd, ASIC_EEP_CTL_STAT,
+ EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+ /* wake the device with command "release powerdown NoID" */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+ eprom_available = 1;
+ release_hw_mutex(dd);
+done_asic:
+ mutex_unlock(&eprom_mutex);
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
new file mode 100644
index 000000000000..64a64276be81
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/eprom.h
@@ -0,0 +1,55 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
new file mode 100644
index 000000000000..72d38500d8ce
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -0,0 +1,2144 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <asm/pgtable.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <linux/uio.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "eprom.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_file_write(struct file *, const char __user *,
+ size_t, loff_t *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *,
+ int, unsigned);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+ struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
+static int exp_tid_free(struct file *, struct hfi1_tid_info *);
+static void unlock_exp_tids(struct hfi1_ctxtdata *);
+
+static const struct file_operations hfi1_file_ops = {
+ .owner = THIS_MODULE,
+ .write = hfi1_file_write,
+ .write_iter = hfi1_write_iter,
+ .open = hfi1_file_open,
+ .release = hfi1_file_close,
+ .poll = hfi1_poll,
+ .mmap = hfi1_file_mmap,
+ .llseek = noop_llseek,
+};
+
+static struct vm_operations_struct vm_ops = {
+ .fault = vma_fault,
+};
+
+/*
+ * Types of memories mapped into user processes' space
+ */
+enum mmap_types {
+ PIO_BUFS = 1,
+ PIO_BUFS_SOP,
+ PIO_CRED,
+ RCV_HDRQ,
+ RCV_EGRBUF,
+ UREGS,
+ EVENTS,
+ STATUS,
+ RTAIL,
+ SUBCTXT_UREGS,
+ SUBCTXT_RCV_HDRQ,
+ SUBCTXT_EGRBUF,
+ SDMA_COMP
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK 0xfffULL
+#define HFI1_MMAP_OFFSET_SHIFT 0
+#define HFI1_MMAP_SUBCTXT_MASK 0xfULL
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK 0xffULL
+#define HFI1_MMAP_CTXT_SHIFT 16
+#define HFI1_MMAP_TYPE_MASK 0xfULL
+#define HFI1_MMAP_TYPE_SHIFT 24
+#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL
+#define HFI1_MMAP_MAGIC_SHIFT 32
+
+#define HFI1_MMAP_MAGIC 0xdabbad00
+
+#define HFI1_MMAP_TOKEN_SET(field, val) \
+ (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+ (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \
+ (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+ HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+ HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+ HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+ HFI1_MMAP_TOKEN_SET(OFFSET, ((unsigned long)addr & ~PAGE_MASK)))
+
+#define EXP_TID_SET(field, value) \
+ (((value) & EXP_TID_TID##field##_MASK) << \
+ EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) { \
+ (tid) &= ~(EXP_TID_TID##field##_MASK << \
+ EXP_TID_TID##field##_SHIFT); \
+ }
+#define EXP_TID_RESET(tid, field, value) do { \
+ EXP_TID_CLEAR(tid, field); \
+ (tid) |= EXP_TID_SET(field, value); \
+ } while (0)
+
+#define dbg(fmt, ...) \
+ pr_info(fmt, ##__VA_ARGS__)
+
+
+static inline int is_valid_mmap(u64 token)
+{
+ return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
+}
+
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+ /* The real work is performed later in assign_ctxt() */
+ fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
+ if (fp->private_data) /* no cpu affinity by default */
+ ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
+ return fp->private_data ? 0 : -ENOMEM;
+}
+
+static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *offset)
+{
+ const struct hfi1_cmd __user *ucmd;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_cmd cmd;
+ struct hfi1_user_info uinfo;
+ struct hfi1_tid_info tinfo;
+ ssize_t consumed = 0, copy = 0, ret = 0;
+ void *dest = NULL;
+ __u64 user_val = 0;
+ int uctxt_required = 1;
+ int must_be_root = 0;
+
+ if (count < sizeof(cmd)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ucmd = (const struct hfi1_cmd __user *)data;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed = sizeof(cmd);
+
+ switch (cmd.type) {
+ case HFI1_CMD_ASSIGN_CTXT:
+ uctxt_required = 0; /* assigned user context not required */
+ copy = sizeof(uinfo);
+ dest = &uinfo;
+ break;
+ case HFI1_CMD_SDMA_STATUS_UPD:
+ case HFI1_CMD_CREDIT_UPD:
+ copy = 0;
+ break;
+ case HFI1_CMD_TID_UPDATE:
+ case HFI1_CMD_TID_FREE:
+ copy = sizeof(tinfo);
+ dest = &tinfo;
+ break;
+ case HFI1_CMD_USER_INFO:
+ case HFI1_CMD_RECV_CTRL:
+ case HFI1_CMD_POLL_TYPE:
+ case HFI1_CMD_ACK_EVENT:
+ case HFI1_CMD_CTXT_INFO:
+ case HFI1_CMD_SET_PKEY:
+ case HFI1_CMD_CTXT_RESET:
+ copy = 0;
+ user_val = cmd.addr;
+ break;
+ case HFI1_CMD_EP_INFO:
+ case HFI1_CMD_EP_ERASE_CHIP:
+ case HFI1_CMD_EP_ERASE_P0:
+ case HFI1_CMD_EP_ERASE_P1:
+ case HFI1_CMD_EP_READ_P0:
+ case HFI1_CMD_EP_READ_P1:
+ case HFI1_CMD_EP_WRITE_P0:
+ case HFI1_CMD_EP_WRITE_P1:
+ uctxt_required = 0; /* assigned user context not required */
+ must_be_root = 1; /* validate user */
+ copy = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* If the command comes with user data, copy it. */
+ if (copy) {
+ if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ consumed += copy;
+ }
+
+ /*
+ * Make sure there is a uctxt when needed.
+ */
+ if (uctxt_required && !uctxt) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* only root can do these operations */
+ if (must_be_root && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto bail;
+ }
+
+ switch (cmd.type) {
+ case HFI1_CMD_ASSIGN_CTXT:
+ ret = assign_ctxt(fp, &uinfo);
+ if (ret < 0)
+ goto bail;
+ ret = setup_ctxt(fp);
+ if (ret)
+ goto bail;
+ ret = user_init(fp);
+ break;
+ case HFI1_CMD_CTXT_INFO:
+ ret = get_ctxt_info(fp, (void __user *)(unsigned long)
+ user_val, cmd.len);
+ break;
+ case HFI1_CMD_USER_INFO:
+ ret = get_base_info(fp, (void __user *)(unsigned long)
+ user_val, cmd.len);
+ break;
+ case HFI1_CMD_SDMA_STATUS_UPD:
+ break;
+ case HFI1_CMD_CREDIT_UPD:
+ if (uctxt && uctxt->sc)
+ sc_return_credits(uctxt->sc);
+ break;
+ case HFI1_CMD_TID_UPDATE:
+ ret = exp_tid_setup(fp, &tinfo);
+ if (!ret) {
+ unsigned long addr;
+ /*
+ * Copy the number of tidlist entries we used
+ * and the length of the buffer we registered.
+ * These fields are adjacent in the structure so
+ * we can copy them at the same time.
+ */
+ addr = (unsigned long)cmd.addr +
+ offsetof(struct hfi1_tid_info, tidcnt);
+ if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+ sizeof(tinfo.tidcnt) +
+ sizeof(tinfo.length)))
+ ret = -EFAULT;
+ }
+ break;
+ case HFI1_CMD_TID_FREE:
+ ret = exp_tid_free(fp, &tinfo);
+ break;
+ case HFI1_CMD_RECV_CTRL:
+ ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val);
+ break;
+ case HFI1_CMD_POLL_TYPE:
+ uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
+ break;
+ case HFI1_CMD_ACK_EVENT:
+ ret = user_event_ack(uctxt, subctxt_fp(fp), user_val);
+ break;
+ case HFI1_CMD_SET_PKEY:
+ if (HFI1_CAP_IS_USET(PKEY_CHECK))
+ ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val);
+ else
+ ret = -EPERM;
+ break;
+ case HFI1_CMD_CTXT_RESET: {
+ struct send_context *sc;
+ struct hfi1_devdata *dd;
+
+ if (!uctxt || !uctxt->dd || !uctxt->sc) {
+ ret = -EINVAL;
+ break;
+ }
+ /*
+ * There is no protection here. User level has to
+ * guarantee that no one will be writing to the send
+ * context while it is being re-initialized.
+ * If user level breaks that guarantee, it will break
+ * it's own context and no one else's.
+ */
+ dd = uctxt->dd;
+ sc = uctxt->sc;
+ /*
+ * Wait until the interrupt handler has marked the
+ * context as halted or frozen. Report error if we time
+ * out.
+ */
+ wait_event_interruptible_timeout(
+ sc->halt_wait, (sc->flags & SCF_HALTED),
+ msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+ if (!(sc->flags & SCF_HALTED)) {
+ ret = -ENOLCK;
+ break;
+ }
+ /*
+ * If the send context was halted due to a Freeze,
+ * wait until the device has been "unfrozen" before
+ * resetting the context.
+ */
+ if (sc->flags & SCF_FROZEN) {
+ wait_event_interruptible_timeout(
+ dd->event_queue,
+ !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+ msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+ if (dd->flags & HFI1_FROZEN) {
+ ret = -ENOLCK;
+ break;
+ }
+ if (dd->flags & HFI1_FORCED_FREEZE) {
+ /* Don't allow context reset if we are into
+ * forced freeze */
+ ret = -ENODEV;
+ break;
+ }
+ sc_disable(sc);
+ ret = sc_enable(sc);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+ uctxt->ctxt);
+ } else
+ ret = sc_restart(sc);
+ if (!ret)
+ sc_return_credits(sc);
+ break;
+ }
+ case HFI1_CMD_EP_INFO:
+ case HFI1_CMD_EP_ERASE_CHIP:
+ case HFI1_CMD_EP_ERASE_P0:
+ case HFI1_CMD_EP_ERASE_P1:
+ case HFI1_CMD_EP_READ_P0:
+ case HFI1_CMD_EP_READ_P1:
+ case HFI1_CMD_EP_WRITE_P0:
+ case HFI1_CMD_EP_WRITE_P1:
+ ret = handle_eprom_command(&cmd);
+ break;
+ }
+
+ if (ret >= 0)
+ ret = consumed;
+bail:
+ return ret;
+}
+
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+ struct hfi1_user_sdma_pkt_q *pq;
+ struct hfi1_user_sdma_comp_q *cq;
+ int ret = 0, done = 0, reqs = 0;
+ unsigned long dim = from->nr_segs;
+
+ if (!user_sdma_comp_fp(kiocb->ki_filp) ||
+ !user_sdma_pkt_fp(kiocb->ki_filp)) {
+ ret = -EIO;
+ goto done;
+ }
+
+ if (!iter_is_iovec(from) || !dim) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+ ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp),
+ dim);
+ pq = user_sdma_pkt_fp(kiocb->ki_filp);
+ cq = user_sdma_comp_fp(kiocb->ki_filp);
+
+ if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
+ ret = -ENOSPC;
+ goto done;
+ }
+
+ while (dim) {
+ unsigned long count = 0;
+
+ ret = hfi1_user_sdma_process_request(
+ kiocb->ki_filp, (struct iovec *)(from->iov + done),
+ dim, &count);
+ if (ret)
+ goto done;
+ dim -= count;
+ done += count;
+ reqs++;
+ }
+done:
+ return ret ? ret : reqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct hfi1_ctxtdata *uctxt;
+ struct hfi1_devdata *dd;
+ unsigned long flags, pfn;
+ u64 token = vma->vm_pgoff << PAGE_SHIFT,
+ memaddr = 0;
+ u8 subctxt, mapio = 0, vmf = 0, type;
+ ssize_t memlen = 0;
+ int ret = 0;
+ u16 ctxt;
+
+ uctxt = ctxt_fp(fp);
+ if (!is_valid_mmap(token) || !uctxt ||
+ !(vma->vm_flags & VM_SHARED)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ dd = uctxt->dd;
+ ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+ subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+ type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+ if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ flags = vma->vm_flags;
+
+ switch (type) {
+ case PIO_BUFS:
+ case PIO_BUFS_SOP:
+ memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+ /* chip pio base */
+ (uctxt->sc->hw_context * (1 << 16))) +
+ /* 64K PIO space / ctxt */
+ (type == PIO_BUFS_SOP ?
+ (TXE_PIO_SIZE / 2) : 0); /* sop? */
+ /*
+ * Map only the amount allocated to the context, not the
+ * entire available context's PIO space.
+ */
+ memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE,
+ PAGE_SIZE);
+ flags &= ~VM_MAYREAD;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ mapio = 1;
+ break;
+ case PIO_CRED:
+ if (flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ /*
+ * The credit return location for this context could be on the
+ * second or third page allocated for credit returns (if number
+ * of enabled contexts > 64 and 128 respectively).
+ */
+ memaddr = dd->cr_base[uctxt->numa_id].pa +
+ (((u64)uctxt->sc->hw_free -
+ (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+ memlen = PAGE_SIZE;
+ flags &= ~VM_MAYWRITE;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ /*
+ * The driver has already allocated memory for credit
+ * returns and programmed it into the chip. Has that
+ * memory been flagged as non-cached?
+ */
+ /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+ mapio = 1;
+ break;
+ case RCV_HDRQ:
+ memaddr = uctxt->rcvhdrq_phys;
+ memlen = uctxt->rcvhdrq_size;
+ break;
+ case RCV_EGRBUF: {
+ unsigned long addr;
+ int i;
+ /*
+ * The RcvEgr buffer need to be handled differently
+ * as multiple non-contiguous pages need to be mapped
+ * into the user process.
+ */
+ memlen = uctxt->egrbufs.size;
+ if ((vma->vm_end - vma->vm_start) != memlen) {
+ dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+ (vma->vm_end - vma->vm_start), memlen);
+ ret = -EINVAL;
+ goto done;
+ }
+ if (vma->vm_flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ vma->vm_flags &= ~VM_MAYWRITE;
+ addr = vma->vm_start;
+ for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+ ret = remap_pfn_range(
+ vma, addr,
+ uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+ uctxt->egrbufs.buffers[i].len,
+ vma->vm_page_prot);
+ if (ret < 0)
+ goto done;
+ addr += uctxt->egrbufs.buffers[i].len;
+ }
+ ret = 0;
+ goto done;
+ }
+ case UREGS:
+ /*
+ * Map only the page that contains this context's user
+ * registers.
+ */
+ memaddr = (unsigned long)
+ (dd->physaddr + RXE_PER_CONTEXT_USER)
+ + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+ /*
+ * TidFlow table is on the same page as the rest of the
+ * user registers.
+ */
+ memlen = PAGE_SIZE;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ mapio = 1;
+ break;
+ case EVENTS:
+ /*
+ * Use the page where this context's flags are. User level
+ * knows where it's own bitmap is within the page.
+ */
+ memaddr = ((unsigned long)dd->events +
+ ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+ memlen = PAGE_SIZE;
+ /*
+ * v3.7 removes VM_RESERVED but the effect is kept by
+ * using VM_IO.
+ */
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case STATUS:
+ memaddr = kvirt_to_phys((void *)dd->status);
+ memlen = PAGE_SIZE;
+ flags |= VM_IO | VM_DONTEXPAND;
+ break;
+ case RTAIL:
+ if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+ /*
+ * If the memory allocation failed, the context alloc
+ * also would have failed, so we would never get here
+ */
+ ret = -EINVAL;
+ goto done;
+ }
+ if (flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ memaddr = uctxt->rcvhdrqtailaddr_phys;
+ memlen = PAGE_SIZE;
+ flags &= ~VM_MAYWRITE;
+ break;
+ case SUBCTXT_UREGS:
+ memaddr = (u64)uctxt->subctxt_uregbase;
+ memlen = PAGE_SIZE;
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case SUBCTXT_RCV_HDRQ:
+ memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+ memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case SUBCTXT_EGRBUF:
+ memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+ memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+ flags |= VM_IO | VM_DONTEXPAND;
+ flags &= ~VM_MAYWRITE;
+ vmf = 1;
+ break;
+ case SDMA_COMP: {
+ struct hfi1_user_sdma_comp_q *cq;
+
+ if (!user_sdma_comp_fp(fp)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ cq = user_sdma_comp_fp(fp);
+ memaddr = (u64)cq->comps;
+ memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if ((vma->vm_end - vma->vm_start) != memlen) {
+ hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+ uctxt->ctxt, subctxt_fp(fp),
+ (vma->vm_end - vma->vm_start), memlen);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ vma->vm_flags = flags;
+ dd_dev_info(dd,
+ "%s: %u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+ __func__, ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+ vma->vm_end - vma->vm_start, vma->vm_flags);
+ pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
+ if (vmf) {
+ vma->vm_pgoff = pfn;
+ vma->vm_ops = &vm_ops;
+ ret = 0;
+ } else if (mapio) {
+ ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+ vma->vm_page_prot);
+ } else {
+ ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+ vma->vm_page_prot);
+ }
+done:
+ return ret;
+}
+
+/*
+ * Local (non-chip) user memory is not mapped right away but as it is
+ * accessed by the user-level code.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page;
+
+ page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+ struct hfi1_ctxtdata *uctxt;
+ unsigned pollflag;
+
+ uctxt = ctxt_fp(fp);
+ if (!uctxt)
+ pollflag = POLLERR;
+ else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+ pollflag = poll_urgent(fp, pt);
+ else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+ pollflag = poll_next(fp, pt);
+ else /* invalid */
+ pollflag = POLLERR;
+
+ return pollflag;
+}
+
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+ struct hfi1_filedata *fdata = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+ struct hfi1_devdata *dd;
+ unsigned long flags, *ev;
+
+ fp->private_data = NULL;
+
+ if (!uctxt)
+ goto done;
+
+ hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+ dd = uctxt->dd;
+ mutex_lock(&hfi1_mutex);
+
+ flush_wc();
+ /* drain user sdma queue */
+ if (fdata->pq)
+ hfi1_user_sdma_free_queues(fdata);
+
+ /*
+ * Clear any left over, unhandled events so the next process that
+ * gets this context doesn't get confused.
+ */
+ ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+ *ev = 0;
+
+ if (--uctxt->cnt) {
+ uctxt->active_slaves &= ~(1 << fdata->subctxt);
+ uctxt->subpid[fdata->subctxt] = 0;
+ mutex_unlock(&hfi1_mutex);
+ goto done;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ /*
+ * Disable receive context and interrupt available, reset all
+ * RcvCtxtCtrl bits to default values.
+ */
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_TIDFLOW_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+ HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+ HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+ /* Clear the context's J_KEY */
+ hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+ /*
+ * Reset context integrity checks to default.
+ * (writes to CSRs probably belong in chip.c)
+ */
+ write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+ hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+ sc_disable(uctxt->sc);
+ uctxt->pid = 0;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ dd->rcd[uctxt->ctxt] = NULL;
+ uctxt->rcvwait_to = 0;
+ uctxt->piowait_to = 0;
+ uctxt->rcvnowait = 0;
+ uctxt->pionowait = 0;
+ uctxt->event_flags = 0;
+
+ hfi1_clear_tids(uctxt);
+ hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+ if (uctxt->tid_pg_list)
+ unlock_exp_tids(uctxt);
+
+ hfi1_stats.sps_ctxts--;
+ dd->freectxts++;
+ mutex_unlock(&hfi1_mutex);
+ hfi1_free_ctxtdata(dd, uctxt);
+done:
+ kfree(fdata);
+ return 0;
+}
+
+/*
+ * Convert kernel *virtual* addresses to physical addresses.
+ * This is used to vmalloc'ed addresses.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+ struct page *page;
+ u64 paddr = 0;
+
+ page = vmalloc_to_page(addr);
+ if (page)
+ paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+ int i_minor, ret = 0;
+ unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
+
+ swmajor = uinfo->userversion >> 16;
+ if (swmajor != HFI1_USER_SWMAJOR) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ swminor = uinfo->userversion & 0xffff;
+
+ if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
+ alg = uinfo->hfi1_alg;
+
+ mutex_lock(&hfi1_mutex);
+ /* First, lets check if we need to setup a shared context? */
+ if (uinfo->subctxt_cnt)
+ ret = find_shared_ctxt(fp, uinfo);
+
+ /*
+ * We execute the following block if we couldn't find a
+ * shared context or if context sharing is not required.
+ */
+ if (!ret) {
+ i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+ ret = get_user_context(fp, uinfo, i_minor - 1, alg);
+ }
+ mutex_unlock(&hfi1_mutex);
+done:
+ return ret;
+}
+
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+ int devno, unsigned alg)
+{
+ struct hfi1_devdata *dd = NULL;
+ int ret = 0, devmax, npresent, nup, dev;
+
+ devmax = hfi1_count_units(&npresent, &nup);
+ if (!npresent) {
+ ret = -ENXIO;
+ goto done;
+ }
+ if (!nup) {
+ ret = -ENETDOWN;
+ goto done;
+ }
+ if (devno >= 0) {
+ dd = hfi1_lookup(devno);
+ if (!dd)
+ ret = -ENODEV;
+ else if (!dd->freectxts)
+ ret = -EBUSY;
+ } else {
+ struct hfi1_devdata *pdd;
+
+ if (alg == HFI1_ALG_ACROSS) {
+ unsigned free = 0U;
+
+ for (dev = 0; dev < devmax; dev++) {
+ pdd = hfi1_lookup(dev);
+ if (pdd && pdd->freectxts &&
+ pdd->freectxts > free) {
+ dd = pdd;
+ free = pdd->freectxts;
+ }
+ }
+ } else {
+ for (dev = 0; dev < devmax; dev++) {
+ pdd = hfi1_lookup(dev);
+ if (pdd && pdd->freectxts) {
+ dd = pdd;
+ break;
+ }
+ }
+ }
+ if (!dd)
+ ret = -EBUSY;
+ }
+done:
+ return ret ? ret : allocate_ctxt(fp, dd, uinfo);
+}
+
+static int find_shared_ctxt(struct file *fp,
+ const struct hfi1_user_info *uinfo)
+{
+ int devmax, ndev, i;
+ int ret = 0;
+
+ devmax = hfi1_count_units(NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct hfi1_devdata *dd = hfi1_lookup(ndev);
+
+ /* device portion of usable() */
+ if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
+ continue;
+ for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+ struct hfi1_ctxtdata *uctxt = dd->rcd[i];
+
+ /* Skip ctxts which are not yet open */
+ if (!uctxt || !uctxt->cnt)
+ continue;
+ /* Skip ctxt if it doesn't match the requested one */
+ if (memcmp(uctxt->uuid, uinfo->uuid,
+ sizeof(uctxt->uuid)) ||
+ uctxt->subctxt_id != uinfo->subctxt_id ||
+ uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+ continue;
+
+ /* Verify the sharing process matches the master */
+ if (uctxt->userversion != uinfo->userversion ||
+ uctxt->cnt >= uctxt->subctxt_cnt) {
+ ret = -EINVAL;
+ goto done;
+ }
+ ctxt_fp(fp) = uctxt;
+ subctxt_fp(fp) = uctxt->cnt++;
+ uctxt->subpid[subctxt_fp(fp)] = current->pid;
+ uctxt->active_slaves |= 1 << subctxt_fp(fp);
+ ret = 1;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+ struct hfi1_user_info *uinfo)
+{
+ struct hfi1_ctxtdata *uctxt;
+ unsigned ctxt;
+ int ret;
+
+ if (dd->flags & HFI1_FROZEN) {
+ /*
+ * Pick an error that is unique from all other errors
+ * that are returned so the user process knows that
+ * it tried to allocate while the SPC was frozen. It
+ * it should be able to retry with success in a short
+ * while.
+ */
+ return -EIO;
+ }
+
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+ if (!dd->rcd[ctxt])
+ break;
+
+ if (ctxt == dd->num_rcv_contexts)
+ return -EBUSY;
+
+ uctxt = hfi1_create_ctxtdata(dd->pport, ctxt);
+ if (!uctxt) {
+ dd_dev_err(dd,
+ "Unable to allocate ctxtdata memory, failing open\n");
+ return -ENOMEM;
+ }
+ /*
+ * Allocate and enable a PIO send context.
+ */
+ uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+ uctxt->numa_id);
+ if (!uctxt->sc)
+ return -ENOMEM;
+
+ dbg("allocated send context %u(%u)\n", uctxt->sc->sw_index,
+ uctxt->sc->hw_context);
+ ret = sc_enable(uctxt->sc);
+ if (ret)
+ return ret;
+ /*
+ * Setup shared context resources if the user-level has requested
+ * shared contexts and this is the 'master' process.
+ * This has to be done here so the rest of the sub-contexts find the
+ * proper master.
+ */
+ if (uinfo->subctxt_cnt && !subctxt_fp(fp)) {
+ ret = init_subctxts(uctxt, uinfo);
+ /*
+ * On error, we don't need to disable and de-allocate the
+ * send context because it will be done during file close
+ */
+ if (ret)
+ return ret;
+ }
+ uctxt->userversion = uinfo->userversion;
+ uctxt->pid = current->pid;
+ uctxt->flags = HFI1_CAP_UGET(MASK);
+ init_waitqueue_head(&uctxt->wait);
+ strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+ memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+ uctxt->jkey = generate_jkey(current_uid());
+ INIT_LIST_HEAD(&uctxt->sdma_queues);
+ spin_lock_init(&uctxt->sdma_qlock);
+ hfi1_stats.sps_ctxts++;
+ dd->freectxts--;
+ ctxt_fp(fp) = uctxt;
+
+ return 0;
+}
+
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+ const struct hfi1_user_info *uinfo)
+{
+ int ret = 0;
+ unsigned num_subctxts;
+
+ num_subctxts = uinfo->subctxt_cnt;
+ if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+ uctxt->subctxt_id = uinfo->subctxt_id;
+ uctxt->active_slaves = 1;
+ uctxt->redirect_seq_cnt = 1;
+ set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+bail:
+ return ret;
+}
+
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+ int ret = 0;
+ unsigned num_subctxts = uctxt->subctxt_cnt;
+
+ uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+ if (!uctxt->subctxt_uregbase) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ /* We can take the size of the RcvHdr Queue from the master */
+ uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+ num_subctxts);
+ if (!uctxt->subctxt_rcvhdr_base) {
+ ret = -ENOMEM;
+ goto bail_ureg;
+ }
+
+ uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+ num_subctxts);
+ if (!uctxt->subctxt_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail_rhdr;
+ }
+ goto bail;
+bail_rhdr:
+ vfree(uctxt->subctxt_rcvhdr_base);
+bail_ureg:
+ vfree(uctxt->subctxt_uregbase);
+ uctxt->subctxt_uregbase = NULL;
+bail:
+ return ret;
+}
+
+static int user_init(struct file *fp)
+{
+ int ret;
+ unsigned int rcvctrl_ops = 0;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+
+ /* make sure that the context has already been setup */
+ if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ /*
+ * Subctxts don't need to initialize anything since master
+ * has done it.
+ */
+ if (subctxt_fp(fp)) {
+ ret = wait_event_interruptible(uctxt->wait,
+ !test_bit(HFI1_CTXT_MASTER_UNINIT,
+ &uctxt->event_flags));
+ goto done;
+ }
+
+ /* initialize poll variables... */
+ uctxt->urgent = 0;
+ uctxt->urgent_poll = 0;
+
+ /*
+ * Now enable the ctxt for receive.
+ * For chips that are set to DMA the tail register to memory
+ * when they change (and when the update bit transitions from
+ * 0 to 1. So for those chips, we turn it off and then back on.
+ * This will (very briefly) affect any other open ctxts, but the
+ * duration is very short, and therefore isn't an issue. We
+ * explicitly set the in-memory tail copy to 0 beforehand, so we
+ * don't have to wait to be sure the DMA update has happened
+ * (chip resets head/tail to 0 on transition to enable).
+ */
+ if (uctxt->rcvhdrtail_kvaddr)
+ clear_rcvhdrtail(uctxt);
+
+ /* Setup J_KEY before enabling the context */
+ hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+ rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+ if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+ rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+ /*
+ * Ignore the bit in the flags for now until proper
+ * support for multiple packet per rcv array entry is
+ * added.
+ */
+ if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+ rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+ if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+ rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+ rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+ if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+ rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+ hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+ /* Notify any waiting slaves */
+ if (uctxt->subctxt_cnt) {
+ clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+ wake_up(&uctxt->wait);
+ }
+ ret = 0;
+
+done:
+ return ret;
+}
+
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+ struct hfi1_ctxt_info cinfo;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_filedata *fd = fp->private_data;
+ int ret = 0;
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ ret = hfi1_get_base_kinfo(uctxt, &cinfo);
+ if (ret < 0)
+ goto done;
+ cinfo.num_active = hfi1_count_active_units();
+ cinfo.unit = uctxt->dd->unit;
+ cinfo.ctxt = uctxt->ctxt;
+ cinfo.subctxt = subctxt_fp(fp);
+ cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+ uctxt->dd->rcv_entries.group_size) +
+ uctxt->expected_count;
+ cinfo.credits = uctxt->sc->credits;
+ cinfo.numa_node = uctxt->numa_id;
+ cinfo.rec_cpu = fd->rec_cpu_num;
+ cinfo.send_ctxt = uctxt->sc->hw_context;
+
+ cinfo.egrtids = uctxt->egrbufs.alloced;
+ cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+ cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+ cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries;
+ cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+ trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo);
+ if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+ ret = -EFAULT;
+done:
+ return ret;
+}
+
+static int setup_ctxt(struct file *fp)
+{
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ int ret = 0;
+
+ /*
+ * Context should be set up only once (including allocation and
+ * programming of eager buffers. This is done if context sharing
+ * is not requested or by the master process.
+ */
+ if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) {
+ ret = hfi1_init_ctxt(uctxt->sc);
+ if (ret)
+ goto done;
+
+ /* Now allocate the RcvHdr queue and eager buffers. */
+ ret = hfi1_create_rcvhdrq(dd, uctxt);
+ if (ret)
+ goto done;
+ ret = hfi1_setup_eagerbufs(uctxt);
+ if (ret)
+ goto done;
+ if (uctxt->subctxt_cnt && !subctxt_fp(fp)) {
+ ret = setup_subctxt(uctxt);
+ if (ret)
+ goto done;
+ }
+ /* Setup Expected Rcv memories */
+ uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
+ sizeof(struct page **));
+ if (!uctxt->tid_pg_list) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ uctxt->physshadow = vzalloc(uctxt->expected_count *
+ sizeof(*uctxt->physshadow));
+ if (!uctxt->physshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ /* allocate expected TID map and initialize the cursor */
+ atomic_set(&uctxt->tidcursor, 0);
+ uctxt->numtidgroups = uctxt->expected_count /
+ dd->rcv_entries.group_size;
+ uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
+ !!(uctxt->numtidgroups % BITS_PER_LONG);
+ uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
+ sizeof(*uctxt->tidusemap),
+ GFP_KERNEL, uctxt->numa_id);
+ if (!uctxt->tidusemap) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ /*
+ * In case that the number of groups is not a multiple of
+ * 64 (the number of groups in a tidusemap element), mark
+ * the extra ones as used. This will effectively make them
+ * permanently used and should never be assigned. Otherwise,
+ * the code which checks how many free groups we have will
+ * get completely confused about the state of the bits.
+ */
+ if (uctxt->numtidgroups % BITS_PER_LONG)
+ uctxt->tidusemap[uctxt->tidmapcnt - 1] =
+ ~((1ULL << (uctxt->numtidgroups %
+ BITS_PER_LONG)) - 1);
+ trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0,
+ uctxt->tidusemap, uctxt->tidmapcnt);
+ }
+ ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+ if (ret)
+ goto done;
+
+ set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+ return ret;
+}
+
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+ struct hfi1_base_info binfo;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ ssize_t sz;
+ unsigned offset;
+ int ret = 0;
+
+ trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+ memset(&binfo, 0, sizeof(binfo));
+ binfo.hw_version = dd->revision;
+ binfo.sw_version = HFI1_KERN_SWVERSION;
+ binfo.bthqp = kdeth_qp;
+ binfo.jkey = uctxt->jkey;
+ /*
+ * If more than 64 contexts are enabled the allocated credit
+ * return will span two or three contiguous pages. Since we only
+ * map the page containing the context's credit return address,
+ * we need to calculate the offset in the proper page.
+ */
+ offset = ((u64)uctxt->sc->hw_free -
+ (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+ binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+ subctxt_fp(fp), offset);
+ binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+ subctxt_fp(fp),
+ uctxt->sc->base_addr);
+ binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+ uctxt->ctxt,
+ subctxt_fp(fp),
+ uctxt->sc->base_addr);
+ binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+ subctxt_fp(fp),
+ uctxt->rcvhdrq);
+ binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+ subctxt_fp(fp),
+ uctxt->egrbufs.rcvtids[0].phys);
+ binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ /*
+ * user regs are at
+ * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+ */
+ binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ offset = ((((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) *
+ sizeof(*dd->events)) & ~PAGE_MASK;
+ binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+ subctxt_fp(fp),
+ offset);
+ binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+ subctxt_fp(fp),
+ dd->status);
+ if (HFI1_CAP_IS_USET(DMA_RTAIL))
+ binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ if (uctxt->subctxt_cnt) {
+ binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+ uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+ uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+ uctxt->ctxt,
+ subctxt_fp(fp), 0);
+ }
+ sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+ if (copy_to_user(ubase, &binfo, sz))
+ ret = -EFAULT;
+ return ret;
+}
+
+static unsigned int poll_urgent(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &uctxt->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (uctxt->urgent != uctxt->urgent_poll) {
+ pollflag = POLLIN | POLLRDNORM;
+ uctxt->urgent_poll = uctxt->urgent;
+ } else {
+ pollflag = 0;
+ set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+ }
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+static unsigned int poll_next(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &uctxt->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (hdrqempty(uctxt)) {
+ set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+ pollflag = 0;
+ } else
+ pollflag = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+ struct hfi1_ctxtdata *uctxt;
+ struct hfi1_devdata *dd = ppd->dd;
+ unsigned ctxt;
+ int ret = 0;
+ unsigned long flags;
+
+ if (!dd->events) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+ ctxt++) {
+ uctxt = dd->rcd[ctxt];
+ if (uctxt) {
+ unsigned long *evs = dd->events +
+ (uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS;
+ int i;
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ set_bit(evtbit, evs);
+ for (i = 1; i < uctxt->subctxt_cnt; i++)
+ set_bit(evtbit, evs + i);
+ }
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+done:
+ return ret;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions. start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+ int start_stop)
+{
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned int rcvctrl_op;
+
+ if (subctxt)
+ goto bail;
+ /* atomically clear receive enable ctxt. */
+ if (start_stop) {
+ /*
+ * On enable, force in-memory copy of the tail register to
+ * 0, so that protocol code doesn't have to worry about
+ * whether or not the chip has yet updated the in-memory
+ * copy or not on return from the system call. The chip
+ * always resets it's tail register back to 0 on a
+ * transition from disabled to enabled.
+ */
+ if (uctxt->rcvhdrtail_kvaddr)
+ clear_rcvhdrtail(uctxt);
+ rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+ } else
+ rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+ hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+ /* always; new head should be equal to new tail; see above */
+bail:
+ return 0;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * User process then performs actions appropriate to bit having been
+ * set, if desired, and checks again in future.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+ unsigned long events)
+{
+ int i;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned long *evs;
+
+ if (!dd->events)
+ return 0;
+
+ evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+ for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
+ if (!test_bit(i, &events))
+ continue;
+ clear_bit(i, evs);
+ }
+ return 0;
+}
+
+#define num_user_pages(vaddr, len) \
+ (1 + (((((unsigned long)(vaddr) + \
+ (unsigned long)(len) - 1) & PAGE_MASK) - \
+ ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
+
+/**
+ * tzcnt - count the number of trailing zeros in a 64bit value
+ * @value: the value to be examined
+ *
+ * Returns the number of trailing least significant zeros in the
+ * the input value. If the value is zero, return the number of
+ * bits of the value.
+ */
+static inline u8 tzcnt(u64 value)
+{
+ return value ? __builtin_ctzl(value) : sizeof(value) * 8;
+}
+
+static inline unsigned num_free_groups(unsigned long map, u16 *start)
+{
+ unsigned free;
+ u16 bitidx = *start;
+
+ if (bitidx >= BITS_PER_LONG)
+ return 0;
+ /* "Turn off" any bits set before our bit index */
+ map &= ~((1ULL << bitidx) - 1);
+ free = tzcnt(map) - bitidx;
+ while (!free && bitidx < BITS_PER_LONG) {
+ /* Zero out the last set bit so we look at the rest */
+ map &= ~(1ULL << bitidx);
+ /*
+ * Account for the previously checked bits and advance
+ * the bit index. We don't have to check for bitidx
+ * getting bigger than BITS_PER_LONG here as it would
+ * mean extra instructions that we don't need. If it
+ * did happen, it would push free to a negative value
+ * which will break the loop.
+ */
+ free = tzcnt(map) - ++bitidx;
+ }
+ *start = bitidx;
+ return free;
+}
+
+static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+ int ret = 0;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned tid, mapped = 0, npages, ngroups, exp_groups,
+ tidpairs = uctxt->expected_count / 2;
+ struct page **pages;
+ unsigned long vaddr, tidmap[uctxt->tidmapcnt];
+ dma_addr_t *phys;
+ u32 tidlist[tidpairs], pairidx = 0, tidcursor;
+ u16 useidx, idx, bitidx, tidcnt = 0;
+
+ vaddr = tinfo->vaddr;
+
+ if (vaddr & ~PAGE_MASK) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ npages = num_user_pages(vaddr, tinfo->length);
+ if (!npages) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+ npages * PAGE_SIZE)) {
+ dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+ (void *)vaddr, npages);
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
+ memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
+
+ exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
+ /* which group set do we look at first? */
+ tidcursor = atomic_read(&uctxt->tidcursor);
+ useidx = (tidcursor >> 16) & 0xffff;
+ bitidx = tidcursor & 0xffff;
+
+ /*
+ * Keep going until we've mapped all pages or we've exhausted all
+ * RcvArray entries.
+ * This iterates over the number of tidmaps + 1
+ * (idx <= uctxt->tidmapcnt) so we check the bitmap which we
+ * started from one more time for any free bits before the
+ * starting point bit.
+ */
+ for (mapped = 0, idx = 0;
+ mapped < npages && idx <= uctxt->tidmapcnt;) {
+ u64 i, offset = 0;
+ unsigned free, pinned, pmapped = 0, bits_used;
+ u16 grp;
+
+ /*
+ * "Reserve" the needed group bits under lock so other
+ * processes can't step in the middle of it. Once
+ * reserved, we don't need the lock anymore since we
+ * are guaranteed the groups.
+ */
+ spin_lock(&uctxt->exp_lock);
+ if (uctxt->tidusemap[useidx] == -1ULL ||
+ bitidx >= BITS_PER_LONG) {
+ /* no free groups in the set, use the next */
+ useidx = (useidx + 1) % uctxt->tidmapcnt;
+ idx++;
+ bitidx = 0;
+ spin_unlock(&uctxt->exp_lock);
+ continue;
+ }
+ ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
+ !!((npages - mapped) % dd->rcv_entries.group_size);
+
+ /*
+ * If we've gotten here, the current set of groups does have
+ * one or more free groups.
+ */
+ free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
+ if (!free) {
+ /*
+ * Despite the check above, free could still come back
+ * as 0 because we don't check the entire bitmap but
+ * we start from bitidx.
+ */
+ spin_unlock(&uctxt->exp_lock);
+ continue;
+ }
+ bits_used = min(free, ngroups);
+ tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
+ uctxt->tidusemap[useidx] |= tidmap[useidx];
+ spin_unlock(&uctxt->exp_lock);
+
+ /*
+ * At this point, we know where in the map we have free bits.
+ * properly offset into the various "shadow" arrays and compute
+ * the RcvArray entry index.
+ */
+ offset = ((useidx * BITS_PER_LONG) + bitidx) *
+ dd->rcv_entries.group_size;
+ pages = uctxt->tid_pg_list + offset;
+ phys = uctxt->physshadow + offset;
+ tid = uctxt->expected_base + offset;
+
+ /* Calculate how many pages we can pin based on free bits */
+ pinned = min((bits_used * dd->rcv_entries.group_size),
+ (npages - mapped));
+ /*
+ * Now that we know how many free RcvArray entries we have,
+ * we can pin that many user pages.
+ */
+ ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE),
+ pinned, pages);
+ if (ret) {
+ /*
+ * We can't continue because the pages array won't be
+ * initialized. This should never happen,
+ * unless perhaps the user has mpin'ed the pages
+ * themselves.
+ */
+ dd_dev_info(dd,
+ "Failed to lock addr %p, %u pages: errno %d\n",
+ (void *) vaddr, pinned, -ret);
+ /*
+ * Let go of the bits that we reserved since we are not
+ * going to use them.
+ */
+ spin_lock(&uctxt->exp_lock);
+ uctxt->tidusemap[useidx] &=
+ ~(((1ULL << bits_used) - 1) << bitidx);
+ spin_unlock(&uctxt->exp_lock);
+ goto done;
+ }
+ /*
+ * How many groups do we need based on how many pages we have
+ * pinned?
+ */
+ ngroups = (pinned / dd->rcv_entries.group_size) +
+ !!(pinned % dd->rcv_entries.group_size);
+ /*
+ * Keep programming RcvArray entries for all the <ngroups> free
+ * groups.
+ */
+ for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
+ unsigned j;
+ u32 pair_size = 0, tidsize;
+ /*
+ * This inner loop will program an entire group or the
+ * array of pinned pages (which ever limit is hit
+ * first).
+ */
+ for (j = 0; j < dd->rcv_entries.group_size &&
+ pmapped < pinned; j++, pmapped++, tid++) {
+ tidsize = PAGE_SIZE;
+ phys[pmapped] = hfi1_map_page(dd->pcidev,
+ pages[pmapped], 0,
+ tidsize, PCI_DMA_FROMDEVICE);
+ trace_hfi1_exp_rcv_set(uctxt->ctxt,
+ subctxt_fp(fp),
+ tid, vaddr,
+ phys[pmapped],
+ pages[pmapped]);
+ /*
+ * Each RcvArray entry is programmed with one
+ * page * worth of memory. This will handle
+ * the 8K MTU as well as anything smaller
+ * due to the fact that both entries in the
+ * RcvTidPair are programmed with a page.
+ * PSM currently does not handle anything
+ * bigger than 8K MTU, so should we even worry
+ * about 10K here?
+ */
+ hfi1_put_tid(dd, tid, PT_EXPECTED,
+ phys[pmapped],
+ ilog2(tidsize >> PAGE_SHIFT) + 1);
+ pair_size += tidsize >> PAGE_SHIFT;
+ EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
+ if (!(tid % 2)) {
+ tidlist[pairidx] |=
+ EXP_TID_SET(IDX,
+ (tid - uctxt->expected_base)
+ / 2);
+ tidlist[pairidx] |=
+ EXP_TID_SET(CTRL, 1);
+ tidcnt++;
+ } else {
+ tidlist[pairidx] |=
+ EXP_TID_SET(CTRL, 2);
+ pair_size = 0;
+ pairidx++;
+ }
+ }
+ /*
+ * We've programmed the entire group (or as much of the
+ * group as we'll use. Now, it's time to push it out...
+ */
+ flush_wc();
+ }
+ mapped += pinned;
+ atomic_set(&uctxt->tidcursor,
+ (((useidx & 0xffffff) << 16) |
+ ((bitidx + bits_used) & 0xffffff)));
+ }
+ trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap,
+ uctxt->tidmapcnt);
+
+done:
+ /* If we've mapped anything, copy relevant info to user */
+ if (mapped) {
+ if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+ tidlist, sizeof(tidlist[0]) * tidcnt)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ /* copy TID info to user */
+ if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
+ tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
+ ret = -EFAULT;
+ }
+bail:
+ /*
+ * Calculate mapped length. New Exp TID protocol does not "unwind" and
+ * report an error if it can't map the entire buffer. It just reports
+ * the length that was mapped.
+ */
+ tinfo->length = mapped * PAGE_SIZE;
+ tinfo->tidcnt = tidcnt;
+ return ret;
+}
+
+static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned long tidmap[uctxt->tidmapcnt];
+ struct page **pages;
+ dma_addr_t *phys;
+ u16 idx, bitidx, tid;
+ int ret = 0;
+
+ if (copy_from_user(&tidmap, (void __user *)(unsigned long)
+ tinfo->tidmap,
+ sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
+ unsigned long map;
+
+ bitidx = 0;
+ if (!tidmap[idx])
+ continue;
+ map = tidmap[idx];
+ while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
+ int i, pcount = 0;
+ struct page *pshadow[dd->rcv_entries.group_size];
+ unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
+ dd->rcv_entries.group_size;
+
+ pages = uctxt->tid_pg_list + offset;
+ phys = uctxt->physshadow + offset;
+ tid = uctxt->expected_base + offset;
+ for (i = 0; i < dd->rcv_entries.group_size;
+ i++, tid++) {
+ if (pages[i]) {
+ hfi1_put_tid(dd, tid, PT_INVALID,
+ 0, 0);
+ trace_hfi1_exp_rcv_free(uctxt->ctxt,
+ subctxt_fp(fp),
+ tid, phys[i],
+ pages[i]);
+ pci_unmap_page(dd->pcidev, phys[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ pshadow[pcount] = pages[i];
+ pages[i] = NULL;
+ pcount++;
+ phys[i] = 0;
+ }
+ }
+ flush_wc();
+ hfi1_release_user_pages(pshadow, pcount);
+ clear_bit(bitidx, &uctxt->tidusemap[idx]);
+ map &= ~(1ULL<<bitidx);
+ }
+ }
+ trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 1, uctxt->tidusemap,
+ uctxt->tidmapcnt);
+done:
+ return ret;
+}
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
+{
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned tid;
+
+ dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
+ uctxt->ctxt);
+ for (tid = 0; tid < uctxt->expected_count; tid++) {
+ struct page *p = uctxt->tid_pg_list[tid];
+ dma_addr_t phys;
+
+ if (!p)
+ continue;
+
+ phys = uctxt->physshadow[tid];
+ uctxt->physshadow[tid] = 0;
+ uctxt->tid_pg_list[tid] = NULL;
+ pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ hfi1_release_user_pages(&p, 1);
+ }
+}
+
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+ u16 pkey)
+{
+ int ret = -ENOENT, i, intable = 0;
+ struct hfi1_pportdata *ppd = uctxt->ppd;
+ struct hfi1_devdata *dd = uctxt->dd;
+
+ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
+ if (pkey == ppd->pkeys[i]) {
+ intable = 1;
+ break;
+ }
+
+ if (intable)
+ ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+done:
+ return ret;
+}
+
+static int ui_open(struct inode *inode, struct file *filp)
+{
+ struct hfi1_devdata *dd;
+
+ dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
+ filp->private_data = dd; /* for other methods */
+ return 0;
+}
+
+static int ui_release(struct inode *inode, struct file *filp)
+{
+ /* nothing to do */
+ return 0;
+}
+
+static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
+{
+ struct hfi1_devdata *dd = filp->private_data;
+
+ switch (whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_CUR:
+ offset += filp->f_pos;
+ break;
+ case SEEK_END:
+ offset = ((dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) -
+ offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (offset < 0)
+ return -EINVAL;
+
+ if (offset >= (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE)
+ return -EINVAL;
+
+ filp->f_pos = offset;
+
+ return filp->f_pos;
+}
+
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
+ loff_t *f_pos)
+{
+ struct hfi1_devdata *dd = filp->private_data;
+ void __iomem *base = dd->kregbase;
+ unsigned long total, csr_off,
+ barlen = (dd->kregend - dd->kregbase);
+ u64 data;
+
+ /* only read 8 byte quantities */
+ if ((count % 8) != 0)
+ return -EINVAL;
+ /* offset must be 8-byte aligned */
+ if ((*f_pos % 8) != 0)
+ return -EINVAL;
+ /* destination buffer must be 8-byte aligned */
+ if ((unsigned long)buf % 8 != 0)
+ return -EINVAL;
+ /* must be in range */
+ if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
+ return -EINVAL;
+ /* only set the base if we are not starting past the BAR */
+ if (*f_pos < barlen)
+ base += *f_pos;
+ csr_off = *f_pos;
+ for (total = 0; total < count; total += 8, csr_off += 8) {
+ /* accessing LCB CSRs requires more checks */
+ if (is_lcb_offset(csr_off)) {
+ if (read_lcb_csr(dd, csr_off, (u64 *)&data))
+ break; /* failed */
+ }
+ /*
+ * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
+ * false parity error. Avoid the whole issue by not reading
+ * them. These registers are defined as having a read value
+ * of 0.
+ */
+ else if (csr_off == ASIC_GPIO_CLEAR
+ || csr_off == ASIC_GPIO_FORCE
+ || csr_off == ASIC_QSFP1_CLEAR
+ || csr_off == ASIC_QSFP1_FORCE
+ || csr_off == ASIC_QSFP2_CLEAR
+ || csr_off == ASIC_QSFP2_FORCE)
+ data = 0;
+ else if (csr_off >= barlen) {
+ /*
+ * read_8051_data can read more than just 8 bytes at
+ * a time. However, folding this into the loop and
+ * handling the reads in 8 byte increments allows us
+ * to smoothly transition from chip memory to 8051
+ * memory.
+ */
+ if (read_8051_data(dd,
+ (u32)(csr_off - barlen),
+ sizeof(data), &data))
+ break; /* failed */
+ } else
+ data = readq(base + total);
+ if (put_user(data, (unsigned long __user *)(buf + total)))
+ break;
+ }
+ *f_pos += total;
+ return total;
+}
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ struct hfi1_devdata *dd = filp->private_data;
+ void __iomem *base;
+ unsigned long total, data, csr_off;
+ int in_lcb;
+
+ /* only write 8 byte quantities */
+ if ((count % 8) != 0)
+ return -EINVAL;
+ /* offset must be 8-byte aligned */
+ if ((*f_pos % 8) != 0)
+ return -EINVAL;
+ /* source buffer must be 8-byte aligned */
+ if ((unsigned long)buf % 8 != 0)
+ return -EINVAL;
+ /* must be in range */
+ if (*f_pos + count > dd->kregend - dd->kregbase)
+ return -EINVAL;
+
+ base = (void __iomem *)dd->kregbase + *f_pos;
+ csr_off = *f_pos;
+ in_lcb = 0;
+ for (total = 0; total < count; total += 8, csr_off += 8) {
+ if (get_user(data, (unsigned long __user *)(buf + total)))
+ break;
+ /* accessing LCB CSRs requires a special procedure */
+ if (is_lcb_offset(csr_off)) {
+ if (!in_lcb) {
+ int ret = acquire_lcb_access(dd, 1);
+
+ if (ret)
+ break;
+ in_lcb = 1;
+ }
+ } else {
+ if (in_lcb) {
+ release_lcb_access(dd, 1);
+ in_lcb = 0;
+ }
+ }
+ writeq(data, base + total);
+ }
+ if (in_lcb)
+ release_lcb_access(dd, 1);
+ *f_pos += total;
+ return total;
+}
+
+static const struct file_operations ui_file_ops = {
+ .owner = THIS_MODULE,
+ .llseek = ui_lseek,
+ .read = ui_read,
+ .write = ui_write,
+ .open = ui_open,
+ .release = ui_release,
+};
+#define UI_OFFSET 192 /* device minor offset for UI devices */
+static int create_ui = 1;
+
+static struct cdev wildcard_cdev;
+static struct device *wildcard_device;
+
+static atomic_t user_count = ATOMIC_INIT(0);
+
+static void user_remove(struct hfi1_devdata *dd)
+{
+ if (atomic_dec_return(&user_count) == 0)
+ hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
+
+ hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+ hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
+}
+
+static int user_add(struct hfi1_devdata *dd)
+{
+ char name[10];
+ int ret;
+
+ if (atomic_inc_return(&user_count) == 1) {
+ ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
+ &wildcard_cdev, &wildcard_device,
+ true);
+ if (ret)
+ goto done;
+ }
+
+ snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+ ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
+ &dd->user_cdev, &dd->user_device,
+ true);
+ if (ret)
+ goto done;
+
+ if (create_ui) {
+ snprintf(name, sizeof(name),
+ "%s_ui%d", class_name(), dd->unit);
+ ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
+ &dd->ui_cdev, &dd->ui_device,
+ false);
+ if (ret)
+ goto done;
+ }
+
+ return 0;
+done:
+ user_remove(dd);
+ return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+ int r, ret;
+
+ r = user_add(dd);
+ ret = hfi1_diag_add(dd);
+ if (r && !ret)
+ ret = r;
+ return ret;
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+ user_remove(dd);
+ hfi1_diag_remove(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
new file mode 100644
index 000000000000..5c2f2ed8f224
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/firmware.c
@@ -0,0 +1,1620 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle firmware file name and if it gets loaded by
+ * editing the following. This may be something we do while in development
+ * but not necessarily something a user would ever need to use.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+static uint platform_config_load = 1;
+
+/* Firmware file names get set in hfi1_firmware_init() based on the above */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+ (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+ ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ */
+struct css_header {
+ u32 module_type;
+ u32 header_len;
+ u32 header_version;
+ u32 module_id;
+ u32 module_vendor;
+ u32 date; /* BCD yyyymmdd */
+ u32 size; /* in DWORDs */
+ u32 key_size; /* in DWORDs */
+ u32 modulus_size; /* in DWORDs */
+ u32 exponent_size; /* in DWORDs */
+ u32 reserved[22];
+};
+/* expected field values */
+#define CSS_MODULE_TYPE 0x00000006
+#define CSS_HEADER_LEN 0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR 0x00008086
+
+#define KEY_SIZE 256
+#define MU_SIZE 8
+#define EXPONENT_SIZE 4
+
+/* the file itself */
+struct firmware_file {
+ struct css_header css_header;
+ u8 modulus[KEY_SIZE];
+ u8 exponent[EXPONENT_SIZE];
+ u8 signature[KEY_SIZE];
+ u8 firmware[];
+};
+
+struct augmented_firmware_file {
+ struct css_header css_header;
+ u8 modulus[KEY_SIZE];
+ u8 exponent[EXPONENT_SIZE];
+ u8 signature[KEY_SIZE];
+ u8 r2[KEY_SIZE];
+ u8 mu[MU_SIZE];
+ u8 firmware[];
+};
+
+/* augmented file size difference */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+ sizeof(struct firmware_file))
+
+struct firmware_details {
+ /* Linux core piece */
+ const struct firmware *fw;
+
+ struct css_header *css_header;
+ u8 *firmware_ptr; /* pointer to binary data */
+ u32 firmware_len; /* length in bytes */
+ u8 *modulus; /* pointer to the modulus */
+ u8 *exponent; /* pointer to the exponent */
+ u8 *signature; /* pointer to the signature */
+ u8 *r2; /* pointer to r2 */
+ u8 *mu; /* pointer to mu */
+ struct augmented_firmware_file dummy_header;
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+ FW_EMPTY,
+ FW_ACQUIRED,
+ FW_ERR
+};
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS 0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT 0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE 0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE 0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 4000 /* 4 s */
+
+/* 8051 memory access timeout, in us */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+ { 0x01, 0x02, 0x03, 0x04 },
+ { 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+ { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+ 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+ { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+ 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/* SBus PCIe PCS addresses, one set per HFI */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+ { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+ 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+ { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+ 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/* SBus PCIe SerDes broadcast addresses, one per HFI */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+
+/*
+ * Read a single 64-bit value from 8051 data memory.
+ *
+ * Expects:
+ * o caller to have already set up data read, no auto increment
+ * o caller to turn off read enable when finished
+ *
+ * The address argument is a byte offset. Bits 0:2 in the address are
+ * ignored - i.e. the hardware will always do aligned 8-byte reads as if
+ * the lower bits are zero.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+ u64 reg;
+ int count;
+
+ /* start the read at the given address */
+ reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+ << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+ | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+ /* wait until ACCESS_COMPLETED is set */
+ count = 0;
+ while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+ & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+ == 0) {
+ count++;
+ if (count > DC8051_ACCESS_TIMEOUT) {
+ dd_dev_err(dd, "timeout reading 8051 data\n");
+ return -ENXIO;
+ }
+ ndelay(10);
+ }
+
+ /* gather the data */
+ *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+ return 0;
+}
+
+/*
+ * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks.
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+ unsigned long flags;
+ u32 done;
+ int ret = 0;
+
+ spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+ /* data read set-up, no auto-increment */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+ for (done = 0; done < len; addr += 8, done += 8, result++) {
+ ret = __read_8051_data(dd, addr, result);
+ if (ret)
+ break;
+ }
+
+ /* turn off read enable */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+ spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+ return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+ const u8 *data, u32 len)
+{
+ u64 reg;
+ u32 offset;
+ int aligned, count;
+
+ /* check alignment */
+ aligned = ((unsigned long)data & 0x7) == 0;
+
+ /* write set-up */
+ reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+ | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+ reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+ << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+ | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+ /* write */
+ for (offset = 0; offset < len; offset += 8) {
+ int bytes = len - offset;
+
+ if (bytes < 8) {
+ reg = 0;
+ memcpy(&reg, &data[offset], bytes);
+ } else if (aligned) {
+ reg = *(u64 *)&data[offset];
+ } else {
+ memcpy(&reg, &data[offset], 8);
+ }
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+ /* wait until ACCESS_COMPLETED is set */
+ count = 0;
+ while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+ & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+ == 0) {
+ count++;
+ if (count > DC8051_ACCESS_TIMEOUT) {
+ dd_dev_err(dd, "timeout writing 8051 data\n");
+ return -ENXIO;
+ }
+ udelay(1);
+ }
+ }
+
+ /* turn off write access, auto increment (also sets to data access) */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+ return 0;
+}
+
+/* return 0 if values match, non-zero and complain otherwise */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+ u32 actual, u32 expected)
+{
+ if (actual == expected)
+ return 0;
+
+ dd_dev_err(dd,
+ "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+ what, expected, actual);
+ return 1;
+}
+
+/*
+ * Verify that the static fields in the CSS header match.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+ /* verify CSS header fields (most sizes are in DW, so add /4) */
+ if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE)
+ || invalid_header(dd, "header_len", css->header_len,
+ (sizeof(struct firmware_file)/4))
+ || invalid_header(dd, "header_version",
+ css->header_version, CSS_HEADER_VERSION)
+ || invalid_header(dd, "module_vendor",
+ css->module_vendor, CSS_MODULE_VENDOR)
+ || invalid_header(dd, "key_size",
+ css->key_size, KEY_SIZE/4)
+ || invalid_header(dd, "modulus_size",
+ css->modulus_size, KEY_SIZE/4)
+ || invalid_header(dd, "exponent_size",
+ css->exponent_size, EXPONENT_SIZE/4)) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Make sure there are at least some bytes after the prefix.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+ long file_size, long prefix_size)
+{
+ /* make sure we have some payload */
+ if (prefix_size >= file_size) {
+ dd_dev_err(dd,
+ "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+ name, file_size, prefix_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Request the firmware from the system. Extract the pieces and fill in
+ * fdet. If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+ struct firmware_details *fdet)
+{
+ struct css_header *css;
+ int ret;
+
+ memset(fdet, 0, sizeof(*fdet));
+
+ ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+ if (ret) {
+ dd_dev_err(dd, "cannot load firmware \"%s\", err %d\n",
+ name, ret);
+ return ret;
+ }
+
+ /* verify the firmware */
+ if (fdet->fw->size < sizeof(struct css_header)) {
+ dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+ ret = -EINVAL;
+ goto done;
+ }
+ css = (struct css_header *)fdet->fw->data;
+
+ hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+ hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+ hfi1_cdbg(FIRMWARE, "CSS structure:");
+ hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type);
+ hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)",
+ css->header_len, 4 * css->header_len);
+ hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version);
+ hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id);
+ hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor);
+ hfi1_cdbg(FIRMWARE, " date 0x%x", css->date);
+ hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)",
+ css->size, 4 * css->size);
+ hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)",
+ css->key_size, 4 * css->key_size);
+ hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)",
+ css->modulus_size, 4 * css->modulus_size);
+ hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)",
+ css->exponent_size, 4 * css->exponent_size);
+ hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+ fdet->fw->size - sizeof(struct firmware_file));
+
+ /*
+ * If the file does not have a valid CSS header, fail.
+ * Otherwise, check the CSS size field for an expected size.
+ * The augmented file has r2 and mu inserted after the header
+ * was generated, so there will be a known difference between
+ * the CSS header size and the actual file size. Use this
+ * difference to identify an augmented file.
+ *
+ * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+ */
+ ret = verify_css_header(dd, css);
+ if (ret) {
+ dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+ } else if ((css->size*4) == fdet->fw->size) {
+ /* non-augmented firmware file */
+ struct firmware_file *ff = (struct firmware_file *)
+ fdet->fw->data;
+
+ /* make sure there are bytes in the payload */
+ ret = payload_check(dd, name, fdet->fw->size,
+ sizeof(struct firmware_file));
+ if (ret == 0) {
+ fdet->css_header = css;
+ fdet->modulus = ff->modulus;
+ fdet->exponent = ff->exponent;
+ fdet->signature = ff->signature;
+ fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+ fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+ fdet->firmware_ptr = ff->firmware;
+ fdet->firmware_len = fdet->fw->size -
+ sizeof(struct firmware_file);
+ /*
+ * Header does not include r2 and mu - generate here.
+ * For now, fail.
+ */
+ dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+ ret = -EINVAL;
+ }
+ } else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) {
+ /* augmented firmware file */
+ struct augmented_firmware_file *aff =
+ (struct augmented_firmware_file *)fdet->fw->data;
+
+ /* make sure there are bytes in the payload */
+ ret = payload_check(dd, name, fdet->fw->size,
+ sizeof(struct augmented_firmware_file));
+ if (ret == 0) {
+ fdet->css_header = css;
+ fdet->modulus = aff->modulus;
+ fdet->exponent = aff->exponent;
+ fdet->signature = aff->signature;
+ fdet->r2 = aff->r2;
+ fdet->mu = aff->mu;
+ fdet->firmware_ptr = aff->firmware;
+ fdet->firmware_len = fdet->fw->size -
+ sizeof(struct augmented_firmware_file);
+ }
+ } else {
+ /* css->size check failed */
+ dd_dev_err(dd,
+ "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+ fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4,
+ css->size);
+
+ ret = -EINVAL;
+ }
+
+done:
+ /* if returning an error, clean up after ourselves */
+ if (ret)
+ dispose_one_firmware(fdet);
+ return ret;
+}
+
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+ release_firmware(fdet->fw);
+ fdet->fw = NULL;
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load. Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+ int err = 0;
+
+ mutex_lock(&fw_mutex);
+ if (fw_state == FW_ACQUIRED) {
+ goto done; /* already acquired */
+ } else if (fw_state == FW_ERR) {
+ err = fw_err;
+ goto done; /* already tried and failed */
+ }
+
+ if (fw_8051_load) {
+ err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+ if (err)
+ goto done;
+ }
+
+ if (fw_fabric_serdes_load) {
+ err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+ &fw_fabric);
+ if (err)
+ goto done;
+ }
+
+ if (fw_sbus_load) {
+ err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+ if (err)
+ goto done;
+ }
+
+ if (fw_pcie_serdes_load) {
+ err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+ if (err)
+ goto done;
+ }
+
+ if (platform_config_load) {
+ platform_config = NULL;
+ err = request_firmware(&platform_config, platform_config_name,
+ &dd->pcidev->dev);
+ if (err) {
+ err = 0;
+ platform_config = NULL;
+ }
+ }
+
+ /* success */
+ fw_state = FW_ACQUIRED;
+
+done:
+ if (err) {
+ fw_err = err;
+ fw_state = FW_ERR;
+ }
+ mutex_unlock(&fw_mutex);
+
+ return err;
+}
+
+/*
+ * Called when the driver unloads. The timing is asymmetric with its
+ * counterpart, obtain_firmware(). If called at device remove time,
+ * then it is conceivable that another device could probe while the
+ * firmware is being disposed. The mutexes can be moved to do that
+ * safely, but then the firmware would be requested from the OS multiple
+ * times.
+ *
+ * No mutex is needed as the driver is unloading and there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+ dispose_one_firmware(&fw_8051);
+ dispose_one_firmware(&fw_fabric);
+ dispose_one_firmware(&fw_pcie);
+ dispose_one_firmware(&fw_sbus);
+
+ release_firmware(platform_config);
+ platform_config = NULL;
+
+ /* retain the error state, otherwise revert to empty */
+ if (fw_state != FW_ERR)
+ fw_state = FW_EMPTY;
+}
+
+/*
+ * Write a block of data to a given array CSR. All calls will be in
+ * multiples of 8 bytes.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+ const u8 *data, int nbytes)
+{
+ int qw_size = nbytes/8;
+ int i;
+
+ if (((unsigned long)data & 0x7) == 0) {
+ /* aligned */
+ u64 *ptr = (u64 *)data;
+
+ for (i = 0; i < qw_size; i++, ptr++)
+ write_csr(dd, what + (8*i), *ptr);
+ } else {
+ /* not aligned */
+ for (i = 0; i < qw_size; i++, data += 8) {
+ u64 value;
+
+ memcpy(&value, data, 8);
+ write_csr(dd, what + (8*i), value);
+ }
+ }
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes. All calls will
+ * be in multiples of 8 bytes.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+ const u8 *data, int nbytes)
+{
+ u64 *ptr = (u64 *)data;
+ int qw_size = nbytes/8;
+
+ for (; qw_size > 0; qw_size--, ptr++)
+ write_csr(dd, what, *ptr);
+}
+
+/*
+ * Download the signature and start the RSA mechanism. Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+ const u8 *signature)
+{
+ unsigned long timeout;
+ u64 reg;
+ u32 status;
+ int ret = 0;
+
+ /* write the signature */
+ write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+ /* initialize RSA */
+ write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+ /*
+ * Make sure the engine is idle and insert a delay between the two
+ * writes to MISC_CFG_RSA_CMD.
+ */
+ status = (read_csr(dd, MISC_CFG_FW_CTRL)
+ & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+ >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+ if (status != RSA_STATUS_IDLE) {
+ dd_dev_err(dd, "%s security engine not idle - giving up\n",
+ who);
+ return -EBUSY;
+ }
+
+ /* start RSA */
+ write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+ /*
+ * Look for the result.
+ *
+ * The RSA engine is hooked up to two MISC errors. The driver
+ * masks these errors as they do not respond to the standard
+ * error "clear down" mechanism. Look for these errors here and
+ * clear them when possible. This routine will exit with the
+ * errors of the current run still set.
+ *
+ * MISC_FW_AUTH_FAILED_ERR
+ * Firmware authorization failed. This can be cleared by
+ * re-initializing the RSA engine, then clearing the status bit.
+ * Do not re-init the RSA angine immediately after a successful
+ * run - this will reset the current authorization.
+ *
+ * MISC_KEY_MISMATCH_ERR
+ * Key does not match. The only way to clear this is to load
+ * a matching key then clear the status bit. If this error
+ * is raised, it will persist outside of this routine until a
+ * matching key is loaded.
+ */
+ timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+ while (1) {
+ status = (read_csr(dd, MISC_CFG_FW_CTRL)
+ & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+ >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+ if (status == RSA_STATUS_IDLE) {
+ /* should not happen */
+ dd_dev_err(dd, "%s firmware security bad idle state\n",
+ who);
+ ret = -EINVAL;
+ break;
+ } else if (status == RSA_STATUS_DONE) {
+ /* finished successfully */
+ break;
+ } else if (status == RSA_STATUS_FAILED) {
+ /* finished unsuccessfully */
+ ret = -EINVAL;
+ break;
+ }
+ /* else still active */
+
+ if (time_after(jiffies, timeout)) {
+ /*
+ * Timed out while active. We can't reset the engine
+ * if it is stuck active, but run through the
+ * error code to see what error bits are set.
+ */
+ dd_dev_err(dd, "%s firmware security time out\n", who);
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ msleep(20);
+ }
+
+ /*
+ * Arrive here on success or failure. Clear all RSA engine
+ * errors. All current errors will stick - the RSA logic is keeping
+ * error high. All previous errors will clear - the RSA logic
+ * is not keeping the error high.
+ */
+ write_csr(dd, MISC_ERR_CLEAR,
+ MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK
+ | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+ /*
+ * All that is left are the current errors. Print failure details,
+ * if any.
+ */
+ reg = read_csr(dd, MISC_ERR_STATUS);
+ if (ret) {
+ if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+ dd_dev_err(dd, "%s firmware authorization failed\n",
+ who);
+ if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+ dd_dev_err(dd, "%s firmware key mismatch\n", who);
+ }
+
+ return ret;
+}
+
+static void load_security_variables(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ /* Security variables a. Write the modulus */
+ write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+ /* Security variables b. Write the r2 */
+ write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+ /* Security variables c. Write the mu */
+ write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+ /* Security variables d. Write the header */
+ write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
+ (u8 *)fdet->css_header, sizeof(struct css_header));
+}
+
+/* return the 8051 firmware state */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+ u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+ return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
+ & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+}
+
+/*
+ * Wait until the firmware is up and ready to take host requests.
+ * Return 0 on success, -ETIMEDOUT on timeout.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+ unsigned long timeout;
+
+ /* in the simulator, the fake 8051 is always ready */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ return 0;
+
+ timeout = msecs_to_jiffies(mstimeout) + jiffies;
+ while (1) {
+ if (get_firmware_state(dd) == 0xa0) /* ready */
+ return 0;
+ if (time_after(jiffies, timeout)) /* timed out */
+ return -ETIMEDOUT;
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+}
+
+/*
+ * Load the 8051 firmware.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ u64 reg;
+ int ret;
+ u8 ver_a, ver_b;
+
+ /*
+ * DC Reset sequence
+ * Load DC 8051 firmware
+ */
+ /*
+ * DC reset step 1: Reset DC8051
+ */
+ reg = DC_DC8051_CFG_RST_M8051W_SMASK
+ | DC_DC8051_CFG_RST_CRAM_SMASK
+ | DC_DC8051_CFG_RST_DRAM_SMASK
+ | DC_DC8051_CFG_RST_IRAM_SMASK
+ | DC_DC8051_CFG_RST_SFR_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+ /*
+ * DC reset step 2 (optional): Load 8051 data memory with link
+ * configuration
+ */
+
+ /*
+ * DC reset step 3: Load DC8051 firmware
+ */
+ /* release all but the core reset */
+ reg = DC_DC8051_CFG_RST_M8051W_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+ /* Firmware load step 1 */
+ load_security_variables(dd, fdet);
+
+ /*
+ * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
+ */
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+ /* Firmware load steps 3-5 */
+ ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+ fdet->firmware_len);
+ if (ret)
+ return ret;
+
+ /*
+ * DC reset step 4. Host starts the DC8051 firmware
+ */
+ /*
+ * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED
+ */
+ write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+ /* Firmware load steps 7-10 */
+ ret = run_rsa(dd, "8051", fdet->signature);
+ if (ret)
+ return ret;
+
+ /* clear all reset bits, releasing the 8051 */
+ write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+ /*
+ * DC reset step 5. Wait for firmware to be ready to accept host
+ * requests.
+ */
+ ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+ if (ret) { /* timed out */
+ dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+ get_firmware_state(dd));
+ return -ETIMEDOUT;
+ }
+
+ read_misc_status(dd, &ver_a, &ver_b);
+ dd_dev_info(dd, "8051 firmware version %d.%d\n",
+ (int)ver_b, (int)ver_a);
+ dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
+
+ return 0;
+}
+
+/* SBus Master broadcast address */
+#define SBUS_MASTER_BROADCAST 0xfd
+
+/*
+ * Write the SBus request register
+ *
+ * No need for masking - the arguments are sized exactly.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+ write_csr(dd, ASIC_CFG_SBUS_REQUEST,
+ ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT)
+ | ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT)
+ | ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT)
+ | ((u64)receiver_addr
+ << ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
+}
+
+/*
+ * Turn off the SBus and fabric serdes spicos.
+ *
+ * + Must be called with Sbus fast mode turned on.
+ * + Must be called after fabric serdes broadcast is set up.
+ * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
+ * when using MISC_CFG_FW_CTRL.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+ /* only needed on A0 */
+ if (!is_a0(dd))
+ return;
+
+ dd_dev_info(dd, "Turning off spicos:%s%s\n",
+ flags & SPICO_SBUS ? " SBus" : "",
+ flags & SPICO_FABRIC ? " fabric" : "");
+
+ write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+ /* disable SBus spico */
+ if (flags & SPICO_SBUS)
+ sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+ WRITE_SBUS_RECEIVER, 0x00000040);
+
+ /* disable the fabric serdes spicos */
+ if (flags & SPICO_FABRIC)
+ sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+ 0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ * Reset all of the fabric serdes for our HFI.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+ u8 ra;
+
+ if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */
+ return;
+
+ ra = fabric_serdes_broadcast[dd->hfi1_id];
+
+ acquire_hw_mutex(dd);
+ set_sbus_fast_mode(dd);
+ /* place SerDes in reset and disable SPICO */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+ /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+ udelay(1);
+ /* remove SerDes reset */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+ /* turn SPICO enable on */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+ clear_sbus_fast_mode(dd);
+ release_hw_mutex(dd);
+}
+
+/* Access to the SBus in this routine should probably be serialized */
+int sbus_request_slow(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+ u64 reg, count = 0;
+
+ sbus_request(dd, receiver_addr, data_addr, command, data_in);
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+ ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+ /* Wait for both DONE and RCV_DATA_VALID to go high */
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+ (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
+ if (count++ >= SBUS_MAX_POLL_COUNT) {
+ u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ /*
+ * If the loop has timed out, we are OK if DONE bit
+ * is set and RCV_DATA_VALID and EXECUTE counters
+ * are the same. If not, we cannot proceed.
+ */
+ if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+ (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
+ SBUS_COUNTER(counts, EXECUTE)))
+ break;
+ return -ETIMEDOUT;
+ }
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ }
+ count = 0;
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+ /* Wait for DONE to clear after EXECUTE is cleared */
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
+ if (count++ >= SBUS_MAX_POLL_COUNT)
+ return -ETIME;
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ }
+ return 0;
+}
+
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i, err;
+ const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
+
+ dd_dev_info(dd, "Downloading fabric firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: place SerDes in reset and disable SPICO */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+ /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+ udelay(1);
+ /* step 3: remove SerDes reset */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+ /* step 4: assert IMEM override */
+ sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+ /* step 5: download SerDes machine code */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 6: IMEM override off */
+ sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+ /* step 7: turn ECC on */
+ sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+ /* steps 8-11: run the RSA engine */
+ err = run_rsa(dd, "fabric serdes", fdet->signature);
+ if (err)
+ return err;
+
+ /* step 12: turn SPICO enable on */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+ /* step 13: enable core hardware interrupts */
+ sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+
+ return 0;
+}
+
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i, err;
+ const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+ dd_dev_info(dd, "Downloading SBus firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: place SPICO into reset and enable off */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+ /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+ /* step 4: set starting IMEM address for burst download */
+ sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+ /* step 5: download the SBus Master machine code */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 6: set IMEM_CNTL_EN off */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+ /* step 7: turn ECC on */
+ sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+ /* steps 8-11: run the RSA engine */
+ err = run_rsa(dd, "SBus", fdet->signature);
+ if (err)
+ return err;
+
+ /* step 12: set SPICO_ENABLE on */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+ return 0;
+}
+
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i;
+ const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+ dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: assert single step (halts the SBus Master spico) */
+ sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+ /* step 3: enable XDMEM access */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+ /* step 4: load firmware into SBus Master XDMEM */
+ /* NOTE: the dmem address, write_en, and wdata are all pre-packed,
+ we only need to pick up the bytes and write them */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 5: disable XDMEM access */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+ /* step 6: allow SBus Spico to run */
+ sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+ /* steps 7-11: run RSA, if it succeeds, firmware is available to
+ be swapped */
+ return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Set the given broadcast values on the given list of devices.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+ const u8 *addrs, int count)
+{
+ while (--count >= 0) {
+ /*
+ * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
+ * defaults for everything else. Do not read-modify-write,
+ * per instruction from the manufacturer.
+ *
+ * Register 0xfd:
+ * bits what
+ * ----- ---------------------------------
+ * 0 IGNORE_BROADCAST (default 0)
+ * 11:4 BROADCAST_GROUP_1 (default 0xff)
+ * 23:16 BROADCAST_GROUP_2 (default 0xff)
+ */
+ sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
+ (u32)bg1 << 4 | (u32)bg2 << 16);
+ }
+}
+
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+ unsigned long timeout;
+ int try = 0;
+ u8 mask = 1 << dd->hfi1_id;
+ u8 user;
+
+retry:
+ timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
+ while (1) {
+ write_csr(dd, ASIC_CFG_MUTEX, mask);
+ user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+ if (user == mask)
+ return 0; /* success */
+ if (time_after(jiffies, timeout))
+ break; /* timed out */
+ msleep(20);
+ }
+
+ /* timed out */
+ dd_dev_err(dd,
+ "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+ (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
+
+ if (try == 0) {
+ /* break mutex and retry */
+ write_csr(dd, ASIC_CFG_MUTEX, 0);
+ try++;
+ goto retry;
+ }
+
+ return -EBUSY;
+}
+
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+ write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
+
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+ ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+ u64 reg, count = 0;
+
+ reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ while (SBUS_COUNTER(reg, EXECUTE) !=
+ SBUS_COUNTER(reg, RCV_DATA_VALID)) {
+ if (count++ >= SBUS_MAX_POLL_COUNT)
+ break;
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ }
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+int load_firmware(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ if (fw_sbus_load || fw_fabric_serdes_load) {
+ ret = acquire_hw_mutex(dd);
+ if (ret)
+ return ret;
+
+ set_sbus_fast_mode(dd);
+
+ /*
+ * The SBus contains part of the fabric firmware and so must
+ * also be downloaded.
+ */
+ if (fw_sbus_load) {
+ turn_off_spicos(dd, SPICO_SBUS);
+ ret = load_sbus_firmware(dd, &fw_sbus);
+ if (ret)
+ goto clear;
+ }
+
+ if (fw_fabric_serdes_load) {
+ set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+ fabric_serdes_broadcast[dd->hfi1_id],
+ fabric_serdes_addrs[dd->hfi1_id],
+ NUM_FABRIC_SERDES);
+ turn_off_spicos(dd, SPICO_FABRIC);
+ ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+ }
+
+clear:
+ clear_sbus_fast_mode(dd);
+ release_hw_mutex(dd);
+ if (ret)
+ return ret;
+ }
+
+ if (fw_8051_load) {
+ ret = load_8051_firmware(dd, &fw_8051);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+ /* only RTL can use these */
+ if (dd->icode != ICODE_RTL_SILICON) {
+ fw_fabric_serdes_load = 0;
+ fw_pcie_serdes_load = 0;
+ fw_sbus_load = 0;
+ }
+
+ /* no 8051 or QSFP on simulator */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ fw_8051_load = 0;
+ platform_config_load = 0;
+ }
+
+ if (!fw_8051_name) {
+ if (dd->icode == ICODE_RTL_SILICON)
+ fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
+ else
+ fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
+ }
+ if (!fw_fabric_serdes_name)
+ fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+ if (!fw_sbus_name)
+ fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+ if (!fw_pcie_serdes_name)
+ fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+ if (!platform_config_name)
+ platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+ return obtain_firmware(dd);
+}
+
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+ u32 *ptr = NULL;
+ u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0;
+ u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+
+ if (platform_config == NULL) {
+ dd_dev_info(dd, "%s: Missing config file\n", __func__);
+ goto bail;
+ }
+ ptr = (u32 *)platform_config->data;
+
+ magic_num = *ptr;
+ ptr++;
+ if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+ dd_dev_info(dd, "%s: Bad config file\n", __func__);
+ goto bail;
+ }
+
+ while (ptr < (u32 *)(platform_config->data + platform_config->size)) {
+ header1 = *ptr;
+ header2 = *(ptr + 1);
+ if (header1 != ~header2) {
+ dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+ __func__, (ptr - (u32 *)platform_config->data));
+ goto bail;
+ }
+
+ record_idx = *ptr &
+ ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+ table_length_dwords = (*ptr >>
+ PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+ ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+ table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+ ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+ /* Done with this set of headers */
+ ptr += 2;
+
+ if (record_idx) {
+ /* data table */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ 1;
+ break;
+ case PLATFORM_CONFIG_PORT_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ 2;
+ break;
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ table_length_dwords;
+ break;
+ default:
+ dd_dev_info(dd,
+ "%s: Unknown data table %d, offset %ld\n",
+ __func__, table_type,
+ (ptr - (u32 *)platform_config->data));
+ goto bail; /* We don't trust this file now */
+ }
+ pcfgcache->config_tables[table_type].table = ptr;
+ } else {
+ /* metadata table */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ break;
+ default:
+ dd_dev_info(dd,
+ "%s: Unknown metadata table %d, offset %ld\n",
+ __func__, table_type,
+ (ptr - (u32 *)platform_config->data));
+ goto bail; /* We don't trust this file now */
+ }
+ pcfgcache->config_tables[table_type].table_metadata =
+ ptr;
+ }
+
+ /* Calculate and check table crc */
+ crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+ (table_length_dwords * 4));
+ crc ^= ~(u32)0;
+
+ /* Jump the table */
+ ptr += table_length_dwords;
+ if (crc != *ptr) {
+ dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+ __func__, (ptr - (u32 *)platform_config->data));
+ goto bail;
+ }
+ /* Jump the CRC DWORD */
+ ptr++;
+ }
+
+ pcfgcache->cache_valid = 1;
+ return 0;
+bail:
+ memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+ return -EINVAL;
+}
+
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+ int field, u32 *field_len_bits, u32 *field_start_bits)
+{
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+ u32 *src_ptr = NULL;
+
+ if (!pcfgcache->cache_valid)
+ return -EINVAL;
+
+ switch (table) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ if (field && field < platform_config_table_limits[table])
+ src_ptr =
+ pcfgcache->config_tables[table].table_metadata + field;
+ break;
+ default:
+ dd_dev_info(dd, "%s: Unknown table\n", __func__);
+ break;
+ }
+
+ if (!src_ptr)
+ return -EINVAL;
+
+ if (field_start_bits)
+ *field_start_bits = *src_ptr &
+ ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+ if (field_len_bits)
+ *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+ & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+ return 0;
+}
+
+/* This is the central interface to getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a look up key into which instance of the tables the
+ * relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+ enum platform_config_table_type_encoding table_type,
+ int table_index, int field_index, u32 *data, u32 len)
+{
+ int ret = 0, wlen = 0, seek = 0;
+ u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+ if (data)
+ memset(data, 0, len);
+ else
+ return -EINVAL;
+
+ ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+ &field_len_bits, &field_start_bits);
+ if (ret)
+ return -EINVAL;
+
+ /* Convert length to bits */
+ len *= 8;
+
+ /* Our metadata function checked cache_valid and field_index for us */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ src_ptr = pcfgcache->config_tables[table_type].table;
+
+ if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+ if (len < field_len_bits)
+ return -EINVAL;
+
+ seek = field_start_bits/8;
+ wlen = field_len_bits/8;
+
+ src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+ /* We expect the field to be byte aligned and whole byte
+ * lengths if we are here */
+ memcpy(data, src_ptr, wlen);
+ return 0;
+ }
+ break;
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* Port table is 4 DWORDS in META_VERSION 0 */
+ src_ptr = dd->hfi1_id ?
+ pcfgcache->config_tables[table_type].table + 4 :
+ pcfgcache->config_tables[table_type].table;
+ break;
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ src_ptr = pcfgcache->config_tables[table_type].table;
+
+ if (table_index <
+ pcfgcache->config_tables[table_type].num_table)
+ src_ptr += table_index;
+ else
+ src_ptr = NULL;
+ break;
+ default:
+ dd_dev_info(dd, "%s: Unknown table\n", __func__);
+ break;
+ }
+
+ if (!src_ptr || len < field_len_bits)
+ return -EINVAL;
+
+ src_ptr += (field_start_bits/32);
+ *data = (*src_ptr >> (field_start_bits % 32)) &
+ ((1 << field_len_bits) - 1);
+
+ return 0;
+}
+
+/*
+ * Download the firmware needed for the Gen3 PCIe SerDes. An update
+ * to the SBus firmware is needed before updating the PCIe firmware.
+ *
+ * Note: caller must be holding the HW mutex.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ /* both firmware loads below use the SBus */
+ set_sbus_fast_mode(dd);
+
+ if (fw_sbus_load) {
+ turn_off_spicos(dd, SPICO_SBUS);
+ ret = load_sbus_firmware(dd, &fw_sbus);
+ if (ret)
+ goto done;
+ }
+
+ if (fw_pcie_serdes_load) {
+ dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+ set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+ pcie_serdes_broadcast[dd->hfi1_id],
+ pcie_serdes_addrs[dd->hfi1_id],
+ NUM_PCIE_SERDES);
+ ret = load_pcie_serdes_firmware(dd, &fw_pcie);
+ if (ret)
+ goto done;
+ }
+
+done:
+ clear_sbus_fast_mode(dd);
+
+ return ret;
+}
+
+/*
+ * Read the GUID from the hardware, store it in dd.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+ dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+ dd_dev_info(dd, "GUID %llx",
+ (unsigned long long)dd->base_guid);
+}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
new file mode 100644
index 000000000000..8ca171bf3e36
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -0,0 +1,1821 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform_config.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+#define DROP_PACKET_OFF 0
+#define DROP_PACKET_ON 1
+
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+ (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+ HFI1_CAP_MISC_MASK)
+
+/*
+ * per driver stats, either not device nor port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name via ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted hfi1_statnames[] in debugfs.c must
+ * change to match.
+ */
+struct hfi1_ib_stats {
+ __u64 sps_ints; /* number of interrupts handled */
+ __u64 sps_errints; /* number of error interrupts */
+ __u64 sps_txerrs; /* tx-related packet errors */
+ __u64 sps_rcverrs; /* non-crc rcv packet errors */
+ __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+ __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+ __u64 sps_ctxts; /* number of contexts currently open */
+ __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+ __u64 sps_buffull;
+ __u64 sps_hdrfull;
+};
+
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+/*
+ * struct ps_state keeps state associated with RX queue "prescanning"
+ * (prescanning for FECNs, and BECNs), if prescanning is in use.
+ */
+struct ps_state {
+ u32 ps_head;
+ int initialized;
+};
+
+struct ctxt_eager_bufs {
+ ssize_t size; /* total size of eager buffers */
+ u32 count; /* size of buffers array */
+ u32 numbufs; /* number of buffers allocated */
+ u32 alloced; /* number of rcvarray entries used */
+ u32 rcvtid_size; /* size of each eager rcv tid */
+ u32 threshold; /* head update threshold */
+ struct eager_buffer {
+ void *addr;
+ dma_addr_t phys;
+ ssize_t len;
+ } *buffers;
+ struct {
+ void *addr;
+ dma_addr_t phys;
+ } *rcvtids;
+};
+
+struct hfi1_ctxtdata {
+ /* shadow the ctxt's RcvCtrl register */
+ u64 rcvctrl;
+ /* rcvhdrq base, needs mmap before useful */
+ void *rcvhdrq;
+ /* kernel virtual address where hdrqtail is updated */
+ volatile __le64 *rcvhdrtail_kvaddr;
+ /*
+ * Shared page for kernel to signal user processes that send buffers
+ * need disarming. The process should call HFI1_CMD_DISARM_BUFS
+ * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+ */
+ unsigned long *user_event_mask;
+ /* when waiting for rcv or pioavail */
+ wait_queue_head_t wait;
+ /* rcvhdrq size (for freeing) */
+ size_t rcvhdrq_size;
+ /* number of rcvhdrq entries */
+ u16 rcvhdrq_cnt;
+ /* size of each of the rcvhdrq entries */
+ u16 rcvhdrqentsize;
+ /* mmap of hdrq, must fit in 44 bits */
+ dma_addr_t rcvhdrq_phys;
+ dma_addr_t rcvhdrqtailaddr_phys;
+ struct ctxt_eager_bufs egrbufs;
+ /* this receive context's assigned PIO ACK send context */
+ struct send_context *sc;
+
+ /* dynamic receive available interrupt timeout */
+ u32 rcvavail_timeout;
+ /*
+ * number of opens (including slave sub-contexts) on this instance
+ * (ignoring forks, dup, etc. for now)
+ */
+ int cnt;
+ /*
+ * how much space to leave at start of eager TID entries for
+ * protocol use, on each TID
+ */
+ /* instead of calculating it */
+ unsigned ctxt;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_cnt;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_id;
+ u8 uuid[16];
+ /* job key */
+ u16 jkey;
+ /* number of RcvArray groups for this context. */
+ u32 rcv_array_groups;
+ /* index of first eager TID entry. */
+ u32 eager_base;
+ /* number of expected TID entries */
+ u32 expected_count;
+ /* index of first expected TID entry. */
+ u32 expected_base;
+ /* cursor into the exp group sets */
+ atomic_t tidcursor;
+ /* number of exp TID groups assigned to the ctxt */
+ u16 numtidgroups;
+ /* size of exp TID group fields in tidusemap */
+ u16 tidmapcnt;
+ /* exp TID group usage bitfield array */
+ unsigned long *tidusemap;
+ /* pinned pages for exp sends, allocated at open */
+ struct page **tid_pg_list;
+ /* dma handles for exp tid pages */
+ dma_addr_t *physshadow;
+ /* lock protecting all Expected TID data */
+ spinlock_t exp_lock;
+ /* number of pio bufs for this ctxt (all procs, if shared) */
+ u32 piocnt;
+ /* first pio buffer for this ctxt */
+ u32 pio_base;
+ /* chip offset of PIO buffers for this ctxt */
+ u32 piobufs;
+ /* per-context configuration flags */
+ u16 flags;
+ /* per-context event flags for fileops/intr communication */
+ unsigned long event_flags;
+ /* WAIT_RCV that timed out, no interrupt */
+ u32 rcvwait_to;
+ /* WAIT_PIO that timed out, no interrupt */
+ u32 piowait_to;
+ /* WAIT_RCV already happened, no wait */
+ u32 rcvnowait;
+ /* WAIT_PIO already happened, no wait */
+ u32 pionowait;
+ /* total number of polled urgent packets */
+ u32 urgent;
+ /* saved total number of polled urgent packets for poll edge trigger */
+ u32 urgent_poll;
+ /* pid of process using this ctxt */
+ pid_t pid;
+ pid_t subpid[HFI1_MAX_SHARED_CTXTS];
+ /* same size as task_struct .comm[], command that opened context */
+ char comm[16];
+ /* so file ops can get at unit */
+ struct hfi1_devdata *dd;
+ /* so functions that need physical port can get it easily */
+ struct hfi1_pportdata *ppd;
+ /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+ void *subctxt_uregbase;
+ /* An array of pages for the eager receive buffers * N */
+ void *subctxt_rcvegrbuf;
+ /* An array of pages for the eager header queue entries * N */
+ void *subctxt_rcvhdr_base;
+ /* The version of the library which opened this ctxt */
+ u32 userversion;
+ /* Bitmask of active slaves */
+ u32 active_slaves;
+ /* Type of packets or conditions we want to poll for */
+ u16 poll_type;
+ /* receive packet sequence counter */
+ u8 seq_cnt;
+ u8 redirect_seq_cnt;
+ /* ctxt rcvhdrq head offset */
+ u32 head;
+ u32 pkt_count;
+ /* QPs waiting for context processing */
+ struct list_head qp_wait_list;
+ /* interrupt handling */
+ u64 imask; /* clear interrupt mask */
+ int ireg; /* clear interrupt register */
+ unsigned numa_id; /* numa node of this context */
+ /* verbs stats per CTX */
+ struct hfi1_opcode_stats_perctx *opstats;
+ /*
+ * This is the kernel thread that will keep making
+ * progress on the user sdma requests behind the scenes.
+ * There is one per context (shared contexts use the master's).
+ */
+ struct task_struct *progress;
+ struct list_head sdma_queues;
+ spinlock_t sdma_qlock;
+
+#ifdef CONFIG_PRESCAN_RXQ
+ struct ps_state ps_state;
+#endif /* CONFIG_PRESCAN_RXQ */
+
+ /*
+ * The interrupt handler for a particular receive context can vary
+ * throughout it's lifetime. This is not a lock protected data member so
+ * it must be updated atomically and the prev and new value must always
+ * be valid. Worst case is we process an extra interrupt and up to 64
+ * packets with the wrong interrupt handler.
+ */
+ void (*do_interrupt)(struct hfi1_ctxtdata *rcd);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ */
+struct hfi1_packet {
+ void *ebuf;
+ void *hdr;
+ struct hfi1_ctxtdata *rcd;
+ __le32 *rhf_addr;
+ struct hfi1_qp *qp;
+ struct hfi1_other_headers *ohdr;
+ u64 rhf;
+ u32 maxcnt;
+ u32 rhqoff;
+ u32 hdrqtail;
+ int numpkt;
+ u16 tlen;
+ u16 hlen;
+ s16 etail;
+ u16 rsize;
+ u8 updegr;
+ u8 rcv_flags;
+ u8 etype;
+};
+
+static inline bool has_sc4_bit(struct hfi1_packet *p)
+{
+ return !!rhf_dc_info(p->rhf);
+}
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+ int mode_flag;
+ struct cdev cdev;
+ struct device *class_dev;
+ spinlock_t snoop_lock;
+ struct list_head queue;
+ wait_queue_head_t waitq;
+ void *filter_value;
+ int (*filter_callback)(void *hdr, void *data, void *value);
+ u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE 1U
+#define HFI1_PORT_CAPTURE_MODE 2U
+
+struct hfi1_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe the states the driver thinks the logical and physical
+ * states are in. Used as an argument to set_link_state(). Implemented
+ * as bits for easy multi-state checking. The actual state can only be
+ * one.
+ */
+#define __HLS_UP_INIT_BP 0
+#define __HLS_UP_ARMED_BP 1
+#define __HLS_UP_ACTIVE_BP 2
+#define __HLS_DN_DOWNDEF_BP 3 /* link down default */
+#define __HLS_DN_POLL_BP 4
+#define __HLS_DN_DISABLE_BP 5
+#define __HLS_DN_OFFLINE_BP 6
+#define __HLS_VERIFY_CAP_BP 7
+#define __HLS_GOING_UP_BP 8
+#define __HLS_GOING_OFFLINE_BP 9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT (1 << __HLS_UP_INIT_BP)
+#define HLS_UP_ARMED (1 << __HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE (1 << __HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF (1 << __HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL (1 << __HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE (1 << __HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE (1 << __HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP (1 << __HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP (1 << __HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP)
+
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 8192
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 8192
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN 0x1
+#define HFI1_PART_ENFORCE_OUT 0x2
+
+/* how often we check for synthetic counter wrap around */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL 0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED 0x2 /* Disable this counter */
+#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL 0x8 /* Per VL counter */
+#define CNTR_INVALID_VL -1 /* Specifies invalid VL */
+#define CNTR_MODE_W 0x0
+#define CNTR_MODE_R 0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
+static inline void incr_cntr64(u64 *cntr)
+{
+ if (*cntr < (u64)-1LL)
+ (*cntr)++;
+}
+
+static inline void incr_cntr32(u32 *cntr)
+{
+ if (*cntr < (u32)-1LL)
+ (*cntr)++;
+}
+
+#define MAX_NAME_SIZE 64
+struct hfi1_msix_entry {
+ struct msix_entry msix;
+ void *arg;
+ char name[MAX_NAME_SIZE];
+ cpumask_var_t mask;
+};
+
+/* per-SL CCA information */
+struct cca_timer {
+ struct hrtimer hrtimer;
+ struct hfi1_pportdata *ppd; /* read-only */
+ int sl; /* read-only */
+ u16 ccti; /* read/write - current value of CCTI */
+};
+
+struct link_down_reason {
+ /*
+ * SMA-facing value. Should be set from .latest when
+ * HLS_UP_* -> HLS_DN_* transition actually occurs.
+ */
+ u8 sma;
+ u8 latest;
+};
+
+enum {
+ LO_PRIO_TABLE,
+ HI_PRIO_TABLE,
+ MAX_PRIO_TABLE
+};
+
+struct vl_arb_cache {
+ spinlock_t lock;
+ struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct hfi1_pportdata {
+ struct hfi1_ibport ibport_data;
+
+ struct hfi1_devdata *dd;
+ struct kobject pport_cc_kobj;
+ struct kobject sc2vl_kobj;
+ struct kobject sl2sc_kobj;
+ struct kobject vl2mtu_kobj;
+
+ /* QSFP support */
+ struct qsfp_data qsfp_info;
+
+ /* GUID for this interface, in host order */
+ u64 guid;
+ /* GUID for peer interface, in host order */
+ u64 neighbor_guid;
+
+ /* up or down physical link state */
+ u32 linkup;
+
+ /*
+ * this address is mapped read-only into user processes so they can
+ * get status cheaply, whenever they want. One qword of status per port
+ */
+ u64 *statusp;
+
+ /* SendDMA related entries */
+
+ struct workqueue_struct *hfi1_wq;
+
+ /* move out of interrupt context */
+ struct work_struct link_vc_work;
+ struct work_struct link_up_work;
+ struct work_struct link_down_work;
+ struct work_struct sma_message_work;
+ struct work_struct freeze_work;
+ struct work_struct link_downgrade_work;
+ struct work_struct link_bounce_work;
+ /* host link state variables */
+ struct mutex hls_lock;
+ u32 host_link_state;
+
+ spinlock_t sdma_alllock ____cacheline_aligned_in_smp;
+
+ u32 lstate; /* logical link state */
+
+ /* these are the "32 bit" regs */
+
+ u32 ibmtu; /* The MTU programmed for this unit */
+ /*
+ * Current max size IB packet (in bytes) including IB headers, that
+ * we can send. Changes when ibmtu changes.
+ */
+ u32 ibmaxlen;
+ u32 current_egress_rate; /* units [10^6 bits/sec] */
+ /* LID programmed for this instance */
+ u16 lid;
+ /* list of pkeys programmed; 0 if not set */
+ u16 pkeys[MAX_PKEY_VALUES];
+ u16 link_width_supported;
+ u16 link_width_downgrade_supported;
+ u16 link_speed_supported;
+ u16 link_width_enabled;
+ u16 link_width_downgrade_enabled;
+ u16 link_speed_enabled;
+ u16 link_width_active;
+ u16 link_width_downgrade_tx_active;
+ u16 link_width_downgrade_rx_active;
+ u16 link_speed_active;
+ u8 vls_supported;
+ u8 vls_operational;
+ /* LID mask control */
+ u8 lmc;
+ /* Rx Polarity inversion (compensate for ~tx on partner) */
+ u8 rx_pol_inv;
+
+ u8 hw_pidx; /* physical port index */
+ u8 port; /* IB port number and index into dd->pports - 1 */
+ /* type of neighbor node */
+ u8 neighbor_type;
+ u8 neighbor_normal;
+ u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+ u8 neighbor_port_number;
+ u8 is_sm_config_started;
+ u8 offline_disabled_reason;
+ u8 is_active_optimize_enabled;
+ u8 driver_link_ready; /* driver ready for active link */
+ u8 link_enabled; /* link enabled? */
+ u8 linkinit_reason;
+ u8 local_tx_rate; /* rate given to 8051 firmware */
+
+ /* placeholders for IB MAD packet settings */
+ u8 overrun_threshold;
+ u8 phy_error_threshold;
+
+ /* used to override LED behavior */
+ u8 led_override; /* Substituted for normal value, if non-zero */
+ u16 led_override_timeoff; /* delta to next timer event */
+ u8 led_override_vals[2]; /* Alternates per blink-frame */
+ u8 led_override_phase; /* Just counts, LSB picks from vals[] */
+ atomic_t led_override_timer_active;
+ /* Used to flash LEDs in override mode */
+ struct timer_list led_override_timer;
+ u32 sm_trap_qp;
+ u32 sa_qp;
+
+ /*
+ * cca_timer_lock protects access to the per-SL cca_timer
+ * structures (specifically the ccti member).
+ */
+ spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+ struct cca_timer cca_timer[OPA_MAX_SLS];
+
+ /* List of congestion control table entries */
+ struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+ /* congestion entries, each entry corresponding to a SL */
+ struct opa_congestion_setting_entry_shadow
+ congestion_entries[OPA_MAX_SLS];
+
+ /*
+ * cc_state_lock protects (write) access to the per-port
+ * struct cc_state.
+ */
+ spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+ struct cc_state __rcu *cc_state;
+
+ /* Total number of congestion control table entries */
+ u16 total_cct_entry;
+
+ /* Bit map identifying service level */
+ u32 cc_sl_control_map;
+
+ /* CA's max number of 64 entry units in the congestion control table */
+ u8 cc_max_table_entries;
+
+ /* begin congestion log related entries
+ * cc_log_lock protects all congestion log related data */
+ spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+ u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+ u16 threshold_event_counter;
+ struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+ int cc_log_idx; /* index for logging events */
+ int cc_mad_idx; /* index for reporting events */
+ /* end congestion log related entries */
+
+ struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+ /* port relative counter buffer */
+ u64 *cntrs;
+ /* port relative synthetic counter buffer */
+ u64 *scntrs;
+ /* we synthesize port_xmit_discards from several egress errors */
+ u64 port_xmit_discards;
+ u64 port_xmit_constraint_errors;
+ u64 port_rcv_constraint_errors;
+ /* count of 'link_err' interrupts from DC */
+ u64 link_downed;
+ /* number of times link retrained successfully */
+ u64 link_up;
+ /* port_ltp_crc_mode is returned in 'portinfo' MADs */
+ u16 port_ltp_crc_mode;
+ /* port_crc_mode_enabled is the crc we support */
+ u8 port_crc_mode_enabled;
+ /* mgmt_allowed is also returned in 'portinfo' MADs */
+ u8 mgmt_allowed;
+ u8 part_enforce; /* partition enforcement flags */
+ struct link_down_reason local_link_down_reason;
+ struct link_down_reason neigh_link_down_reason;
+ /* Value to be sent to link peer on LinkDown .*/
+ u8 remote_link_down_reason;
+ /* Error events that will cause a port bounce. */
+ u32 port_error_action;
+};
+
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE 0 /* keep going */
+#define RHF_RCV_DONE 1 /* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */
+
+struct rcv_array_data {
+ u8 group_size;
+ u16 ngroups;
+ u16 nctxt_extra;
+};
+
+struct per_vl_data {
+ u16 mtu;
+ struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+struct err_info_rcvport {
+ u8 status_and_code;
+ u64 packet_flit1;
+ u64 packet_flit2;
+};
+
+struct err_info_constraint {
+ u8 status;
+ u16 pkey;
+ u32 slid;
+};
+
+struct hfi1_temp {
+ unsigned int curr; /* current temperature */
+ unsigned int lo_lim; /* low temperature limit */
+ unsigned int hi_lim; /* high temperature limit */
+ unsigned int crit_lim; /* critical temperature limit */
+ u8 triggers; /* temperature triggers */
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+struct hfi1_devdata {
+ struct hfi1_ibdev verbs_dev; /* must be first */
+ struct list_head list;
+ /* pointers to related structs for this device */
+ /* pci access data structure */
+ struct pci_dev *pcidev;
+ struct cdev user_cdev;
+ struct cdev diag_cdev;
+ struct cdev ui_cdev;
+ struct device *user_device;
+ struct device *diag_device;
+ struct device *ui_device;
+
+ /* mem-mapped pointer to base of chip regs */
+ u8 __iomem *kregbase;
+ /* end of mem-mapped chip space excluding sendbuf and user regs */
+ u8 __iomem *kregend;
+ /* physical address of chip for io_remap, etc. */
+ resource_size_t physaddr;
+ /* receive context data */
+ struct hfi1_ctxtdata **rcd;
+ /* send context data */
+ struct send_context_info *send_contexts;
+ /* map hardware send contexts to software index */
+ u8 *hw_to_sw;
+ /* spinlock for allocating and releasing send context resources */
+ spinlock_t sc_lock;
+ /* Per VL data. Enough for all VLs but not all elements are set/used. */
+ struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+ /* seqlock for sc2vl */
+ seqlock_t sc2vl_lock;
+ u64 sc2vl[4];
+ /* Send Context initialization lock. */
+ spinlock_t sc_init_lock;
+
+ /* fields common to all SDMA engines */
+
+ /* default flags to last descriptor */
+ u64 default_desc1;
+ volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */
+ dma_addr_t sdma_heads_phys;
+ void *sdma_pad_dma; /* DMA'ed by chip */
+ dma_addr_t sdma_pad_phys;
+ /* for deallocation */
+ size_t sdma_heads_size;
+ /* number from the chip */
+ u32 chip_sdma_engines;
+ /* num used */
+ u32 num_sdma;
+ /* lock for sdma_map */
+ spinlock_t sde_map_lock;
+ /* array of engines sized by num_sdma */
+ struct sdma_engine *per_sdma;
+ /* array of vl maps */
+ struct sdma_vl_map __rcu *sdma_map;
+ /* SPC freeze waitqueue and variable */
+ wait_queue_head_t sdma_unfreeze_wq;
+ atomic_t sdma_unfreeze_count;
+
+
+ /* hfi1_pportdata, points to array of (physical) port-specific
+ * data structs, indexed by pidx (0..n-1)
+ */
+ struct hfi1_pportdata *pport;
+
+ /* mem-mapped pointer to base of PIO buffers */
+ void __iomem *piobase;
+ /*
+ * write-combining mem-mapped pointer to base of RcvArray
+ * memory.
+ */
+ void __iomem *rcvarray_wc;
+ /*
+ * credit return base - a per-NUMA range of DMA address that
+ * the chip will use to update the per-context free counter
+ */
+ struct credit_return_base *cr_base;
+
+ /* send context numbers and sizes for each type */
+ struct sc_config_sizes sc_sizes[SC_MAX];
+
+ u32 lcb_access_count; /* count of LCB users */
+
+ char *boardname; /* human readable board info */
+
+ /* device (not port) flags, basically device capabilities */
+ u32 flags;
+
+ /* reset value */
+ u64 z_int_counter;
+ u64 z_rcv_limit;
+ /* percpu int_counter */
+ u64 __percpu *int_counter;
+ u64 __percpu *rcv_limit;
+
+ /* number of receive contexts in use by the driver */
+ u32 num_rcv_contexts;
+ /* number of pio send contexts in use by the driver */
+ u32 num_send_contexts;
+ /*
+ * number of ctxts available for PSM open
+ */
+ u32 freectxts;
+ /* base receive interrupt timeout, in CSR units */
+ u32 rcv_intr_timeout_csr;
+
+ u64 __iomem *egrtidbase;
+ spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+ spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+ /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+ spinlock_t uctxt_lock; /* rcd and user context changes */
+ /* exclusive access to 8051 */
+ spinlock_t dc8051_lock;
+ /* exclusive access to 8051 memory */
+ spinlock_t dc8051_memlock;
+ int dc8051_timed_out; /* remember if the 8051 timed out */
+ /*
+ * A page that will hold event notification bitmaps for all
+ * contexts. This page will be mapped into all processes.
+ */
+ unsigned long *events;
+ /*
+ * per unit status, see also portdata statusp
+ * mapped read-only into user processes so they can get unit and
+ * IB link status cheaply
+ */
+ struct hfi1_status *status;
+ u32 freezelen; /* max length of freezemsg */
+
+ /* revision register shadow */
+ u64 revision;
+ /* Base GUID for device (network order) */
+ u64 base_guid;
+
+ /* these are the "32 bit" regs */
+
+ /* value we put in kr_rcvhdrsize */
+ u32 rcvhdrsize;
+ /* number of receive contexts the chip supports */
+ u32 chip_rcv_contexts;
+ /* number of receive array entries */
+ u32 chip_rcv_array_count;
+ /* number of PIO send contexts the chip supports */
+ u32 chip_send_contexts;
+ /* number of bytes in the PIO memory buffer */
+ u32 chip_pio_mem_size;
+ /* number of bytes in the SDMA memory buffer */
+ u32 chip_sdma_mem_size;
+
+ /* size of each rcvegrbuffer */
+ u32 rcvegrbufsize;
+ /* log2 of above */
+ u16 rcvegrbufsize_shift;
+ /* both sides of the PCIe link are gen3 capable */
+ u8 link_gen3_capable;
+ /* localbus width (1, 2,4,8,16,32) from config space */
+ u32 lbus_width;
+ /* localbus speed in MHz */
+ u32 lbus_speed;
+ int unit; /* unit # of this chip */
+ int node; /* home node of this chip */
+
+ /* save these PCI fields to restore after a reset */
+ u32 pcibar0;
+ u32 pcibar1;
+ u32 pci_rom;
+ u16 pci_command;
+ u16 pcie_devctl;
+ u16 pcie_lnkctl;
+ u16 pcie_devctl2;
+ u32 pci_msix0;
+ u32 pci_lnkctl3;
+ u32 pci_tph2;
+
+ /*
+ * ASCII serial number, from flash, large enough for original
+ * all digit strings, and longer serial number format
+ */
+ u8 serial[SERIAL_MAX];
+ /* human readable board version */
+ u8 boardversion[BOARD_VERS_MAX];
+ u8 lbus_info[32]; /* human readable localbus info */
+ /* chip major rev, from CceRevision */
+ u8 majrev;
+ /* chip minor rev, from CceRevision */
+ u8 minrev;
+ /* hardware ID */
+ u8 hfi1_id;
+ /* implementation code */
+ u8 icode;
+ /* default link down value (poll/sleep) */
+ u8 link_default;
+ /* vAU of this device */
+ u8 vau;
+ /* vCU of this device */
+ u8 vcu;
+ /* link credits of this device */
+ u16 link_credits;
+ /* initial vl15 credits to use */
+ u16 vl15_init;
+
+ /* Misc small ints */
+ /* Number of physical ports available */
+ u8 num_pports;
+ /* Lowest context number which can be used by user processes */
+ u8 first_user_ctxt;
+ u8 n_krcv_queues;
+ u8 qos_shift;
+ u8 qpn_mask;
+
+ u16 rhf_offset; /* offset of RHF within receive header entry */
+ u16 irev; /* implementation revision */
+ u16 dc8051_ver; /* 8051 firmware version */
+
+ struct platform_config_cache pcfg_cache;
+ /* control high-level access to qsfp */
+ struct mutex qsfp_i2c_mutex;
+
+ struct diag_client *diag_client;
+ spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+ u8 psxmitwait_supported;
+ /* cycle length of PS* counters in HW (in picoseconds) */
+ u16 psxmitwait_check_rate;
+ /* high volume overflow errors deferred to tasklet */
+ struct tasklet_struct error_tasklet;
+ /* per device cq worker */
+ struct kthread_worker *worker;
+
+ /* MSI-X information */
+ struct hfi1_msix_entry *msix_entries;
+ u32 num_msix_entries;
+
+ /* INTx information */
+ u32 requested_intx_irq; /* did we request one? */
+ char intx_name[MAX_NAME_SIZE]; /* INTx name */
+
+ /* general interrupt: mask of handled interrupts */
+ u64 gi_mask[CCE_NUM_INT_CSRS];
+
+ struct rcv_array_data rcv_entries;
+
+ /*
+ * 64 bit synthetic counters
+ */
+ struct timer_list synth_stats_timer;
+
+ /*
+ * device counters
+ */
+ char *cntrnames;
+ size_t cntrnameslen;
+ size_t ndevcntrs;
+ u64 *cntrs;
+ u64 *scntrs;
+
+ /*
+ * remembered values for synthetic counters
+ */
+ u64 last_tx;
+ u64 last_rx;
+
+ /*
+ * per-port counters
+ */
+ size_t nportcntrs;
+ char *portcntrnames;
+ size_t portcntrnameslen;
+
+ struct hfi1_snoop_data hfi1_snoop;
+
+ struct err_info_rcvport err_info_rcvport;
+ struct err_info_constraint err_info_rcv_constraint;
+ struct err_info_constraint err_info_xmit_constraint;
+ u8 err_info_uncorrectable;
+ u8 err_info_fmconfig;
+
+ atomic_t drop_packet;
+ u8 do_drop;
+
+ /* receive interrupt functions */
+ rhf_rcv_function_ptr *rhf_rcv_function_map;
+ rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+ /*
+ * Handlers for outgoing data so that snoop/capture does not
+ * have to have its hooks in the send path
+ */
+ int (*process_pio_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+ u32 hdrwords, struct hfi1_sge_state *ss,
+ u32 len, u32 plen, u32 dwords, u64 pbc);
+ int (*process_dma_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+ u32 hdrwords, struct hfi1_sge_state *ss,
+ u32 len, u32 plen, u32 dwords, u64 pbc);
+ void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+ u64 pbc, const void *from, size_t count);
+
+ /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+ u8 oui1;
+ u8 oui2;
+ u8 oui3;
+ /* Timer and counter used to detect RcvBufOvflCnt changes */
+ struct timer_list rcverr_timer;
+ u32 rcv_ovfl_cnt;
+
+ int assigned_node_id;
+ wait_queue_head_t event_queue;
+
+ /* Save the enabled LCB error bits */
+ u64 lcb_err_en;
+ u8 dc_shutdown;
+};
+
+/* 8051 firmware version helper */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER 1
+#define PT_INVALID 2
+
+/* Private data for file operations */
+struct hfi1_filedata {
+ struct hfi1_ctxtdata *uctxt;
+ unsigned subctxt;
+ struct hfi1_user_sdma_comp_q *cq;
+ struct hfi1_user_sdma_pkt_q *pq;
+ /* for cpu affinity; -1 if none */
+ int rec_cpu_num;
+};
+
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+ struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+void handle_receive_interrupt(struct hfi1_ctxtdata *);
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd);
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd);
+int hfi1_reset_device(int);
+
+/* return the driver's idea of the logical OPA port state */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+ return ppd->lstate; /* use the cached value */
+}
+
+static inline u16 generate_jkey(kuid_t uid)
+{
+ return from_kuid(current_user_ns(), uid) & 0xffff;
+}
+
+/*
+ * active_egress_rate
+ *
+ * returns the active egress rate in units of [10^6 bits/sec]
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+ u16 link_speed = ppd->link_speed_active;
+ u16 link_width = ppd->link_width_active;
+ u32 egress_rate;
+
+ if (link_speed == OPA_LINK_SPEED_25G)
+ egress_rate = 25000;
+ else /* assume OPA_LINK_SPEED_12_5G */
+ egress_rate = 12500;
+
+ switch (link_width) {
+ case OPA_LINK_WIDTH_4X:
+ egress_rate *= 4;
+ break;
+ case OPA_LINK_WIDTH_3X:
+ egress_rate *= 3;
+ break;
+ case OPA_LINK_WIDTH_2X:
+ egress_rate *= 2;
+ break;
+ default:
+ /* assume IB_WIDTH_1X */
+ break;
+ }
+
+ return egress_rate;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+ u32 cycles;
+
+ /*
+ * cycles is:
+ *
+ * (length) [bits] / (rate) [bits/sec]
+ * ---------------------------------------------------
+ * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
+ */
+
+ cycles = len * 8; /* bits */
+ cycles *= 805;
+ cycles /= rate;
+
+ return cycles;
+}
+
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+ u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+ u32 pkey, u32 slid, u32 dlid, u8 sc5,
+ const struct ib_grh *old_grh);
+
+#define PACKET_EGRESS_TIMEOUT 350
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+ /* Pause at least 1us, to ensure chip returns all credits */
+ u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+ udelay(usec ? usec : 1);
+}
+
+/**
+ * sc_to_vlt() reverse lookup sc to vl
+ * @dd - devdata
+ * @sc5 - 5 bit sc
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+ unsigned seq;
+ u8 rval;
+
+ if (sc5 >= OPA_MAX_SCS)
+ return (u8)(0xff);
+
+ do {
+ seq = read_seqbegin(&dd->sc2vl_lock);
+ rval = *(((u8 *)dd->sc2vl) + sc5);
+ } while (read_seqretry(&dd->sc2vl_lock, seq));
+
+ return rval;
+}
+
+#define PKEY_MEMBER_MASK 0x8000
+#define PKEY_LOW_15_MASK 0x7fff
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+ u16 mkey = pkey & PKEY_LOW_15_MASK;
+ u16 ment = ent & PKEY_LOW_15_MASK;
+
+ if (mkey == ment) {
+ /*
+ * If pkey[15] is clear (limited partition member),
+ * is bit 15 in the corresponding table element
+ * clear (limited member)?
+ */
+ if (!(pkey & PKEY_MEMBER_MASK))
+ return !!(ent & PKEY_MEMBER_MASK);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ */
+static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
+{
+ int i;
+
+ for (i = 0; i < MAX_PKEY_VALUES; i++) {
+ if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ */
+static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
+ u16 slid)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ incr_cntr64(&ppd->port_rcv_constraint_errors);
+ if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+ dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+ dd->err_info_rcv_constraint.slid = slid;
+ dd->err_info_rcv_constraint.pkey = pkey;
+ }
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path because
+ * of performance reasons. On datapath pkey check is expected to be done
+ * by HW and rcv_pkey_check function should be called instead.
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+ u8 sc5, u8 idx, u16 slid)
+{
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+ return 0;
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+ /* Is the pkey = 0x0, or 0x8000? */
+ if ((pkey & PKEY_LOW_15_MASK) == 0)
+ goto bad;
+
+ /* The most likely matching pkey has index 'idx' */
+ if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+ return 0;
+
+ /* no match - try the whole table */
+ if (!ingress_pkey_table_search(ppd, pkey))
+ return 0;
+
+bad:
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures pkey is vlid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+ u8 sc5, u16 slid)
+{
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+ return 0;
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+ return 0;
+bad:
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0 0
+#define OPA_MTU_256 1
+#define OPA_MTU_512 2
+#define OPA_MTU_1024 3
+#define OPA_MTU_2048 4
+#define OPA_MTU_4096 5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+ return mtu == 256 || mtu == 512 ||
+ mtu == 1024 || mtu == 2048 ||
+ mtu == 4096;
+}
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+ return mtu >= 2048 &&
+ (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc);
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+ u64 pbc, const void *from, size_t count);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define ctxt_fp(fp) \
+ (((struct hfi1_filedata *)(fp)->private_data)->uctxt)
+#define subctxt_fp(fp) \
+ (((struct hfi1_filedata *)(fp)->private_data)->subctxt)
+#define tidcursor_fp(fp) \
+ (((struct hfi1_filedata *)(fp)->private_data)->tidcursor)
+#define user_sdma_pkt_fp(fp) \
+ (((struct hfi1_filedata *)(fp)->private_data)->pq)
+#define user_sdma_comp_fp(fp) \
+ (((struct hfi1_filedata *)(fp)->private_data)->cq)
+
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+ return ppd->dd;
+}
+
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+ return container_of(dev, struct hfi1_devdata, verbs_dev);
+}
+
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+ return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+ return container_of(ibp, struct hfi1_pportdata, ibport_data);
+}
+
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+ WARN_ON(pidx >= dd->num_pports);
+ return &dd->pport[pidx].ibport_data;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 ret;
+
+ if (index >= ARRAY_SIZE(ppd->pkeys))
+ ret = 0;
+ else
+ ret = ppd->pkeys[index];
+
+ return ret;
+}
+
+/*
+ * Readers of cc_state must call get_cc_state() under rcu_read_lock().
+ * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+ return rcu_dereference(ppd->cc_state);
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED 0x1 /* chip and driver up and initted */
+#define HFI1_PRESENT 0x2 /* chip accesses can be done */
+#define HFI1_FROZEN 0x4 /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT 0x8
+#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */
+#define HFI1_DO_INIT_ASIC 0x100 /* This device will init the ASIC */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1)
+
+
+/* ctxt_flag bit offsets */
+ /* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+ /* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV 2
+ /* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+ /* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+ const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+void cc_state_reclaim(struct rcu_head *rcu);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define HFI1_LED_LOG 2 /* Logical (link) YELLOW LED */
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field. If this is
+ * larger then the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines. The typical local KDETH header would
+ * be:
+ *
+ * Bytes Field
+ * 8 LRH
+ * 12 BHT
+ * ?? KDETH
+ * 8 RHF
+ * ---
+ * 28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ * Bytes Field
+ * 8 LRH
+ * 40 GRH (optional)
+ * 12 BTH
+ * ?? KDETH
+ * 8 RHF
+ * ---
+ * 68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes. Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
+
+int hfi1_get_user_pages(unsigned long, size_t, struct page **);
+void hfi1_release_user_pages(struct page **, size_t);
+
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+ *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+ /*
+ * volatile because it's a DMA target from the chip, routine is
+ * inlined, and don't want register caching or reordering.
+ */
+ return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+ const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void hfi1_nomsix(struct hfi1_devdata *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+ enum platform_config_table_type_encoding table_type,
+ int table_index, int field_index, u32 *data, u32 len);
+
+dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
+ size_t, int);
+const char *get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void flush_wc(void)
+{
+ asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct hfi1_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern uint num_rcv_contexts;
+extern unsigned n_krcvqs;
+extern u8 krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check... */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME "hfi1"
+#define HFI1_USER_MINOR_BASE 0
+#define HFI1_TRACE_MINOR 127
+#define HFI1_DIAGPKT_MINOR 128
+#define HFI1_DIAG_MINOR_BASE 129
+#define HFI1_SNOOP_CAPTURE_BASE 200
+#define HFI1_NMINORS 255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY \
+ (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY \
+ (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+ u16 ctxt_type)
+{
+ u64 base_sc_integrity =
+ SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+ if (ctxt_type == SC_USER)
+ base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
+ else
+ base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+
+ if (is_a0(dd))
+ /* turn off send-side job key checks - A0 erratum */
+ return base_sc_integrity &
+ ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ return base_sc_integrity;
+}
+
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+ u64 base_sdma_integrity =
+ SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+ if (is_a0(dd))
+ /* turn off send-side job key checks - A0 erratum */
+ return base_sdma_integrity &
+ ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ return base_sdma_integrity;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening..
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+ dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+ dev_info(dev, fmt, ##__VA_ARGS__)
+
+#define dd_dev_emerg(dd, fmt, ...) \
+ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+ dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+ dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
+ get_unit_name((dd)->unit), (dd)->unit, (port), \
+ ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct hfi1_hwerror_msgs {
+ u64 mask;
+ const char *msg;
+ size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+ const struct hfi1_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+ dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->ibport_data.z_rc_acks =
+ get_all_cpu_total(ppd->ibport_data.rc_acks);
+ ppd->ibport_data.z_rc_qacks =
+ get_all_cpu_total(ppd->ibport_data.rc_qacks);
+ }
+}
+
+/* Control LED state */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+ if (on)
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+ else
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#endif /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
new file mode 100644
index 000000000000..a877eda8c13c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -0,0 +1,1722 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.) Zero means use one user context per CPU.
+ */
+uint num_rcv_contexts;
+module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO);
+MODULE_PARM_DESC(
+ num_rcv_contexts, "Set max number of user receive contexts to use");
+
+u8 krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");
+
+unsigned int user_credit_return_threshold = 33; /* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_theshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+ unsigned i;
+ int ret;
+ int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+ if (local_node_id < 0)
+ local_node_id = numa_node_id();
+ dd->assigned_node_id = local_node_id;
+
+ dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
+ if (!dd->rcd) {
+ dd_dev_err(dd,
+ "Unable to allocate receive context array, failing\n");
+ goto nomem;
+ }
+
+ /* create one or more kernel contexts */
+ for (i = 0; i < dd->first_user_ctxt; ++i) {
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ctxtdata *rcd;
+
+ ppd = dd->pport + (i % dd->num_pports);
+ rcd = hfi1_create_ctxtdata(ppd, i);
+ if (!rcd) {
+ dd_dev_err(dd,
+ "Unable to allocate kernel receive context, failing\n");
+ goto nomem;
+ }
+ /*
+ * Set up the kernel context flags here and now because they
+ * use default values for all receive side memories. User
+ * contexts will be handled as they are created.
+ */
+ rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+ HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+ HFI1_CAP_KGET(NODROP_EGR_FULL) |
+ HFI1_CAP_KGET(DMA_RTAIL);
+ rcd->seq_cnt = 1;
+
+ rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+ if (!rcd->sc) {
+ dd_dev_err(dd,
+ "Unable to allocate kernel send context, failing\n");
+ dd->rcd[rcd->ctxt] = NULL;
+ hfi1_free_ctxtdata(dd, rcd);
+ goto nomem;
+ }
+
+ ret = hfi1_init_ctxt(rcd->sc);
+ if (ret < 0) {
+ dd_dev_err(dd,
+ "Failed to setup kernel receive context, failing\n");
+ sc_free(rcd->sc);
+ dd->rcd[rcd->ctxt] = NULL;
+ hfi1_free_ctxtdata(dd, rcd);
+ ret = -EFAULT;
+ goto bail;
+ }
+ }
+
+ return 0;
+nomem:
+ ret = -ENOMEM;
+bail:
+ kfree(dd->rcd);
+ dd->rcd = NULL;
+ return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ctxtdata *rcd;
+ unsigned kctxt_ngroups = 0;
+ u32 base;
+
+ if (dd->rcv_entries.nctxt_extra >
+ dd->num_rcv_contexts - dd->first_user_ctxt)
+ kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+ (dd->num_rcv_contexts - dd->first_user_ctxt));
+ rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+ if (rcd) {
+ u32 rcvtids, max_entries;
+
+ dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt);
+
+ INIT_LIST_HEAD(&rcd->qp_wait_list);
+ rcd->ppd = ppd;
+ rcd->dd = dd;
+ rcd->cnt = 1;
+ rcd->ctxt = ctxt;
+ dd->rcd[ctxt] = rcd;
+ rcd->numa_id = numa_node_id();
+ rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+ spin_lock_init(&rcd->exp_lock);
+
+ /*
+ * Calculate the context's RcvArray entry starting point.
+ * We do this here because we have to take into account all
+ * the RcvArray entries that previous context would have
+ * taken and we have to account for any extra groups
+ * assigned to the kernel or user contexts.
+ */
+ if (ctxt < dd->first_user_ctxt) {
+ if (ctxt < kctxt_ngroups) {
+ base = ctxt * (dd->rcv_entries.ngroups + 1);
+ rcd->rcv_array_groups++;
+ } else
+ base = kctxt_ngroups +
+ (ctxt * dd->rcv_entries.ngroups);
+ } else {
+ u16 ct = ctxt - dd->first_user_ctxt;
+
+ base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+ kctxt_ngroups);
+ if (ct < dd->rcv_entries.nctxt_extra) {
+ base += ct * (dd->rcv_entries.ngroups + 1);
+ rcd->rcv_array_groups++;
+ } else
+ base += dd->rcv_entries.nctxt_extra +
+ (ct * dd->rcv_entries.ngroups);
+ }
+ rcd->eager_base = base * dd->rcv_entries.group_size;
+
+ /* Validate and initialize Rcv Hdr Q variables */
+ if (rcvhdrcnt % HDRQ_INCREMENT) {
+ dd_dev_err(dd,
+ "ctxt%u: header queue count %d must be divisible by %d\n",
+ rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+ goto bail;
+ }
+ rcd->rcvhdrq_cnt = rcvhdrcnt;
+ rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+ /*
+ * Simple Eager buffer allocation: we have already pre-allocated
+ * the number of RcvArray entry groups. Each ctxtdata structure
+ * holds the number of groups for that context.
+ *
+ * To follow CSR requirements and maintain cacheline alignment,
+ * make sure all sizes and bases are multiples of group_size.
+ *
+ * The expected entry count is what is left after assigning
+ * eager.
+ */
+ max_entries = rcd->rcv_array_groups *
+ dd->rcv_entries.group_size;
+ rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+ rcd->egrbufs.count = round_down(rcvtids,
+ dd->rcv_entries.group_size);
+ if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+ dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+ rcd->ctxt);
+ rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+ }
+ dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+ rcd->ctxt, rcd->egrbufs.count);
+
+ /*
+ * Allocate array that will hold the eager buffer accounting
+ * data.
+ * This will allocate the maximum possible buffer count based
+ * on the value of the RcvArray split parameter.
+ * The resulting value will be rounded down to the closest
+ * multiple of dd->rcv_entries.group_size.
+ */
+ rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
+ rcd->egrbufs.count, GFP_KERNEL);
+ if (!rcd->egrbufs.buffers)
+ goto bail;
+ rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
+ rcd->egrbufs.count, GFP_KERNEL);
+ if (!rcd->egrbufs.rcvtids)
+ goto bail;
+ rcd->egrbufs.size = eager_buffer_size;
+ /*
+ * The size of the buffers programmed into the RcvArray
+ * entries needs to be big enough to handle the highest
+ * MTU supported.
+ */
+ if (rcd->egrbufs.size < hfi1_max_mtu) {
+ rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+ dd_dev_info(dd,
+ "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+ rcd->ctxt, rcd->egrbufs.size);
+ }
+ rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+ if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+ rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+ GFP_KERNEL);
+ if (!rcd->opstats) {
+ dd_dev_err(dd,
+ "ctxt%u: Unable to allocate per ctxt stats buffer\n",
+ rcd->ctxt);
+ goto bail;
+ }
+ }
+ }
+ return rcd;
+bail:
+ kfree(rcd->opstats);
+ kfree(rcd->egrbufs.rcvtids);
+ kfree(rcd->egrbufs.buffers);
+ kfree(rcd);
+ return NULL;
+}
+
+/*
+ * Convert a receive header entry size that to the encoding used in the CSR.
+ *
+ * Return a zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+ /* there are only 3 valid receive header entry sizes */
+ if (size == 2)
+ return 1;
+ if (size == 16)
+ return 2;
+ else if (size == 32)
+ return 4;
+ return 0; /* invalid */
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the intra-
+ * packet gap for the link.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct cc_state *cc_state;
+ int i;
+ u16 cce, ccti_limit, max_ccti = 0;
+ u16 shift, mult;
+ u64 src;
+ u32 current_egress_rate; /* Mbits /sec */
+ u32 max_pkt_time;
+ /*
+ * max_pkt_time is the maximum packet egress time in units
+ * of the fabric clock period 1/(805 MHz).
+ */
+
+ cc_state = get_cc_state(ppd);
+
+ if (cc_state == NULL)
+ /*
+ * This should _never_ happen - rcu_read_lock() is held,
+ * and set_link_ipg() should not be called if cc_state
+ * is NULL.
+ */
+ return;
+
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ u16 ccti = ppd->cca_timer[i].ccti;
+
+ if (ccti > max_ccti)
+ max_ccti = ccti;
+ }
+
+ ccti_limit = cc_state->cct.ccti_limit;
+ if (max_ccti > ccti_limit)
+ max_ccti = ccti_limit;
+
+ cce = cc_state->cct.entries[max_ccti].entry;
+ shift = (cce & 0xc000) >> 14;
+ mult = (cce & 0x3fff);
+
+ current_egress_rate = active_egress_rate(ppd);
+
+ max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
+
+ src = (max_pkt_time >> shift) * mult;
+
+ src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+ src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+ write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
+
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+ struct cca_timer *cca_timer;
+ struct hfi1_pportdata *ppd;
+ int sl;
+ u16 ccti, ccti_timer, ccti_min;
+ struct cc_state *cc_state;
+
+ cca_timer = container_of(t, struct cca_timer, hrtimer);
+ ppd = cca_timer->ppd;
+ sl = cca_timer->sl;
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (cc_state == NULL) {
+ rcu_read_unlock();
+ return HRTIMER_NORESTART;
+ }
+
+ /*
+ * 1) decrement ccti for SL
+ * 2) calculate IPG for link (set_link_ipg())
+ * 3) restart timer, unless ccti is at min value
+ */
+
+ ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
+ ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+
+ spin_lock(&ppd->cca_timer_lock);
+
+ ccti = cca_timer->ccti;
+
+ if (ccti > ccti_min) {
+ cca_timer->ccti--;
+ set_link_ipg(ppd);
+ }
+
+ spin_unlock(&ppd->cca_timer_lock);
+
+ rcu_read_unlock();
+
+ if (ccti > ccti_min) {
+ unsigned long nsec = 1024 * ccti_timer;
+ /* ccti_timer is in units of 1.024 usec */
+ hrtimer_forward_now(t, ns_to_ktime(nsec));
+ return HRTIMER_RESTART;
+ }
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+ struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+ int i, size;
+ uint default_pkey_idx;
+
+ ppd->dd = dd;
+ ppd->hw_pidx = hw_pidx;
+ ppd->port = port; /* IB port number, not index */
+
+ default_pkey_idx = 1;
+
+ ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+ if (loopback) {
+ hfi1_early_err(&pdev->dev,
+ "Faking data partition 0x8001 in idx %u\n",
+ !default_pkey_idx);
+ ppd->pkeys[!default_pkey_idx] = 0x8001;
+ }
+
+ INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+ INIT_WORK(&ppd->link_up_work, handle_link_up);
+ INIT_WORK(&ppd->link_down_work, handle_link_down);
+ INIT_WORK(&ppd->freeze_work, handle_freeze);
+ INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+ INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+ INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+ mutex_init(&ppd->hls_lock);
+ spin_lock_init(&ppd->sdma_alllock);
+ spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+ ppd->sm_trap_qp = 0x0;
+ ppd->sa_qp = 0x1;
+
+ ppd->hfi1_wq = NULL;
+
+ spin_lock_init(&ppd->cca_timer_lock);
+
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ ppd->cca_timer[i].ppd = ppd;
+ ppd->cca_timer[i].sl = i;
+ ppd->cca_timer[i].ccti = 0;
+ ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+ }
+
+ ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+ spin_lock_init(&ppd->cc_state_lock);
+ spin_lock_init(&ppd->cc_log_lock);
+ size = sizeof(struct cc_state);
+ RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
+ if (!rcu_dereference(ppd->cc_state))
+ goto bail;
+ return;
+
+bail:
+
+ hfi1_early_err(&pdev->dev,
+ "Congestion Control Agent disabled for port %d\n", port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+ return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * Ensure chip does no sends or receives, tail updates, or
+ * pioavail updates while we re-initialize. This is mostly
+ * for the driver data structures, not chip registers.
+ */
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_TAILUPD_DIS, i);
+ pio_send_control(dd, PSC_GLOBAL_DISABLE);
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_disable(dd->send_contexts[i].sc);
+
+ return 0;
+}
+
+static void enable_chip(struct hfi1_devdata *dd)
+{
+ u32 rcvmask;
+ u32 i;
+
+ /* enable PIO send */
+ pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+ /*
+ * Enable kernel ctxts' receive and receive interrupt.
+ * Other ctxts done as user opens and initializes them.
+ */
+ rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+ for (i = 0; i < dd->first_user_ctxt; ++i) {
+ rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+ HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+ if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
+ rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+ if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
+ rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+ if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
+ rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ hfi1_rcvctrl(dd, rcvmask, i);
+ sc_enable(dd->rcd[i]->sc);
+ }
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+ int pidx;
+ struct hfi1_pportdata *ppd;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (!ppd->hfi1_wq) {
+ char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
+
+ snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
+ dd->unit, pidx);
+ ppd->hfi1_wq =
+ create_singlethread_workqueue(wq_name);
+ if (!ppd->hfi1_wq)
+ goto wq_error;
+ }
+ }
+ return 0;
+wq_error:
+ pr_err("create_singlethread_workqueue failed for port %d\n",
+ pidx + 1);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->hfi1_wq) {
+ destroy_workqueue(ppd->hfi1_wq);
+ ppd->hfi1_wq = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip. This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0). We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+ int ret = 0, pidx, lastfail = 0;
+ unsigned i, len;
+ struct hfi1_ctxtdata *rcd;
+ struct hfi1_pportdata *ppd;
+
+ /* Set up recv low level handlers */
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+ kdeth_process_expected;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+ kdeth_process_eager;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+ process_receive_error;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+ process_receive_bypass;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+ process_receive_invalid;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+ process_receive_invalid;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+ process_receive_invalid;
+ dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+ /* Set up send low level handlers */
+ dd->process_pio_send = hfi1_verbs_send_pio;
+ dd->process_dma_send = hfi1_verbs_send_dma;
+ dd->pio_inline_send = pio_copy;
+
+ if (is_a0(dd)) {
+ atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+ dd->do_drop = 1;
+ } else {
+ atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+ dd->do_drop = 0;
+ }
+
+ /* make sure the link is not "up" */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ ppd->linkup = 0;
+ }
+
+ if (reinit)
+ ret = init_after_reset(dd);
+ else
+ ret = loadtime_init(dd);
+ if (ret)
+ goto done;
+
+ /* dd->rcd can be NULL if early initialization failed */
+ for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+ /*
+ * Set up the (kernel) rcvhdr queue and egr TIDs. If doing
+ * re-init, the simplest way to handle this is to free
+ * existing, and re-allocate.
+ * Need to re-create rest of ctxt 0 ctxtdata as well.
+ */
+ rcd = dd->rcd[i];
+ if (!rcd)
+ continue;
+
+ rcd->do_interrupt = &handle_receive_interrupt;
+
+ lastfail = hfi1_create_rcvhdrq(dd, rcd);
+ if (!lastfail)
+ lastfail = hfi1_setup_eagerbufs(rcd);
+ if (lastfail)
+ dd_dev_err(dd,
+ "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+ }
+ if (lastfail)
+ ret = lastfail;
+
+ /* Allocate enough memory for user event notification. */
+ len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+ sizeof(*dd->events), PAGE_SIZE);
+ dd->events = vmalloc_user(len);
+ if (!dd->events)
+ dd_dev_err(dd, "Failed to allocate user events page\n");
+ /*
+ * Allocate a page for device and port status.
+ * Page will be shared amongst all user processes.
+ */
+ dd->status = vmalloc_user(PAGE_SIZE);
+ if (!dd->status)
+ dd_dev_err(dd, "Failed to allocate dev status page\n");
+ else
+ dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+ sizeof(dd->status->freezemsg));
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (dd->status)
+ /* Currently, we only have one port */
+ ppd->statusp = &dd->status->port;
+
+ set_mtu(ppd);
+ }
+
+ /* enable chip even if we have an error, so we can debug cause */
+ enable_chip(dd);
+
+ ret = hfi1_cq_init(dd);
+done:
+ /*
+ * Set status even if port serdes is not initialized
+ * so that diags will work.
+ */
+ if (dd->status)
+ dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+ HFI1_STATUS_INITTED;
+ if (!ret) {
+ /* enable all interrupts from the chip */
+ set_intr_state(dd, 1);
+
+ /* chip is OK for user apps; mark it as initialized */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ /* initialize the qsfp if it exists
+ * Requires interrupts to be enabled so we are notified
+ * when the QSFP completes reset, and has
+ * to be done before bringing up the SERDES
+ */
+ init_qsfp(ppd);
+
+ /* start the serdes - must be after interrupts are
+ enabled so we are notified when the link goes up */
+ lastfail = bringup_serdes(ppd);
+ if (lastfail)
+ dd_dev_info(dd,
+ "Failed to bring up port %u\n",
+ ppd->port);
+
+ /*
+ * Set status even if port serdes is not initialized
+ * so that diags will work.
+ */
+ if (ppd->statusp)
+ *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+ HFI1_STATUS_INITTED;
+ if (!ppd->link_speed_enabled)
+ continue;
+ }
+ }
+
+ /* if ret is non-zero, we probably should do some cleanup here... */
+ return ret;
+}
+
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+ return idr_find(&hfi1_unit_table, unit);
+}
+
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ dd = __hfi1_lookup(unit);
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+ return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int pidx;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->led_override_timer.data) {
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ }
+ }
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled. It does not free any data structures.
+ * Everything it does has to be setup again by hfi1_init(dd, 1)
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ unsigned pidx;
+ int i;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ ppd->linkup = 0;
+ if (ppd->statusp)
+ *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+ HFI1_STATUS_IB_READY);
+ }
+ dd->flags &= ~HFI1_INITTED;
+
+ /* mask interrupts, but not errors */
+ set_intr_state(dd, 0);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+ HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_PKEY_DIS |
+ HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+ /*
+ * Gracefully stop all sends allowing any in progress to
+ * trickle out first.
+ */
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_flush(dd->send_contexts[i].sc);
+ }
+
+ /*
+ * Enough for anything that's going to trickle out to have actually
+ * done so.
+ */
+ udelay(20);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ /* disable all contexts */
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_disable(dd->send_contexts[i].sc);
+ /* disable the send device */
+ pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+ /*
+ * Clear SerdesEnable.
+ * We can't count on interrupts since we are stopping.
+ */
+ hfi1_quiet_serdes(ppd);
+
+ if (ppd->hfi1_wq) {
+ destroy_workqueue(ppd->hfi1_wq);
+ ppd->hfi1_wq = NULL;
+ }
+ }
+ sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+ unsigned e;
+
+ if (!rcd)
+ return;
+
+ if (rcd->rcvhdrq) {
+ dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+ rcd->rcvhdrq, rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+ if (rcd->rcvhdrtail_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *)rcd->rcvhdrtail_kvaddr,
+ rcd->rcvhdrqtailaddr_phys);
+ rcd->rcvhdrtail_kvaddr = NULL;
+ }
+ }
+
+ /* all the RcvArray entries should have been cleared by now */
+ kfree(rcd->egrbufs.rcvtids);
+
+ for (e = 0; e < rcd->egrbufs.alloced; e++) {
+ if (rcd->egrbufs.buffers[e].phys)
+ dma_free_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.buffers[e].len,
+ rcd->egrbufs.buffers[e].addr,
+ rcd->egrbufs.buffers[e].phys);
+ }
+ kfree(rcd->egrbufs.buffers);
+
+ sc_free(rcd->sc);
+ vfree(rcd->physshadow);
+ vfree(rcd->tid_pg_list);
+ vfree(rcd->user_event_mask);
+ vfree(rcd->subctxt_uregbase);
+ vfree(rcd->subctxt_rcvegrbuf);
+ vfree(rcd->subctxt_rcvhdr_base);
+ kfree(rcd->tidusemap);
+ kfree(rcd->opstats);
+ kfree(rcd);
+}
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ idr_remove(&hfi1_unit_table, dd->unit);
+ list_del(&dd->list);
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+ rcu_barrier(); /* wait for rcu callbacks to complete */
+ free_percpu(dd->int_counter);
+ free_percpu(dd->rcv_limit);
+ ib_dealloc_device(&dd->verbs_dev.ibdev);
+}
+
+/*
+ * Allocate our primary per-unit data structure. Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+ unsigned long flags;
+ struct hfi1_devdata *dd;
+ int ret;
+
+ dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
+ if (!dd)
+ return ERR_PTR(-ENOMEM);
+ /* extra is * number of ports */
+ dd->num_pports = extra / sizeof(struct hfi1_pportdata);
+ dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+ INIT_LIST_HEAD(&dd->list);
+ dd->node = dev_to_node(&pdev->dev);
+ if (dd->node < 0)
+ dd->node = 0;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+ ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+ if (ret >= 0) {
+ dd->unit = ret;
+ list_add(&dd->list, &hfi1_dev_list);
+ }
+
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ idr_preload_end();
+
+ if (ret < 0) {
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate unit ID: error %d\n", -ret);
+ goto bail;
+ }
+ /*
+ * Initialize all locks for the device. This needs to be as early as
+ * possible so locks are usable.
+ */
+ spin_lock_init(&dd->sc_lock);
+ spin_lock_init(&dd->sendctrl_lock);
+ spin_lock_init(&dd->rcvctrl_lock);
+ spin_lock_init(&dd->uctxt_lock);
+ spin_lock_init(&dd->hfi1_diag_trans_lock);
+ spin_lock_init(&dd->sc_init_lock);
+ spin_lock_init(&dd->dc8051_lock);
+ spin_lock_init(&dd->dc8051_memlock);
+ mutex_init(&dd->qsfp_i2c_mutex);
+ seqlock_init(&dd->sc2vl_lock);
+ spin_lock_init(&dd->sde_map_lock);
+ init_waitqueue_head(&dd->event_queue);
+
+ dd->int_counter = alloc_percpu(u64);
+ if (!dd->int_counter) {
+ ret = -ENOMEM;
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate per-cpu int_counter\n");
+ goto bail;
+ }
+
+ dd->rcv_limit = alloc_percpu(u64);
+ if (!dd->rcv_limit) {
+ ret = -ENOMEM;
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate per-cpu rcv_limit\n");
+ goto bail;
+ }
+
+ if (!hfi1_cpulist_count) {
+ u32 count = num_online_cpus();
+
+ hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
+ sizeof(long), GFP_KERNEL);
+ if (hfi1_cpulist)
+ hfi1_cpulist_count = count;
+ else
+ hfi1_early_err(
+ &pdev->dev,
+ "Could not alloc cpulist info, cpu affinity might be wrong\n");
+ }
+ hfi1_dbg_ibdev_init(&dd->verbs_dev);
+ return dd;
+
+bail:
+ if (!list_empty(&dd->list))
+ list_del_init(&dd->list);
+ ib_dealloc_device(&dd->verbs_dev.ibdev);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code. Should be paranoid about state of
+ * system and data structures.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+ if (dd->flags & HFI1_INITTED) {
+ u32 pidx;
+
+ dd->flags &= ~HFI1_INITTED;
+ if (dd->pport)
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ struct hfi1_pportdata *ppd;
+
+ ppd = dd->pport + pidx;
+ if (dd->flags & HFI1_PRESENT)
+ set_link_state(ppd, HLS_DN_DISABLE);
+
+ if (ppd->statusp)
+ *ppd->statusp &= ~HFI1_STATUS_IB_READY;
+ }
+ }
+
+ /*
+ * Mark as having had an error for driver, and also
+ * for /sys and status word mapped to user programs.
+ * This marks unit as not usable, until reset.
+ */
+ if (dd->status)
+ dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *);
+static int init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+static const struct pci_device_id hfi1_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+ .name = DRIVER_NAME,
+ .probe = init_one,
+ .remove = remove_one,
+ .id_table = hfi1_pci_tbl,
+ .err_handler = &hfi1_pci_err_handler,
+};
+
+static void __init compute_krcvqs(void)
+{
+ int i;
+
+ for (i = 0; i < krcvqsset; i++)
+ n_krcvqs += krcvqs[i];
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init hfi1_mod_init(void)
+{
+ int ret;
+
+ ret = dev_init();
+ if (ret)
+ goto bail;
+
+ /* validate max MTU before any devices start */
+ if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+ pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+ hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+ hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+ }
+ /* valid CUs run from 1-128 in powers of 2 */
+ if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+ hfi1_cu = 1;
+ /* valid credit return threshold is 0-100, variable is unsigned */
+ if (user_credit_return_threshold > 100)
+ user_credit_return_threshold = 100;
+
+ compute_krcvqs();
+ /* sanitize receive interrupt count, time must wait until after
+ the hardware type is known */
+ if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+ rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+ /* reject invalid combinations */
+ if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+ pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+ rcv_intr_count = 1;
+ }
+ if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+ /*
+ * Avoid indefinite packet delivery by requiring a timeout
+ * if count is > 1.
+ */
+ pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+ rcv_intr_timeout = 1;
+ }
+ if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+ /*
+ * The dynamic algorithm expects a non-zero timeout
+ * and a count > 1.
+ */
+ pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+ rcv_intr_dynamic = 0;
+ }
+
+ /* sanitize link CRC options */
+ link_crc_mask &= SUPPORTED_CRCS;
+
+ /*
+ * These must be called before the driver is registered with
+ * the PCI subsystem.
+ */
+ idr_init(&hfi1_unit_table);
+
+ hfi1_dbg_init();
+ ret = pci_register_driver(&hfi1_pci_driver);
+ if (ret < 0) {
+ pr_err("Unable to register driver: error %d\n", -ret);
+ goto bail_dev;
+ }
+ goto bail; /* all OK */
+
+bail_dev:
+ hfi1_dbg_exit();
+ idr_destroy(&hfi1_unit_table);
+ dev_cleanup();
+bail:
+ return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+ pci_unregister_driver(&hfi1_pci_driver);
+ hfi1_dbg_exit();
+ hfi1_cpulist_count = 0;
+ kfree(hfi1_cpulist);
+
+ idr_destroy(&hfi1_unit_table);
+ dispose_firmware(); /* asymmetric with obtain_firmware() */
+ dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+ int ctxt;
+ int pidx;
+ struct hfi1_ctxtdata **tmp;
+ unsigned long flags;
+
+ /* users can't do anything more with chip */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ struct hfi1_pportdata *ppd = &dd->pport[pidx];
+ struct cc_state *cc_state;
+ int i;
+
+ if (ppd->statusp)
+ *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+ for (i = 0; i < OPA_MAX_SLS; i++)
+ hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
+
+ spin_lock(&ppd->cc_state_lock);
+ cc_state = get_cc_state(ppd);
+ rcu_assign_pointer(ppd->cc_state, NULL);
+ spin_unlock(&ppd->cc_state_lock);
+
+ if (cc_state)
+ call_rcu(&cc_state->rcu, cc_state_reclaim);
+ }
+
+ free_credit_return(dd);
+
+ /*
+ * Free any resources still in use (usually just kernel contexts)
+ * at unload; we do for ctxtcnt, because that's what we allocate.
+ * We acquire lock to be really paranoid that rcd isn't being
+ * accessed from some interrupt-related code (that should not happen,
+ * but best to be sure).
+ */
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ tmp = dd->rcd;
+ dd->rcd = NULL;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
+ struct hfi1_ctxtdata *rcd = tmp[ctxt];
+
+ tmp[ctxt] = NULL; /* debugging paranoia */
+ if (rcd) {
+ hfi1_clear_tids(rcd);
+ hfi1_free_ctxtdata(dd, rcd);
+ }
+ }
+ kfree(tmp);
+ /* must follow rcv context free - need to remove rcv's hooks */
+ for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
+ sc_free(dd->send_contexts[ctxt].sc);
+ dd->num_send_contexts = 0;
+ kfree(dd->send_contexts);
+ dd->send_contexts = NULL;
+ kfree(dd->boardname);
+ vfree(dd->events);
+ vfree(dd->status);
+ hfi1_cq_exit(dd);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+ hfi1_start_cleanup(dd);
+
+ hfi1_pcie_ddcleanup(dd);
+ hfi1_pcie_cleanup(dd->pcidev);
+
+ cleanup_device_data(dd);
+
+ hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret = 0, j, pidx, initfail;
+ struct hfi1_devdata *dd = NULL;
+
+ /* First, lock the non-writable module parameters */
+ HFI1_CAP_LOCK();
+
+ /* Validate some global module parameters */
+ if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+ hfi1_early_err(&pdev->dev, "Header queue count too small\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* use the encoding function as a sanitization check */
+ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+ hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+ hfi1_hdrq_entsize);
+ goto bail;
+ }
+
+ /* The receive eager buffer size must be set before the receive
+ * contexts are created.
+ *
+ * Set the eager buffer size. Validate that it falls in a range
+ * allowed by the hardware - all powers of 2 between the min and
+ * max. The maximum valid MTU is within the eager buffer range
+ * so we do not need to cap the max_mtu by an eager buffer size
+ * setting.
+ */
+ if (eager_buffer_size) {
+ if (!is_power_of_2(eager_buffer_size))
+ eager_buffer_size =
+ roundup_pow_of_two(eager_buffer_size);
+ eager_buffer_size =
+ clamp_val(eager_buffer_size,
+ MIN_EAGER_BUFFER * 8,
+ MAX_EAGER_BUFFER_TOTAL);
+ hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+ eager_buffer_size);
+ } else {
+ hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* restrict value of hfi1_rcvarr_split */
+ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+ ret = hfi1_pcie_init(pdev, ent);
+ if (ret)
+ goto bail;
+
+ /*
+ * Do device-specific initialization, function table setup, dd
+ * allocation, etc.
+ */
+ switch (ent->device) {
+ case PCI_DEVICE_ID_INTEL0:
+ case PCI_DEVICE_ID_INTEL1:
+ dd = hfi1_init_dd(pdev, ent);
+ break;
+ default:
+ hfi1_early_err(&pdev->dev,
+ "Failing on unknown Intel deviceid 0x%x\n",
+ ent->device);
+ ret = -ENODEV;
+ }
+
+ if (IS_ERR(dd))
+ ret = PTR_ERR(dd);
+ if (ret)
+ goto clean_bail; /* error already printed */
+
+ ret = create_workqueues(dd);
+ if (ret)
+ goto clean_bail;
+
+ /* do the generic initialization */
+ initfail = hfi1_init(dd, 0);
+
+ ret = hfi1_register_ib_device(dd);
+
+ /*
+ * Now ready for use. this should be cleared whenever we
+ * detect a reset, or initiate one. If earlier failure,
+ * we still create devices, so diags, etc. can be used
+ * to determine cause of problem.
+ */
+ if (!initfail && !ret)
+ dd->flags |= HFI1_INITTED;
+
+ j = hfi1_device_create(dd);
+ if (j)
+ dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+ if (initfail || ret) {
+ stop_timers(dd);
+ flush_workqueue(ib_wq);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ hfi1_quiet_serdes(dd->pport + pidx);
+ if (!j)
+ hfi1_device_remove(dd);
+ if (!ret)
+ hfi1_unregister_ib_device(dd);
+ postinit_cleanup(dd);
+ if (initfail)
+ ret = initfail;
+ goto bail; /* everything already cleaned */
+ }
+
+ sdma_start(dd);
+
+ return 0;
+
+clean_bail:
+ hfi1_pcie_cleanup(pdev);
+bail:
+ return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ /* unregister from IB core */
+ hfi1_unregister_ib_device(dd);
+
+ /*
+ * Disable the IB link, disable interrupts on the device,
+ * clear dma engines, etc.
+ */
+ shutdown_device(dd);
+
+ stop_timers(dd);
+
+ /* wait until all of our (qsfp) queue_work() calls complete */
+ flush_workqueue(ib_wq);
+
+ hfi1_device_remove(dd);
+
+ postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+ unsigned amt;
+ u64 reg;
+
+ if (!rcd->rcvhdrq) {
+ dma_addr_t phys_hdrqtail;
+ gfp_t gfp_flags;
+
+ /*
+ * rcvhdrqentsize is in DWs, so we have to convert to bytes
+ * (* sizeof(u32)).
+ */
+ amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+ sizeof(u32), PAGE_SIZE);
+
+ gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+ GFP_USER : GFP_KERNEL;
+ rcd->rcvhdrq = dma_zalloc_coherent(
+ &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+ gfp_flags | __GFP_COMP);
+
+ if (!rcd->rcvhdrq) {
+ dd_dev_err(dd,
+ "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+ amt, rcd->ctxt);
+ goto bail;
+ }
+
+ /* Event mask is per device now and is in hfi1_devdata */
+ /*if (rcd->ctxt >= dd->first_user_ctxt) {
+ rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
+ if (!rcd->user_event_mask)
+ goto bail_free_hdrq;
+ }*/
+
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+ rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+ gfp_flags);
+ if (!rcd->rcvhdrtail_kvaddr)
+ goto bail_free;
+ rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+ }
+
+ rcd->rcvhdrq_size = amt;
+ }
+ /*
+ * These values are per-context:
+ * RcvHdrCnt
+ * RcvHdrEntSize
+ * RcvHdrSize
+ */
+ reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+ & RCV_HDR_CNT_CNT_MASK)
+ << RCV_HDR_CNT_CNT_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+ reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+ & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+ << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+ reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+ << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+ return 0;
+
+bail_free:
+ dd_dev_err(dd,
+ "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+ rcd->ctxt);
+ vfree(rcd->user_event_mask);
+ rcd->user_event_mask = NULL;
+ dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+ rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+bail:
+ return -ENOMEM;
+}
+
+/**
+ * allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into hip.
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls. Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+ gfp_t gfp_flags;
+ u16 order;
+ int ret = 0;
+ u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+ /*
+ * GFP_USER, but without GFP_FS, so buffer cache can be
+ * coalesced (we hope); otherwise, even at order 4,
+ * heavy filesystem activity makes these fail, and we can
+ * use compound pages.
+ */
+ gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+ /*
+ * The minimum size of the eager buffers is a groups of MTU-sized
+ * buffers.
+ * The global eager_buffer_size parameter is checked against the
+ * theoretical lower limit of the value. Here, we check against the
+ * MTU.
+ */
+ if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+ rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+ /*
+ * If using one-pkt-per-egr-buffer, lower the eager buffer
+ * size to the max MTU (page-aligned).
+ */
+ if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+ rcd->egrbufs.rcvtid_size = round_mtu;
+
+ /*
+ * Eager buffers sizes of 1MB or less require smaller TID sizes
+ * to satisfy the "multiple of 8 RcvArray entries" requirement.
+ */
+ if (rcd->egrbufs.size <= (1 << 20))
+ rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+ rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+ while (alloced_bytes < rcd->egrbufs.size &&
+ rcd->egrbufs.alloced < rcd->egrbufs.count) {
+ rcd->egrbufs.buffers[idx].addr =
+ dma_zalloc_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.rcvtid_size,
+ &rcd->egrbufs.buffers[idx].phys,
+ gfp_flags);
+ if (rcd->egrbufs.buffers[idx].addr) {
+ rcd->egrbufs.buffers[idx].len =
+ rcd->egrbufs.rcvtid_size;
+ rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+ rcd->egrbufs.buffers[idx].addr;
+ rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+ rcd->egrbufs.buffers[idx].phys;
+ rcd->egrbufs.alloced++;
+ alloced_bytes += rcd->egrbufs.rcvtid_size;
+ idx++;
+ } else {
+ u32 new_size, i, j;
+ u64 offset = 0;
+
+ /*
+ * Fail the eager buffer allocation if:
+ * - we are already using the lowest acceptable size
+ * - we are using one-pkt-per-egr-buffer (this implies
+ * that we are accepting only one size)
+ */
+ if (rcd->egrbufs.rcvtid_size == round_mtu ||
+ !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+ dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+ rcd->ctxt);
+ goto bail_rcvegrbuf_phys;
+ }
+
+ new_size = rcd->egrbufs.rcvtid_size / 2;
+
+ /*
+ * If the first attempt to allocate memory failed, don't
+ * fail everything but continue with the next lower
+ * size.
+ */
+ if (idx == 0) {
+ rcd->egrbufs.rcvtid_size = new_size;
+ continue;
+ }
+
+ /*
+ * Re-partition already allocated buffers to a smaller
+ * size.
+ */
+ rcd->egrbufs.alloced = 0;
+ for (i = 0, j = 0, offset = 0; j < idx; i++) {
+ if (i >= rcd->egrbufs.count)
+ break;
+ rcd->egrbufs.rcvtids[i].phys =
+ rcd->egrbufs.buffers[j].phys + offset;
+ rcd->egrbufs.rcvtids[i].addr =
+ rcd->egrbufs.buffers[j].addr + offset;
+ rcd->egrbufs.alloced++;
+ if ((rcd->egrbufs.buffers[j].phys + offset +
+ new_size) ==
+ (rcd->egrbufs.buffers[j].phys +
+ rcd->egrbufs.buffers[j].len)) {
+ j++;
+ offset = 0;
+ } else
+ offset += new_size;
+ }
+ rcd->egrbufs.rcvtid_size = new_size;
+ }
+ }
+ rcd->egrbufs.numbufs = idx;
+ rcd->egrbufs.size = alloced_bytes;
+
+ dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+ rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
+ rcd->egrbufs.size);
+
+ /*
+ * Set the contexts rcv array head update threshold to the closest
+ * power of 2 (so we can use a mask instead of modulo) below half
+ * the allocated entries.
+ */
+ rcd->egrbufs.threshold =
+ rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+ /*
+ * Compute the expected RcvArray entry base. This is done after
+ * allocating the eager buffers in order to maximize the
+ * expected RcvArray entries for the context.
+ */
+ max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+ egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+ rcd->expected_count = max_entries - egrtop;
+ if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+ rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+ rcd->expected_base = rcd->eager_base + egrtop;
+ dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+ rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+ rcd->eager_base, rcd->expected_base);
+
+ if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+ dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n",
+ rcd->ctxt, rcd->egrbufs.rcvtid_size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+ hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+ rcd->egrbufs.rcvtids[idx].phys, order);
+ cond_resched();
+ }
+ goto bail;
+
+bail_rcvegrbuf_phys:
+ for (idx = 0; idx < rcd->egrbufs.alloced &&
+ rcd->egrbufs.buffers[idx].addr;
+ idx++) {
+ dma_free_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.buffers[idx].len,
+ rcd->egrbufs.buffers[idx].addr,
+ rcd->egrbufs.buffers[idx].phys);
+ rcd->egrbufs.buffers[idx].addr = NULL;
+ rcd->egrbufs.buffers[idx].phys = 0;
+ rcd->egrbufs.buffers[idx].len = 0;
+ }
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
new file mode 100644
index 000000000000..426582b9ab65
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/intr.c
@@ -0,0 +1,207 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+ strlcat(msg, "[", msgl);
+ strlcat(msg, hwmsg, msgl);
+ strlcat(msg, "]", msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+ int i;
+
+ for (i = 0; i < nhwerrmsgs; i++)
+ if (hwerrs & hwerrmsgs[i].mask)
+ format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
+
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+ struct ib_event event;
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /*
+ * Only call ib_dispatch_event() if the IB device has been
+ * registered. HFI1_INITED is set iff the driver has successfully
+ * registered with the IB core.
+ */
+ if (!(dd->flags & HFI1_INITTED))
+ return;
+ event.device = &dd->verbs_dev.ibdev;
+ event.element.port_num = ppd->port;
+ event.event = ev;
+ ib_dispatch_event(&event);
+}
+
+/*
+ * Handle a linkup or link down notification.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+ struct hfi1_pportdata *ppd = &dd->pport[0];
+ enum ib_event_type ev;
+
+ if (!(ppd->linkup ^ !!linkup))
+ return; /* no change, nothing to do */
+
+ if (linkup) {
+ /*
+ * Quick linkup and all link up on the simulator does not
+ * trigger or implement:
+ * - VerifyCap interrupt
+ * - VerifyCap frames
+ * But rather moves directly to LinkUp.
+ *
+ * Do the work of the VerifyCap interrupt handler,
+ * handle_verify_cap(), but do not try moving the state to
+ * LinkUp as we are already there.
+ *
+ * NOTE: This uses this device's vAU, vCU, and vl15_init for
+ * the remote values. Both sides must be using the values.
+ */
+ if (quick_linkup
+ || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ set_up_vl15(dd, dd->vau, dd->vl15_init);
+ assign_remote_cm_au_table(dd, dd->vcu);
+ ppd->neighbor_guid =
+ read_csr(dd,
+ DC_DC8051_STS_REMOTE_GUID);
+ ppd->neighbor_type =
+ read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+ DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+ ppd->neighbor_port_number =
+ read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+ DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+ dd_dev_info(dd,
+ "Neighbor GUID: %llx Neighbor type %d\n",
+ ppd->neighbor_guid,
+ ppd->neighbor_type);
+ }
+
+ /* physical link went up */
+ ppd->linkup = 1;
+ ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+
+ /* link widths are not available until the link is fully up */
+ get_linkup_link_widths(ppd);
+
+ } else {
+ /* physical link went down */
+ ppd->linkup = 0;
+
+ /* clear HW details of the previous connection */
+ reset_link_credits(dd);
+
+ /* freeze after a link down to guarantee a clean egress */
+ start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN);
+
+ ev = IB_EVENT_PORT_ERR;
+
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+ /* if we are down, the neighbor is down */
+ ppd->neighbor_normal = 0;
+
+ /* notify IB of the link change */
+ signal_ib_event(ppd, ev);
+ }
+
+
+}
+
+/*
+ * Handle receive or urgent interrupts for user contexts. This means a user
+ * process was waiting for a packet to arrive, and didn't want to poll.
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ if (!rcd->cnt)
+ goto done;
+
+ if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
+ wake_up_interruptible(&rcd->wait);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+ } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+ &rcd->event_flags)) {
+ rcd->urgent++;
+ wake_up_interruptible(&rcd->wait);
+ }
+done:
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
new file mode 100644
index 000000000000..fa361b405851
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/iowait.h
@@ -0,0 +1,186 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entry's in tx_head'ed list
+ *
+ * This is to be embedded in user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a
+ * bit misnamed. They do not strictly
+ * speaking sleep or wake up, but they
+ * are callbacks for the ULP to implement
+ * what ever queuing/dequeuing of
+ * the embedded iowait and its containing struct
+ * when a resource shortage like SDMA ring space is seen.
+ *
+ * Both potentially have locks help
+ * so sleeping is not allowed.
+ *
+ * The wait_dma member along with the iow
+ */
+
+struct iowait {
+ struct list_head list;
+ struct list_head tx_head;
+ int (*sleep)(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx,
+ unsigned seq);
+ void (*wakeup)(struct iowait *wait, int reason);
+ struct work_struct iowork;
+ wait_queue_head_t wait_dma;
+ atomic_t sdma_busy;
+ u32 count;
+ u32 tx_limit;
+ u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+
+static inline void iowait_init(
+ struct iowait *wait,
+ u32 tx_limit,
+ void (*func)(struct work_struct *work),
+ int (*sleep)(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx,
+ unsigned seq),
+ void (*wakeup)(struct iowait *wait, int reason))
+{
+ wait->count = 0;
+ INIT_LIST_HEAD(&wait->list);
+ INIT_LIST_HEAD(&wait->tx_head);
+ INIT_WORK(&wait->iowork, func);
+ init_waitqueue_head(&wait->wait_dma);
+ atomic_set(&wait->sdma_busy, 0);
+ wait->tx_limit = tx_limit;
+ wait->sleep = sleep;
+ wait->wakeup = wakeup;
+}
+
+/**
+ * iowait_schedule() - initialize wait structure
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ */
+static inline void iowait_schedule(
+ struct iowait *wait,
+ struct workqueue_struct *wq)
+{
+ queue_work(wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+ wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_drain_wakeup() - trigger iowait_drain() waiter
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+ wake_up(&wait->wait_dma);
+}
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c
new file mode 100644
index 000000000000..f6eff177ace1
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/keys.c
@@ -0,0 +1,411 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/**
+ * hfi1_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field mr for non-dma regions.
+ *
+ */
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+ int ret = 0;
+ struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+ struct hfi1_lkey_table *rkt = &dev->lk_table;
+
+ hfi1_get_mr(mr);
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* special case for dma_mr lkey == 0 */
+ if (dma_region) {
+ struct hfi1_mregion *tmr;
+
+ tmr = rcu_access_pointer(dev->dma_mr);
+ if (!tmr) {
+ rcu_assign_pointer(dev->dma_mr, mr);
+ mr->lkey_published = 1;
+ } else {
+ hfi1_put_mr(mr);
+ }
+ goto success;
+ }
+
+ /* Find the next available LKEY */
+ r = rkt->next;
+ n = r;
+ for (;;) {
+ if (!rcu_access_pointer(rkt->table[r]))
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n)
+ goto bail;
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
+ /*
+ * bits are capped in verbs.c to ensure enough bits for
+ * generation number
+ */
+ mr->lkey = (r << (32 - hfi1_lkey_table_size)) |
+ ((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen)
+ << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ rcu_assign_pointer(rkt->table[r], mr);
+ mr->lkey_published = 1;
+success:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+ return ret;
+bail:
+ hfi1_put_mr(mr);
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = -ENOMEM;
+ goto out;
+}
+
+/**
+ * hfi1_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+void hfi1_free_lkey(struct hfi1_mregion *mr)
+{
+ unsigned long flags;
+ u32 lkey = mr->lkey;
+ u32 r;
+ struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+ struct hfi1_lkey_table *rkt = &dev->lk_table;
+ int freed = 0;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+ if (!mr->lkey_published)
+ goto out;
+ if (lkey == 0)
+ RCU_INIT_POINTER(dev->dma_mr, NULL);
+ else {
+ r = lkey >> (32 - hfi1_lkey_table_size);
+ RCU_INIT_POINTER(rkt->table[r], NULL);
+ }
+ mr->lkey_published = 0;
+ freed++;
+out:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ if (freed) {
+ synchronize_rcu();
+ hfi1_put_mr(mr);
+ }
+}
+
+/**
+ * hfi1_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * increments the reference count upon success
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+ struct hfi1_sge *isge, struct ib_sge *sge, int acc)
+{
+ struct hfi1_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use LKEY == zero for kernel virtual addresses
+ * (see hfi1_get_dma_mr and dma.c).
+ */
+ rcu_read_lock();
+ if (sge->lkey == 0) {
+ struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+ if (pd->user)
+ goto bail;
+ mr = rcu_dereference(dev->dma_mr);
+ if (!mr)
+ goto bail;
+ atomic_inc(&mr->refcount);
+ rcu_read_unlock();
+
+ isge->mr = mr;
+ isge->vaddr = (void *) sge->addr;
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ isge->m = 0;
+ isge->n = 0;
+ goto ok;
+ }
+ mr = rcu_dereference(
+ rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+ goto bail;
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc))
+ goto bail;
+ atomic_inc(&mr->refcount);
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+ page sizes are uniform power of 2 so no loop is necessary
+ entries_spanned_by_off is the number of times the loop below
+ would have executed.
+ */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off / HFI1_SEGSZ;
+ n = entries_spanned_by_off % HFI1_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ isge->mr = mr;
+ isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ isge->length = mr->map[m]->segs[n].length - off;
+ isge->sge_length = sge->length;
+ isge->m = m;
+ isge->n = n;
+ok:
+ return 1;
+bail:
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * hfi1_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct hfi1_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use RKEY == zero for kernel virtual addresses
+ * (see hfi1_get_dma_mr and dma.c).
+ */
+ rcu_read_lock();
+ if (rkey == 0) {
+ struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+ struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+ if (pd->user)
+ goto bail;
+ mr = rcu_dereference(dev->dma_mr);
+ if (!mr)
+ goto bail;
+ atomic_inc(&mr->refcount);
+ rcu_read_unlock();
+
+ sge->mr = mr;
+ sge->vaddr = (void *) vaddr;
+ sge->length = len;
+ sge->sge_length = len;
+ sge->m = 0;
+ sge->n = 0;
+ goto ok;
+ }
+
+ mr = rcu_dereference(
+ rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+ goto bail;
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0))
+ goto bail;
+ atomic_inc(&mr->refcount);
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+ page sizes are uniform power of 2 so no loop is necessary
+ entries_spanned_by_off is the number of times the loop below
+ would have executed.
+ */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off / HFI1_SEGSZ;
+ n = entries_spanned_by_off % HFI1_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ sge->mr = mr;
+ sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ sge->length = mr->map[m]->segs[n].length - off;
+ sge->sge_length = len;
+ sge->m = m;
+ sge->n = n;
+ok:
+ return 1;
+bail:
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
+ * Initialize the memory region specified by the work request.
+ */
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+ struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+ struct hfi1_mregion *mr;
+ u32 rkey = wr->wr.fast_reg.rkey;
+ unsigned i, n, m;
+ int ret = -EINVAL;
+ unsigned long flags;
+ u64 *page_list;
+ size_t ps;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+ if (pd->user || rkey == 0)
+ goto bail;
+
+ mr = rcu_dereference_protected(
+ rkt->table[(rkey >> (32 - hfi1_lkey_table_size))],
+ lockdep_is_held(&rkt->lock));
+ if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
+ goto bail;
+
+ if (wr->wr.fast_reg.page_list_len > mr->max_segs)
+ goto bail;
+
+ ps = 1UL << wr->wr.fast_reg.page_shift;
+ if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
+ goto bail;
+
+ mr->user_base = wr->wr.fast_reg.iova_start;
+ mr->iova = wr->wr.fast_reg.iova_start;
+ mr->lkey = rkey;
+ mr->length = wr->wr.fast_reg.length;
+ mr->access_flags = wr->wr.fast_reg.access_flags;
+ page_list = wr->wr.fast_reg.page_list->page_list;
+ m = 0;
+ n = 0;
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+ mr->map[m]->segs[n].vaddr = (void *) page_list[i];
+ mr->map[m]->segs[n].length = ps;
+ if (++n == HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = 0;
+bail:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
new file mode 100644
index 000000000000..b2c1b72d38ce
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.c
@@ -0,0 +1,4257 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+ / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+/* the reset value from the FM is supposed to be 0xffff, handle both */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+static int reply(struct ib_mad_hdr *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+ void *data = opa_get_smp_data(smp);
+ size_t size = opa_get_smp_data_size(smp);
+
+ memset(data, 0, size);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent;
+ struct ib_smp *smp;
+ int ret;
+ unsigned long flags;
+ unsigned long timeout;
+ int pkey_idx;
+ u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+ agent = ibp->send_agent;
+ if (!agent)
+ return;
+
+ /* o14-3.2.1 */
+ if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+ return;
+
+ /* o14-2 */
+ if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+ return;
+
+ pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+ if (pkey_idx < 0) {
+ pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
+ __func__, hfi1_get_pkey(ibp, 1));
+ pkey_idx = 1;
+ }
+
+ send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+ IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+ if (IS_ERR(send_buf))
+ return;
+
+ smp = send_buf->mad;
+ smp->base_version = IB_MGMT_BASE_VERSION;
+ smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ smp->class_version = 1;
+ smp->method = IB_MGMT_METHOD_TRAP;
+ ibp->tid++;
+ smp->tid = cpu_to_be64(ibp->tid);
+ smp->attr_id = IB_SMP_ATTR_NOTICE;
+ /* o14-1: smp->mkey = 0; */
+ memcpy(smp->data, data, len);
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ if (!ibp->sm_ah) {
+ if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+ struct ib_ah *ah;
+
+ ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid);
+ if (IS_ERR(ah))
+ ret = PTR_ERR(ah);
+ else {
+ send_buf->ah = ah;
+ ibp->sm_ah = to_iah(ah);
+ ret = 0;
+ }
+ } else
+ ret = -EINVAL;
+ } else {
+ send_buf->ah = &ibp->sm_ah->ibah;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ if (!ret)
+ ret = ib_post_send_mad(send_buf, NULL);
+ if (!ret) {
+ /* 4.096 usec. */
+ timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
+ ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
+ } else {
+ ib_free_send_mad(send_buf);
+ ibp->trap_timeout = 0;
+ }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
+{
+ struct ib_mad_notice_attr data;
+
+ if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
+ ibp->pkey_violations++;
+ else
+ ibp->qkey_violations++;
+ ibp->n_pkt_drops++;
+
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = trap_num;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_257_258.lid1 = lid1;
+ data.details.ntc_257_258.lid2 = lid2;
+ data.details.ntc_257_258.key = cpu_to_be32(key);
+ data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
+ data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+ __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+ struct ib_mad_notice_attr data;
+
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_256.lid = data.issuer_lid;
+ data.details.ntc_256.method = mad->method;
+ data.details.ntc_256.attr_id = mad->attr_id;
+ data.details.ntc_256.attr_mod = mad->attr_mod;
+ data.details.ntc_256.mkey = mkey;
+ if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+
+ data.details.ntc_256.dr_slid = (__force __be16)dr_slid;
+ data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+ if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
+ data.details.ntc_256.dr_trunc_hop |=
+ IB_NOTICE_TRAP_DR_TRUNC;
+ hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
+ }
+ data.details.ntc_256.dr_trunc_hop |= hop_cnt;
+ memcpy(data.details.ntc_256.dr_rtn_path, return_path,
+ hop_cnt);
+ }
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_144.lid = data.issuer_lid;
+ data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_145.lid = data.issuer_lid;
+ data.details.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_144.lid = data.issuer_lid;
+ data.details.ntc_144.local_changes = 1;
+ data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct opa_node_description *nd;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ nd = (struct opa_node_description *)data;
+
+ memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
+
+ if (resp_len)
+ *resp_len += sizeof(*nd);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_node_info *ni;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+ ni = (struct opa_node_info *)data;
+
+ /* GUID 0 is illegal */
+ if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+ ni->base_version = OPA_MGMT_BASE_VERSION;
+ ni->class_version = OPA_SMI_CLASS_VERSION;
+ ni->node_type = 1; /* channel adapter */
+ ni->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ ni->system_image_guid = ib_hfi1_sys_image_guid;
+ /* Use first-port GUID as node */
+ ni->node_guid = cpu_to_be64(dd->pport->guid);
+ ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+ ni->device_id = cpu_to_be16(dd->pcidev->device);
+ ni->revision = cpu_to_be32(dd->minrev);
+ ni->local_port_num = port;
+ ni->vendor_id[0] = dd->oui1;
+ ni->vendor_id[1] = dd->oui2;
+ ni->vendor_id[2] = dd->oui3;
+
+ if (resp_len)
+ *resp_len += sizeof(*ni);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+ /* GUID 0 is illegal */
+ if (smp->attr_mod || pidx >= dd->num_pports ||
+ dd->pport[pidx].guid == 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+
+ nip->base_version = OPA_MGMT_BASE_VERSION;
+ nip->class_version = OPA_SMI_CLASS_VERSION;
+ nip->node_type = 1; /* channel adapter */
+ nip->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ nip->sys_guid = ib_hfi1_sys_image_guid;
+ /* Use first-port GUID as node */
+ nip->node_guid = cpu_to_be64(dd->pport->guid);
+ nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+ nip->device_id = cpu_to_be16(dd->pcidev->device);
+ nip->revision = cpu_to_be32(dd->minrev);
+ nip->local_port_num = port;
+ nip->vendor_id[0] = dd->oui1;
+ nip->vendor_id[1] = dd->oui2;
+ nip->vendor_id[2] = dd->oui3;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+ int mad_flags, __be64 mkey, __be32 dr_slid,
+ u8 return_path[], u8 hop_cnt)
+{
+ int valid_mkey = 0;
+ int ret = 0;
+
+ /* Is the mkey in the process of expiring? */
+ if (ibp->mkey_lease_timeout &&
+ time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+ /* Clear timeout and mkey protection field. */
+ ibp->mkey_lease_timeout = 0;
+ ibp->mkeyprot = 0;
+ }
+
+ if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 ||
+ ibp->mkey == mkey)
+ valid_mkey = 1;
+
+ /* Unset lease timeout on any valid Get/Set/TrapRepress */
+ if (valid_mkey && ibp->mkey_lease_timeout &&
+ (mad->method == IB_MGMT_METHOD_GET ||
+ mad->method == IB_MGMT_METHOD_SET ||
+ mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+ ibp->mkey_lease_timeout = 0;
+
+ if (!valid_mkey) {
+ switch (mad->method) {
+ case IB_MGMT_METHOD_GET:
+ /* Bad mkey not a violation below level 2 */
+ if (ibp->mkeyprot < 2)
+ break;
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (ibp->mkey_violations != 0xFFFF)
+ ++ibp->mkey_violations;
+ if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
+ ibp->mkey_lease_timeout = jiffies +
+ ibp->mkey_lease_period * HZ;
+ /* Generate a trap notice. */
+ bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+ hop_cnt);
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ */
+struct lcb_datum {
+ u32 off;
+ u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+ { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+static int write_lcb_cache(u32 off, u64 val)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+ if (lcb_cache[i].off == off) {
+ lcb_cache[i].val = val;
+ return 0;
+ }
+ }
+
+ pr_warn("%s bad offset 0x%x\n", __func__, off);
+ return -1;
+}
+
+static int read_lcb_cache(u32 off, u64 *val)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+ if (lcb_cache[i].off == off) {
+ *val = lcb_cache[i].val;
+ return 0;
+ }
+ }
+
+ pr_warn("%s bad offset 0x%x\n", __func__, off);
+ return -1;
+}
+
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
+ dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+ else
+ write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
+}
+
+static u8 __opa_porttype(struct hfi1_pportdata *ppd)
+{
+ if (qsfp_mod_present(ppd)) {
+ if (ppd->qsfp_info.cache_valid)
+ return OPA_PORT_TYPE_STANDARD;
+ return OPA_PORT_TYPE_DISCONNECTED;
+ }
+ return OPA_PORT_TYPE_UNKNOWN;
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ struct opa_port_info *pi = (struct opa_port_info *)data;
+ u8 mtu;
+ u8 credit_rate;
+ u32 state;
+ u32 num_ports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 buffer_units;
+ u64 tmp = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ ibp = &ppd->ibport_data;
+
+ if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+ ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ pi->lid = cpu_to_be32(ppd->lid);
+
+ /* Only return the mkey if the protection field allows it. */
+ if (!(smp->method == IB_MGMT_METHOD_GET &&
+ ibp->mkey != smp->mkey &&
+ ibp->mkeyprot == 1))
+ pi->mkey = ibp->mkey;
+
+ pi->subnet_prefix = ibp->gid_prefix;
+ pi->sm_lid = cpu_to_be32(ibp->sm_lid);
+ pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+ pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+ pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+ pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+ pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+ pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+ pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+ pi->link_width_downgrade.supported =
+ cpu_to_be16(ppd->link_width_downgrade_supported);
+ pi->link_width_downgrade.enabled =
+ cpu_to_be16(ppd->link_width_downgrade_enabled);
+ pi->link_width_downgrade.tx_active =
+ cpu_to_be16(ppd->link_width_downgrade_tx_active);
+ pi->link_width_downgrade.rx_active =
+ cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+ pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+ pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+ pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+ state = driver_lstate(ppd);
+
+ if (start_of_sm_config && (state == IB_PORT_INIT))
+ ppd->is_sm_config_started = 1;
+
+ pi->port_phys_conf = __opa_porttype(ppd) & 0xf;
+
+#if PI_LED_ENABLE_SUP
+ pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+ pi->port_states.ledenable_offlinereason |=
+ ppd->is_sm_config_started << 5;
+ pi->port_states.ledenable_offlinereason |=
+ ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+ pi->port_states.offline_reason = ppd->neighbor_normal << 4;
+ pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+ pi->port_states.offline_reason |= ppd->offline_disabled_reason &
+ OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+ pi->port_states.portphysstate_portstate =
+ (hfi1_ibphys_portstate(ppd) << 4) | state;
+
+ pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+
+ memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+ for (i = 0; i < ppd->vls_supported; i++) {
+ mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+ if ((i % 2) == 0)
+ pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4);
+ else
+ pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu;
+ }
+ /* don't forget VL 15 */
+ mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+ pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu;
+ pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL;
+ pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+ pi->partenforce_filterraw |=
+ (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+ if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+ pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+ if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+ pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+ pi->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+ /* P_KeyViolations are counted by hardware. */
+ pi->pkey_violations = cpu_to_be16(ibp->pkey_violations);
+ pi->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+
+ pi->vl.cap = ppd->vls_supported;
+ pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit);
+ pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+ pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+ pi->clientrereg_subnettimeout = ibp->subnet_timeout;
+
+ pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+ OPA_PORT_LINK_MODE_OPA << 5 |
+ OPA_PORT_LINK_MODE_OPA);
+
+ pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+ pi->port_mode = cpu_to_be16(
+ ppd->is_active_optimize_enabled ?
+ OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+ pi->port_packet_format.supported =
+ cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+ pi->port_packet_format.enabled =
+ cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+ /* flit_control.interleave is (OPA V1, version .76):
+ * bits use
+ * ---- ---
+ * 2 res
+ * 2 DistanceSupported
+ * 2 DistanceEnabled
+ * 5 MaxNextLevelTxEnabled
+ * 5 MaxNestLevelRxSupported
+ *
+ * HFI supports only "distance mode 1" (see OPA V1, version .76,
+ * section 9.6.2), so set DistanceSupported, DistanceEnabled
+ * to 0x1.
+ */
+ pi->flit_control.interleave = cpu_to_be16(0x1400);
+
+ pi->link_down_reason = ppd->local_link_down_reason.sma;
+ pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+ pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+ pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+ /* 32.768 usec. response time (guessing) */
+ pi->resptimevalue = 3;
+
+ pi->local_port_num = port;
+
+ /* buffer info for FM */
+ pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+ pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+ pi->neigh_port_num = ppd->neighbor_port_number;
+ pi->port_neigh_mode =
+ (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+ (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+ (ppd->neighbor_fm_security ?
+ OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+ /* HFIs shall always return VL15 credits to their
+ * neighbor in a timely manner, without any credit return pacing.
+ */
+ credit_rate = 0;
+ buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+ buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+ buffer_units |= (credit_rate << 6) &
+ OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+ buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+ pi->buffer_units = cpu_to_be32(buffer_units);
+
+ pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+ /* HFI supports a replay buffer 128 LTPs in size */
+ pi->replay_depth.buffer = 0x80;
+ /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+ read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+ /* this counter is 16 bits wide, but the replay_depth.wire
+ * variable is only 8 bits */
+ if (tmp > 0xff)
+ tmp = 0xff;
+ pi->replay_depth.wire = tmp;
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_port_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct hfi1_pportdata *ppd = dd->pport + port - 1;
+
+ memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
+
+ return 0;
+}
+
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 n_blocks_req = OPA_AM_NBLK(am);
+ u32 start_block = am & 0x7ff;
+ __be16 *p;
+ u16 *q;
+ int i;
+ u16 n_blocks_avail;
+ unsigned npkeys = hfi1_get_npkeys(dd);
+ size_t size;
+
+ if (n_blocks_req == 0) {
+ pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+ port, start_block, n_blocks_req);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+ size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
+
+ if (start_block + n_blocks_req > n_blocks_avail ||
+ n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+ pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
+ "avail 0x%x; blk/smp 0x%lx\n",
+ start_block, n_blocks_req, n_blocks_avail,
+ OPA_NUM_PKEY_BLOCKS_PER_SMP);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ p = (__be16 *) data;
+ q = (u16 *)data;
+ /* get the real pkeys if we are requesting the first block */
+ if (start_block == 0) {
+ get_pkeys(dd, port, q);
+ for (i = 0; i < npkeys; i++)
+ p[i] = cpu_to_be16(q[i]);
+ if (resp_len)
+ *resp_len += size;
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+enum {
+ HFI_TRANSITION_DISALLOWED,
+ HFI_TRANSITION_IGNORED,
+ HFI_TRANSITION_ALLOWED,
+ HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of
+ * {logical,physical}_state_transitions
+ */
+enum {
+ __D = HFI_TRANSITION_DISALLOWED,
+ __I = HFI_TRANSITION_IGNORED,
+ __A = HFI_TRANSITION_ALLOWED,
+ __U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ */
+static const struct {
+ u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+ {
+ /* 2 3 4 5 6 7 8 9 10 11 */
+ /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+ /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+ /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+ /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+ /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+ /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+ }
+};
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
+ * logical_state_transitions
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ */
+static const struct {
+ u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+ {
+ /* 1 2 3 4 5 */
+ /* 1 */ { __I, __D, __D, __D, __U},
+ /* 2 */ { __D, __I, __A, __D, __U},
+ /* 3 */ { __D, __D, __I, __A, __U},
+ /* 4 */ { __D, __D, __I, __I, __U},
+ /* 5 */ { __U, __U, __U, __U, __U},
+ }
+};
+
+static int logical_transition_allowed(int old, int new)
+{
+ if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
+ new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
+ pr_warn("invalid logical state(s) (old %d new %d)\n",
+ old, new);
+ return HFI_TRANSITION_UNDEFINED;
+ }
+
+ if (new == IB_PORT_NOP)
+ return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+ /* adjust states for indexing into logical_state_transitions */
+ old -= IB_PORT_DOWN;
+ new -= IB_PORT_DOWN;
+
+ if (old < 0 || new < 0)
+ return HFI_TRANSITION_UNDEFINED;
+ return logical_state_transitions.allowed[old][new];
+}
+
+static int physical_transition_allowed(int old, int new)
+{
+ if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
+ new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
+ pr_warn("invalid physical state(s) (old %d new %d)\n",
+ old, new);
+ return HFI_TRANSITION_UNDEFINED;
+ }
+
+ if (new == IB_PORTPHYSSTATE_NOP)
+ return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+ /* adjust states for indexing into physical_state_transitions */
+ old -= IB_PORTPHYSSTATE_POLLING;
+ new -= IB_PORTPHYSSTATE_POLLING;
+
+ if (old < 0 || new < 0)
+ return HFI_TRANSITION_UNDEFINED;
+ return physical_state_transitions.allowed[old][new];
+}
+
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+ u32 logical_new, u32 physical_new)
+{
+ u32 physical_old = driver_physical_state(ppd);
+ u32 logical_old = driver_logical_state(ppd);
+ int ret, logical_allowed, physical_allowed;
+
+ logical_allowed = ret =
+ logical_transition_allowed(logical_old, logical_new);
+
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ pr_warn("invalid logical state transition %s -> %s\n",
+ opa_lstate_name(logical_old),
+ opa_lstate_name(logical_new));
+ return ret;
+ }
+
+ physical_allowed = ret =
+ physical_transition_allowed(physical_old, physical_new);
+
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ pr_warn("invalid physical state transition %s -> %s\n",
+ opa_pstate_name(physical_old),
+ opa_pstate_name(physical_new));
+ return ret;
+ }
+
+ if (logical_allowed == HFI_TRANSITION_IGNORED &&
+ physical_allowed == HFI_TRANSITION_IGNORED)
+ return HFI_TRANSITION_IGNORED;
+
+ /*
+ * Either physical_allowed or logical_allowed is
+ * HFI_TRANSITION_ALLOWED.
+ */
+ return HFI_TRANSITION_ALLOWED;
+}
+
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+ u32 logical_state, u32 phys_state,
+ int suppress_idle_sma)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 link_state;
+ int ret;
+
+ ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ /* error message emitted above */
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return 0;
+ }
+
+ if (ret == HFI_TRANSITION_IGNORED)
+ return 0;
+
+ if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+ !(logical_state == IB_PORT_DOWN ||
+ logical_state == IB_PORT_NOP)){
+ pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+ logical_state, phys_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ /*
+ * Logical state changes are summarized in OPAv1g1 spec.,
+ * Table 9-12; physical state changes are summarized in
+ * OPAv1g1 spec., Table 6.4.
+ */
+ switch (logical_state) {
+ case IB_PORT_NOP:
+ if (phys_state == IB_PORTPHYSSTATE_NOP)
+ break;
+ /* FALLTHROUGH */
+ case IB_PORT_DOWN:
+ if (phys_state == IB_PORTPHYSSTATE_NOP)
+ link_state = HLS_DN_DOWNDEF;
+ else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+ link_state = HLS_DN_POLL;
+ set_link_down_reason(ppd,
+ OPA_LINKDOWN_REASON_FM_BOUNCE, 0,
+ OPA_LINKDOWN_REASON_FM_BOUNCE);
+ } else if (phys_state == IB_PORTPHYSSTATE_DISABLED)
+ link_state = HLS_DN_DISABLE;
+ else {
+ pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+ phys_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ set_link_state(ppd, link_state);
+ if (link_state == HLS_DN_DISABLE &&
+ (ppd->offline_disabled_reason >
+ OPA_LINKDOWN_REASON_SMA_DISABLED ||
+ ppd->offline_disabled_reason ==
+ OPA_LINKDOWN_REASON_NONE))
+ ppd->offline_disabled_reason =
+ OPA_LINKDOWN_REASON_SMA_DISABLED;
+ /*
+ * Don't send a reply if the response would be sent
+ * through the disabled port.
+ */
+ if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ break;
+ case IB_PORT_ARMED:
+ ret = set_link_state(ppd, HLS_UP_ARMED);
+ if ((ret == 0) && (suppress_idle_sma == 0))
+ send_idle_sma(dd, SMA_IDLE_ARM);
+ break;
+ case IB_PORT_ACTIVE:
+ if (ppd->neighbor_normal) {
+ ret = set_link_state(ppd, HLS_UP_ACTIVE);
+ if (ret == 0)
+ send_idle_sma(dd, SMA_IDLE_ACTIVE);
+ } else {
+ pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ break;
+ default:
+ pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+ logical_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ return 0;
+}
+
+/**
+ * subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_port_info *pi = (struct opa_port_info *)data;
+ struct ib_event event;
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ u8 clientrereg;
+ unsigned long flags;
+ u32 smlid, opa_lid; /* tmp vars to hold LID values */
+ u16 lid;
+ u8 ls_old, ls_new, ps_new;
+ u8 vls;
+ u8 msl;
+ u8 crc_enabled;
+ u16 lse, lwe, mtu;
+ u32 num_ports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ int ret, i, invalid = 0, call_set_mtu = 0;
+ int call_link_downgrade_policy = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ opa_lid = be32_to_cpu(pi->lid);
+ if (opa_lid & 0xFFFF0000) {
+ pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ goto get_only;
+ }
+
+ lid = (u16)(opa_lid & 0x0000FFFF);
+
+ smlid = be32_to_cpu(pi->sm_lid);
+ if (smlid & 0xFFFF0000) {
+ pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ goto get_only;
+ }
+ smlid &= 0x0000FFFF;
+
+ clientrereg = (pi->clientrereg_subnettimeout &
+ OPA_PI_MASK_CLIENT_REREGISTER);
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ ibp = &ppd->ibport_data;
+ event.device = ibdev;
+ event.element.port_num = port;
+
+ ls_old = driver_lstate(ppd);
+
+ ibp->mkey = pi->mkey;
+ ibp->gid_prefix = pi->subnet_prefix;
+ ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+ /* Must be a valid unicast LID address. */
+ if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+ lid >= HFI1_MULTICAST_LID_BASE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+ lid);
+ } else if (ppd->lid != lid ||
+ ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+ if (ppd->lid != lid)
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+ if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+ hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ msl = pi->smsl & OPA_PI_MASK_SMSL;
+ if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+ ppd->linkinit_reason =
+ (pi->partenforce_filterraw &
+ OPA_PI_MASK_LINKINIT_REASON);
+ /* enable/disable SW pkey checking as per FM control */
+ if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+ ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+ else
+ ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+ if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+ ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+ else
+ ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+ /* Must be a valid unicast LID address. */
+ if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+ smlid >= HFI1_MULTICAST_LID_BASE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+ } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+ pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+ spin_lock_irqsave(&ibp->lock, flags);
+ if (ibp->sm_ah) {
+ if (smlid != ibp->sm_lid)
+ ibp->sm_ah->attr.dlid = smlid;
+ if (msl != ibp->sm_sl)
+ ibp->sm_ah->attr.sl = msl;
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+ if (smlid != ibp->sm_lid)
+ ibp->sm_lid = smlid;
+ if (msl != ibp->sm_sl)
+ ibp->sm_sl = msl;
+ event.event = IB_EVENT_SM_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ if (pi->link_down_reason == 0) {
+ ppd->local_link_down_reason.sma = 0;
+ ppd->local_link_down_reason.latest = 0;
+ }
+
+ if (pi->neigh_link_down_reason == 0) {
+ ppd->neigh_link_down_reason.sma = 0;
+ ppd->neigh_link_down_reason.latest = 0;
+ }
+
+ ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+ ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+ ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+ lwe = be16_to_cpu(pi->link_width.enabled);
+ if (lwe) {
+ if (lwe == OPA_LINK_WIDTH_RESET
+ || lwe == OPA_LINK_WIDTH_RESET_OLD)
+ set_link_width_enabled(ppd, ppd->link_width_supported);
+ else if ((lwe & ~ppd->link_width_supported) == 0)
+ set_link_width_enabled(ppd, lwe);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+ /* LWD.E is always applied - 0 means "disabled" */
+ if (lwe == OPA_LINK_WIDTH_RESET
+ || lwe == OPA_LINK_WIDTH_RESET_OLD) {
+ set_link_width_downgrade_enabled(ppd,
+ ppd->link_width_downgrade_supported);
+ } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+ /* only set and apply if something changed */
+ if (lwe != ppd->link_width_downgrade_enabled) {
+ set_link_width_downgrade_enabled(ppd, lwe);
+ call_link_downgrade_policy = 1;
+ }
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ lse = be16_to_cpu(pi->link_speed.enabled);
+ if (lse) {
+ if (lse & be16_to_cpu(pi->link_speed.supported))
+ set_link_speed_enabled(ppd, lse);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+ ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+ ibp->vl_high_limit);
+
+ if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+ ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ for (i = 0; i < ppd->vls_supported; i++) {
+ if ((i % 2) == 0)
+ mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4)
+ & 0xF);
+ else
+ mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF);
+ if (mtu == 0xffff) {
+ pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+ mtu,
+ (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ mtu = hfi1_max_mtu; /* use a valid MTU */
+ }
+ if (dd->vld[i].mtu != mtu) {
+ dd_dev_info(dd,
+ "MTU change on vl %d from %d to %d\n",
+ i, dd->vld[i].mtu, mtu);
+ dd->vld[i].mtu = mtu;
+ call_set_mtu++;
+ }
+ }
+ /* As per OPAV1 spec: VL15 must support and be configured
+ * for operation with a 2048 or larger MTU.
+ */
+ mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF);
+ if (mtu < 2048 || mtu == 0xffff)
+ mtu = 2048;
+ if (dd->vld[15].mtu != mtu) {
+ dd_dev_info(dd,
+ "MTU change on vl 15 from %d to %d\n",
+ dd->vld[15].mtu, mtu);
+ dd->vld[15].mtu = mtu;
+ call_set_mtu++;
+ }
+ if (call_set_mtu)
+ set_mtu(ppd);
+
+ /* Set operational VLs */
+ vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+ if (vls) {
+ if (vls > ppd->vls_supported) {
+ pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+ pi->operational_vls);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ } else {
+ if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+ vls) == -EINVAL)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ }
+
+ if (pi->mkey_violations == 0)
+ ibp->mkey_violations = 0;
+
+ if (pi->pkey_violations == 0)
+ ibp->pkey_violations = 0;
+
+ if (pi->qkey_violations == 0)
+ ibp->qkey_violations = 0;
+
+ ibp->subnet_timeout =
+ pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+ crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+ crc_enabled >>= 4;
+ crc_enabled &= 0xf;
+
+ if (crc_enabled != 0)
+ ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+ ppd->is_active_optimize_enabled =
+ !!(be16_to_cpu(pi->port_mode)
+ & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+ ls_new = pi->port_states.portphysstate_portstate &
+ OPA_PI_MASK_PORT_STATE;
+ ps_new = (pi->port_states.portphysstate_portstate &
+ OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+ if (ls_old == IB_PORT_INIT) {
+ if (start_of_sm_config) {
+ if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+ ppd->is_sm_config_started = 1;
+ } else if (ls_new == IB_PORT_ARMED) {
+ if (ppd->is_sm_config_started == 0)
+ invalid = 1;
+ }
+ }
+
+ /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+ if (clientrereg) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ /*
+ * Do the port state change now that the other link parameters
+ * have been set.
+ * Changing the port physical state only makes sense if the link
+ * is down or is being set to down.
+ */
+
+ ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+ if (ret)
+ return ret;
+
+ ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+ /* restore re-reg bit per o14-12.2.1 */
+ pi->clientrereg_subnettimeout |= clientrereg;
+
+ /*
+ * Apply the new link downgrade policy. This may result in a link
+ * bounce. Do this after everything else so things are settled.
+ * Possible problem: if setting the port state above fails, then
+ * the policy change is not applied.
+ */
+ if (call_link_downgrade_policy)
+ apply_link_downgrade_policy(ppd, 0);
+
+ return ret;
+
+get_only:
+ return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+ int changed = 0;
+ int update_includes_mgmt_partition = 0;
+
+ /*
+ * IB port one/two always maps to context zero/one,
+ * always a kernel context, no locking needed
+ * If we get here with ppd setup, no need to check
+ * that rcd is valid.
+ */
+ ppd = dd->pport + (port - 1);
+ /*
+ * If the update does not include the management pkey, don't do it.
+ */
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (pkeys[i] == LIM_MGMT_P_KEY) {
+ update_includes_mgmt_partition = 1;
+ break;
+ }
+ }
+
+ if (!update_includes_mgmt_partition)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ u16 key = pkeys[i];
+ u16 okey = ppd->pkeys[i];
+
+ if (key == okey)
+ continue;
+ /*
+ * The SM gives us the complete PKey table. We have
+ * to ensure that we put the PKeys in the matching
+ * slots.
+ */
+ ppd->pkeys[i] = key;
+ changed = 1;
+ }
+
+ if (changed) {
+ struct ib_event event;
+
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.device = &dd->verbs_dev.ibdev;
+ event.element.port_num = port;
+ ib_dispatch_event(&event);
+ }
+ return 0;
+}
+
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 n_blocks_sent = OPA_AM_NBLK(am);
+ u32 start_block = am & 0x7ff;
+ u16 *p = (u16 *) data;
+ __be16 *q = (__be16 *)data;
+ int i;
+ u16 n_blocks_avail;
+ unsigned npkeys = hfi1_get_npkeys(dd);
+
+ if (n_blocks_sent == 0) {
+ pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+ port, start_block, n_blocks_sent);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+ if (start_block + n_blocks_sent > n_blocks_avail ||
+ n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+ pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+ start_block, n_blocks_sent, n_blocks_avail,
+ OPA_NUM_PKEY_BLOCKS_PER_SMP);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+ p[i] = be16_to_cpu(q[i]);
+
+ if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+ u64 *val = (u64 *)data;
+
+ *val++ = read_csr(dd, SEND_SC2VLT0);
+ *val++ = read_csr(dd, SEND_SC2VLT1);
+ *val++ = read_csr(dd, SEND_SC2VLT2);
+ *val++ = read_csr(dd, SEND_SC2VLT3);
+ return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
+ * for SC15, which must map to VL15). If we don't remap things this
+ * way it is possible for VL15 counters to increment when we try to
+ * send on a SC which is mapped to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+ int i;
+ u8 *pd = (u8 *)data;
+
+ for (i = 0; i < OPA_MAX_SCS; i++) {
+ if (i == 15)
+ continue;
+ if ((pd[i] & 0x1f) == 0xf)
+ pd[i] = ILLEGAL_VL;
+ }
+}
+
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+ u64 *val = (u64 *)data;
+
+ filter_sc2vlt(data);
+
+ write_csr(dd, SEND_SC2VLT0, *val++);
+ write_csr(dd, SEND_SC2VLT1, *val++);
+ write_csr(dd, SEND_SC2VLT2, *val++);
+ write_csr(dd, SEND_SC2VLT3, *val++);
+ write_seqlock_irq(&dd->sc2vl_lock);
+ memcpy(dd->sc2vl, (u64 *)data, sizeof(dd->sc2vl));
+ write_sequnlock_irq(&dd->sc2vl_lock);
+ return 0;
+}
+
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *)data;
+ size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+ unsigned i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+ *p++ = ibp->sl_to_sc[i];
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *)data;
+ int i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+ ibp->sl_to_sc[i] = *p++;
+
+ return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *)data;
+ size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+ unsigned i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+ *p++ = ibp->sc_to_sl[i];
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *)data;
+ int i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+ ibp->sc_to_sl[i] = *p++;
+
+ return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ void *vp = (void *) data;
+ size_t size = 4 * sizeof(u64);
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ get_sc2vlt_tables(dd, vp);
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NBLK(am);
+ int async_update = OPA_AM_ASYNC(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ void *vp = (void *) data;
+ struct hfi1_pportdata *ppd;
+ int lstate;
+
+ if (n_blocks != 1 || async_update) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ lstate = driver_lstate(ppd);
+ /* it's known that async_update is 0 by this point, but include
+ * the explicit check for clarity */
+ if (!async_update &&
+ (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ set_sc2vlt_tables(dd, vp);
+
+ return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ void *vp = (void *) data;
+ int size;
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+
+ size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ void *vp = (void *) data;
+ int lstate;
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ lstate = driver_lstate(ppd);
+ if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+
+ fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+ return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+}
+
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 nports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 lstate;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+
+ if (nports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ibp = to_iport(ibdev, port);
+ ppd = ppd_from_ibp(ibp);
+
+ lstate = driver_lstate(ppd);
+
+ if (start_of_sm_config && (lstate == IB_PORT_INIT))
+ ppd->is_sm_config_started = 1;
+
+#if PI_LED_ENABLE_SUP
+ psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+ psi->port_states.ledenable_offlinereason |=
+ ppd->is_sm_config_started << 5;
+ psi->port_states.ledenable_offlinereason |=
+ ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+ psi->port_states.offline_reason = ppd->neighbor_normal << 4;
+ psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+ psi->port_states.offline_reason |= ppd->offline_disabled_reason &
+ OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+ psi->port_states.portphysstate_portstate =
+ (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+ psi->link_width_downgrade_tx_active =
+ cpu_to_be16(ppd->link_width_downgrade_tx_active);
+ psi->link_width_downgrade_rx_active =
+ cpu_to_be16(ppd->link_width_downgrade_rx_active);
+ if (resp_len)
+ *resp_len += sizeof(struct opa_port_state_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 nports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 ls_old;
+ u8 ls_new, ps_new;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+ int ret, invalid = 0;
+
+ if (nports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ibp = to_iport(ibdev, port);
+ ppd = ppd_from_ibp(ibp);
+
+ ls_old = driver_lstate(ppd);
+
+ ls_new = port_states_to_logical_state(&psi->port_states);
+ ps_new = port_states_to_phys_state(&psi->port_states);
+
+ if (ls_old == IB_PORT_INIT) {
+ if (start_of_sm_config) {
+ if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+ ppd->is_sm_config_started = 1;
+ } else if (ls_new == IB_PORT_ARMED) {
+ if (ppd->is_sm_config_started == 0)
+ invalid = 1;
+ }
+ }
+
+ ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+ if (ret)
+ return ret;
+
+ if (invalid)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 addr = OPA_AM_CI_ADDR(am);
+ u32 len = OPA_AM_CI_LEN(am) + 1;
+ int ret;
+
+#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+ /* check that addr is within spec, and
+ * addr and (addr + len - 1) are on the same "page" */
+ if (addr >= 4096 ||
+ (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ret = get_cable_info(dd, port, addr, len, data);
+
+ if (ret == -ENODEV) {
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* The address range for the CableInfo SMA query is wider than the
+ * memory available on the QSFP cable. We want to return a valid
+ * response, albeit zeroed out, for address ranges beyond available
+ * memory but that are within the CableInfo query spec
+ */
+ if (ret < 0 && ret != -ERANGE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ if (resp_len)
+ *resp_len += len;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ u32 num_ports = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ struct buffer_control *p = (struct buffer_control *) data;
+ int size;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+ size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+ trace_bct_get(dd, p);
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ u32 num_ports = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ struct buffer_control *p = (struct buffer_control *) data;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ ppd = dd->pport + (port - 1);
+ trace_bct_set(dd, p);
+ if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+ u32 num_ports = OPA_AM_NPORT(am);
+ u8 section = (am & 0x00ff0000) >> 16;
+ u8 *p = data;
+ int size = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ switch (section) {
+ case OPA_VLARB_LOW_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+ break;
+ case OPA_VLARB_HIGH_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+ break;
+ case OPA_VLARB_PREEMPT_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+ break;
+ case OPA_VLARB_PREEMPT_MATRIX:
+ size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+ break;
+ default:
+ pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+ be32_to_cpu(smp->attr_mod));
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ if (size > 0 && resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+ u32 num_ports = OPA_AM_NPORT(am);
+ u8 section = (am & 0x00ff0000) >> 16;
+ u8 *p = data;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ switch (section) {
+ case OPA_VLARB_LOW_ELEMENTS:
+ (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+ break;
+ case OPA_VLARB_HIGH_ELEMENTS:
+ (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+ break;
+ /* neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
+ * can be changed from the default values */
+ case OPA_VLARB_PREEMPT_ELEMENTS:
+ /* FALLTHROUGH */
+ case OPA_VLARB_PREEMPT_MATRIX:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ break;
+ default:
+ pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+ be32_to_cpu(smp->attr_mod));
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_pma_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[2024];
+} __packed;
+
+struct opa_class_port_info {
+ u8 base_version;
+ u8 class_version;
+ __be16 cap_mask;
+ __be32 cap_mask2_resp_time;
+
+ u8 redirect_gid[16];
+ __be32 redirect_tc_fl;
+ __be32 redirect_lid;
+ __be32 redirect_sl_qp;
+ __be32 redirect_qkey;
+
+ u8 trap_gid[16];
+ __be32 trap_tc_fl;
+ __be32 trap_lid;
+ __be32 trap_hl_qp;
+ __be32 trap_qkey;
+
+ __be16 trap_pkey;
+ __be16 redirect_pkey;
+
+ u8 trap_sl_rsvd;
+ u8 reserved[3];
+} __packed;
+
+struct opa_port_status_req {
+ __u8 port_num;
+ __u8 reserved[3];
+ __be32 vl_select_mask;
+};
+
+#define VL_MASK_ALL 0x000080ff
+
+struct opa_port_status_rsp {
+ __u8 port_num;
+ __u8 reserved[3];
+ __be32 vl_select_mask;
+
+ /* Data counters */
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_pkts;
+ __be64 port_rcv_pkts;
+ __be64 port_multicast_xmit_pkts;
+ __be64 port_multicast_rcv_pkts;
+ __be64 port_xmit_wait;
+ __be64 sw_port_congestion;
+ __be64 port_rcv_fecn;
+ __be64 port_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_xmit_wasted_bw;
+ __be64 port_xmit_wait_data;
+ __be64 port_rcv_bubble;
+ __be64 port_mark_fecn;
+ /* Error counters */
+ __be64 port_rcv_constraint_errors;
+ __be64 port_rcv_switch_relay_errors;
+ __be64 port_xmit_discards;
+ __be64 port_xmit_constraint_errors;
+ __be64 port_rcv_remote_physical_errors;
+ __be64 local_link_integrity_errors;
+ __be64 port_rcv_errors;
+ __be64 excessive_buffer_overruns;
+ __be64 fm_config_errors;
+ __be32 link_error_recovery;
+ __be32 link_downed;
+ u8 uncorrectable_errors;
+
+ u8 link_quality_indicator; /* 5res, 3bit */
+ u8 res2[6];
+ struct _vls_pctrs {
+ /* per-VL Data counters */
+ __be64 port_vl_xmit_data;
+ __be64 port_vl_rcv_data;
+ __be64 port_vl_xmit_pkts;
+ __be64 port_vl_rcv_pkts;
+ __be64 port_vl_xmit_wait;
+ __be64 sw_port_vl_congestion;
+ __be64 port_vl_rcv_fecn;
+ __be64 port_vl_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_vl_xmit_wasted_bw;
+ __be64 port_vl_xmit_wait_data;
+ __be64 port_vl_rcv_bubble;
+ __be64 port_vl_mark_fecn;
+ __be64 port_vl_xmit_discards;
+ } vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+enum counter_selects {
+ CS_PORT_XMIT_DATA = (1 << 31),
+ CS_PORT_RCV_DATA = (1 << 30),
+ CS_PORT_XMIT_PKTS = (1 << 29),
+ CS_PORT_RCV_PKTS = (1 << 28),
+ CS_PORT_MCAST_XMIT_PKTS = (1 << 27),
+ CS_PORT_MCAST_RCV_PKTS = (1 << 26),
+ CS_PORT_XMIT_WAIT = (1 << 25),
+ CS_SW_PORT_CONGESTION = (1 << 24),
+ CS_PORT_RCV_FECN = (1 << 23),
+ CS_PORT_RCV_BECN = (1 << 22),
+ CS_PORT_XMIT_TIME_CONG = (1 << 21),
+ CS_PORT_XMIT_WASTED_BW = (1 << 20),
+ CS_PORT_XMIT_WAIT_DATA = (1 << 19),
+ CS_PORT_RCV_BUBBLE = (1 << 18),
+ CS_PORT_MARK_FECN = (1 << 17),
+ CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16),
+ CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15),
+ CS_PORT_XMIT_DISCARDS = (1 << 14),
+ CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13),
+ CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12),
+ CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11),
+ CS_PORT_RCV_ERRORS = (1 << 10),
+ CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9),
+ CS_FM_CONFIG_ERRORS = (1 << 8),
+ CS_LINK_ERROR_RECOVERY = (1 << 7),
+ CS_LINK_DOWNED = (1 << 6),
+ CS_UNCORRECTABLE_ERRORS = (1 << 5),
+};
+
+struct opa_clear_port_status {
+ __be64 port_select_mask[4];
+ __be32 counter_select_mask;
+};
+
+struct opa_aggregate {
+ __be16 attr_id;
+ __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */
+ __be32 attr_mod;
+ u8 data[0];
+};
+
+/* Request contains first two fields, response contains those plus the rest */
+struct opa_port_data_counters_msg {
+ __be64 port_select_mask[4];
+ __be32 vl_select_mask;
+
+ /* Response fields follow */
+ __be32 reserved1;
+ struct _port_dctrs {
+ u8 port_number;
+ u8 reserved2[3];
+ __be32 link_quality_indicator; /* 29res, 3bit */
+
+ /* Data counters */
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_pkts;
+ __be64 port_rcv_pkts;
+ __be64 port_multicast_xmit_pkts;
+ __be64 port_multicast_rcv_pkts;
+ __be64 port_xmit_wait;
+ __be64 sw_port_congestion;
+ __be64 port_rcv_fecn;
+ __be64 port_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_xmit_wasted_bw;
+ __be64 port_xmit_wait_data;
+ __be64 port_rcv_bubble;
+ __be64 port_mark_fecn;
+
+ __be64 port_error_counter_summary;
+ /* Sum of error counts/port */
+
+ struct _vls_dctrs {
+ /* per-VL Data counters */
+ __be64 port_vl_xmit_data;
+ __be64 port_vl_rcv_data;
+ __be64 port_vl_xmit_pkts;
+ __be64 port_vl_rcv_pkts;
+ __be64 port_vl_xmit_wait;
+ __be64 sw_port_vl_congestion;
+ __be64 port_vl_rcv_fecn;
+ __be64 port_vl_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_vl_xmit_wasted_bw;
+ __be64 port_vl_xmit_wait_data;
+ __be64 port_vl_rcv_bubble;
+ __be64 port_vl_mark_fecn;
+ } vls[0];
+ /* array size defined by #bits set in vl_select_mask*/
+ } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_counters64_msg {
+ /* Request contains first two fields, response contains the
+ * whole magilla */
+ __be64 port_select_mask[4];
+ __be32 vl_select_mask;
+
+ /* Response-only fields follow */
+ __be32 reserved1;
+ struct _port_ectrs {
+ u8 port_number;
+ u8 reserved2[7];
+ __be64 port_rcv_constraint_errors;
+ __be64 port_rcv_switch_relay_errors;
+ __be64 port_xmit_discards;
+ __be64 port_xmit_constraint_errors;
+ __be64 port_rcv_remote_physical_errors;
+ __be64 local_link_integrity_errors;
+ __be64 port_rcv_errors;
+ __be64 excessive_buffer_overruns;
+ __be64 fm_config_errors;
+ __be32 link_error_recovery;
+ __be32 link_downed;
+ u8 uncorrectable_errors;
+ u8 reserved3[7];
+ struct _vls_ectrs {
+ __be64 port_vl_xmit_discards;
+ } vls[0];
+ /* array size defined by #bits set in vl_select_mask */
+ } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_info_msg {
+ __be64 port_select_mask[4];
+ __be32 error_info_select_mask;
+ __be32 reserved1;
+ struct _port_ei {
+
+ u8 port_number;
+ u8 reserved2[7];
+
+ /* PortRcvErrorInfo */
+ struct {
+ u8 status_and_code;
+ union {
+ u8 raw[17];
+ struct {
+ /* EI1to12 format */
+ u8 packet_flit1[8];
+ u8 packet_flit2[8];
+ u8 remaining_flit_bits12;
+ } ei1to12;
+ struct {
+ u8 packet_bytes[8];
+ u8 remaining_flit_bits;
+ } ei13;
+ } ei;
+ u8 reserved3[6];
+ } __packed port_rcv_ei;
+
+ /* ExcessiveBufferOverrunInfo */
+ struct {
+ u8 status_and_sc;
+ u8 reserved4[7];
+ } __packed excessive_buffer_overrun_ei;
+
+ /* PortXmitConstraintErrorInfo */
+ struct {
+ u8 status;
+ u8 reserved5;
+ __be16 pkey;
+ __be32 slid;
+ } __packed port_xmit_constraint_ei;
+
+ /* PortRcvConstraintErrorInfo */
+ struct {
+ u8 status;
+ u8 reserved6;
+ __be16 pkey;
+ __be32 slid;
+ } __packed port_rcv_constraint_ei;
+
+ /* PortRcvSwitchRelayErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 reserved7[3];
+ __u32 error_info;
+ } __packed port_rcv_switch_relay_ei;
+
+ /* UncorrectableErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 reserved8;
+ } __packed uncorrectable_ei;
+
+ /* FMConfigErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 error_info;
+ } __packed fm_config_ei;
+ __u32 reserved9;
+ } port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/* opa_port_error_info_msg error_info_select_mask bit definitions */
+enum error_info_selects {
+ ES_PORT_RCV_ERROR_INFO = (1 << 31),
+ ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30),
+ ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29),
+ ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28),
+ ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27),
+ ES_UNCORRECTABLE_ERROR_INFO = (1 << 26),
+ ES_FM_CONFIG_ERROR_INFO = (1 << 25)
+};
+
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u32 *resp_len)
+{
+ struct opa_class_port_info *p =
+ (struct opa_class_port_info *)pmp->data;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ if (pmp->mad_hdr.attr_mod != 0)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ p->base_version = OPA_MGMT_BASE_VERSION;
+ p->class_version = OPA_SMI_CLASS_VERSION;
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+ */
+ p->cap_mask2_resp_time = cpu_to_be32(18);
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+ struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+ if (!is_bx(ppd->dd)) {
+ unsigned long vl;
+ int vfi = 0;
+ u64 max_vl_xmit_wait = 0, tmp;
+ u32 vl_all_mask = VL_MASK_ALL;
+ u64 rcv_data, rcv_bubble;
+
+ rcv_data = be64_to_cpu(rsp->port_rcv_data);
+ rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+ /* In the measured time period, calculate the total number
+ * of flits that were received. Subtract out one false
+ * rcv_bubble increment for every 32 received flits but
+ * don't let the number go negative.
+ */
+ if (rcv_bubble >= (rcv_data>>5)) {
+ rcv_bubble -= (rcv_data>>5);
+ rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+ }
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+ rcv_bubble =
+ be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+ if (rcv_bubble >= (rcv_data>>5)) {
+ rcv_bubble -= (rcv_data>>5);
+ rsp->vls[vfi].port_vl_rcv_bubble =
+ cpu_to_be64(rcv_bubble);
+ }
+ vfi++;
+ }
+
+ for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+ 8 * sizeof(vl_all_mask)) {
+ tmp = read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl));
+ if (tmp > max_vl_xmit_wait)
+ max_vl_xmit_wait = tmp;
+ }
+ rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait);
+ }
+}
+
+
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ struct opa_port_status_req *req =
+ (struct opa_port_status_req *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct opa_port_status_rsp *rsp;
+ u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ unsigned long vl;
+ size_t response_data_size;
+ u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ u8 port_num = req->port_num;
+ u8 num_vls = hweight32(vl_select_mask);
+ struct _vls_pctrs *vlinfo;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ int vfi;
+ u64 tmp, tmp2;
+
+ response_data_size = sizeof(struct opa_port_status_rsp) +
+ num_vls * sizeof(struct _vls_pctrs);
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ if (nports != 1 || (port_num && port_num != port)
+ || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ rsp = (struct opa_port_status_rsp *)pmp->data;
+ if (port_num)
+ rsp->port_num = port_num;
+ else
+ rsp->port_num = port;
+
+ rsp->port_rcv_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL));
+
+ hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+ rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+ rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_bubble =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+ rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_xmit_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+ rsp->port_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+ rsp->port_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+ rsp->port_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_remote_physical_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL));
+ tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+ if (tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->local_link_integrity_errors = cpu_to_be64(~0);
+ } else {
+ rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+ }
+ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL);
+ if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->link_error_recovery = cpu_to_be32(~0);
+ } else {
+ rsp->link_error_recovery = cpu_to_be32(tmp2);
+ }
+ rsp->port_rcv_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+ rsp->excessive_buffer_overruns =
+ cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+ rsp->fm_config_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL));
+ rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL));
+
+ /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+ rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+ vlinfo = &(rsp->vls[0]);
+ vfi = 0;
+ /* The vl_select_mask has been checked above, and we know
+ * that it contains only entries which represent valid VLs.
+ * So in the for_each_set_bit() loop below, we don't need
+ * any additional checks for vl.
+ */
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+
+ tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+ rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+ rsp->vls[vfi].port_vl_rcv_bubble =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_data =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_pkts =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+ idx_from_vl(vl)));
+
+ vlinfo++;
+ vfi++;
+ }
+
+ a0_portstatus(ppd, rsp, vl_select_mask);
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 error_counter_summary = 0, tmp;
+
+ error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL);
+ /* port_rcv_switch_relay_errors is 0 for HFIs */
+ error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_TX_REPLAY,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_RX_REPLAY,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_SEQ_CRC_CNT,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL);
+ /* ppd->link_downed is a 32-bit value */
+ error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL);
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+ /* this is an 8-bit quantity */
+ error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+ return error_counter_summary;
+}
+
+static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp,
+ u32 vl_select_mask)
+{
+ if (!is_bx(dd)) {
+ unsigned long vl;
+ int vfi = 0;
+ u64 rcv_data, rcv_bubble, sum_vl_xmit_wait = 0;
+
+ rcv_data = be64_to_cpu(rsp->port_rcv_data);
+ rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+ /* In the measured time period, calculate the total number
+ * of flits that were received. Subtract out one false
+ * rcv_bubble increment for every 32 received flits but
+ * don't let the number go negative.
+ */
+ if (rcv_bubble >= (rcv_data>>5)) {
+ rcv_bubble -= (rcv_data>>5);
+ rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+ }
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+ rcv_bubble =
+ be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+ if (rcv_bubble >= (rcv_data>>5)) {
+ rcv_bubble -= (rcv_data>>5);
+ rsp->vls[vfi].port_vl_rcv_bubble =
+ cpu_to_be64(rcv_bubble);
+ }
+ vfi++;
+ }
+ vfi = 0;
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ u64 tmp = sum_vl_xmit_wait +
+ be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait);
+ if (tmp < sum_vl_xmit_wait) {
+ /* we wrapped */
+ sum_vl_xmit_wait = (u64) ~0;
+ break;
+ }
+ sum_vl_xmit_wait = tmp;
+ }
+ if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+ rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+ }
+}
+
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ struct opa_port_data_counters_msg *req =
+ (struct opa_port_data_counters_msg *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct _port_dctrs *rsp;
+ struct _vls_dctrs *vlinfo;
+ size_t response_data_size;
+ u32 num_ports;
+ u8 num_pslm;
+ u8 lq, num_vls;
+ u64 port_mask;
+ unsigned long port_num;
+ unsigned long vl;
+ u32 vl_select_mask;
+ int vfi;
+
+ num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+ num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+ vl_select_mask = be32_to_cpu(req->vl_select_mask);
+
+ if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* Sanity check */
+ response_data_size = sizeof(struct opa_port_data_counters_msg) +
+ num_vls * sizeof(struct _vls_dctrs);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the
+ * port the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask));
+
+ if ((u8)port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ rsp = (struct _port_dctrs *)&(req->port[0]);
+ memset(rsp, 0, sizeof(*rsp));
+
+ rsp->port_number = port;
+ /*
+ * Note that link_quality_indicator is a 32 bit quantity in
+ * 'datacounters' queries (as opposed to 'portinfo' queries,
+ * where it's a byte).
+ */
+ hfi1_read_link_quality(dd, &lq);
+ rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+
+ /* rsp->sw_port_congestion is 0 for HFIs */
+ /* rsp->port_xmit_time_cong is 0 for HFIs */
+ /* rsp->port_xmit_wasted_bw ??? */
+ /* rsp->port_xmit_wait_data ??? */
+ /* rsp->port_mark_fecn is 0 for HFIs */
+
+ rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_bubble =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+ rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_xmit_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+ rsp->port_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+ rsp->port_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+
+ rsp->port_error_counter_summary =
+ cpu_to_be64(get_error_counter_summary(ibdev, port));
+
+ vlinfo = &(rsp->vls[0]);
+ vfi = 0;
+ /* The vl_select_mask has been checked above, and we know
+ * that it contains only entries which represent valid VLs.
+ * So in the for_each_set_bit() loop below, we don't need
+ * any additional checks for vl.
+ */
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(req->vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+
+ rsp->vls[vfi].port_vl_xmit_data =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_data =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+ idx_from_vl(vl)));
+ rsp->vls[vfi].port_vl_rcv_bubble =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_pkts =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+ idx_from_vl(vl)));
+ rsp->vls[vfi].port_vl_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+ idx_from_vl(vl)));
+
+ /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+ /* rsp->port_vl_xmit_wasted_bw ??? */
+ /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+ * does this differ from rsp->vls[vfi].port_vl_xmit_wait */
+ /*rsp->vls[vfi].port_vl_mark_fecn =
+ cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+ + offset));
+ */
+ vlinfo++;
+ vfi++;
+ }
+
+ a0_datacounters(dd, rsp, vl_select_mask);
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ size_t response_data_size;
+ struct _port_ectrs *rsp;
+ unsigned long port_num;
+ struct opa_port_error_counters64_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 num_ports;
+ u8 num_pslm;
+ u8 num_vls;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct _vls_ectrs *vlinfo;
+ unsigned long vl;
+ u64 port_mask, tmp, tmp2;
+ u32 vl_select_mask;
+ int vfi;
+
+ req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+ num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+ num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+ num_vls * sizeof(struct _vls_ectrs);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+ /*
+ * The bit set in the mask needs to be consistent with the
+ * port the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask));
+
+ if ((u8)port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ rsp = (struct _port_ectrs *)&(req->port[0]);
+
+ ibp = to_iport(ibdev, port_num);
+ ppd = ppd_from_ibp(ibp);
+
+ memset(rsp, 0, sizeof(*rsp));
+ rsp->port_number = (u8)port_num;
+
+ rsp->port_rcv_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL));
+ /* port_rcv_switch_relay_errors is 0 for HFIs */
+ rsp->port_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_remote_physical_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL));
+ tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+ if (tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->local_link_integrity_errors = cpu_to_be64(~0);
+ } else {
+ rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+ }
+ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL);
+ if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->link_error_recovery = cpu_to_be32(~0);
+ } else {
+ rsp->link_error_recovery = cpu_to_be32(tmp2);
+ }
+ rsp->port_xmit_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL));
+ rsp->excessive_buffer_overruns =
+ cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+ rsp->fm_config_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL));
+ rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL));
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+ rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+ vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]);
+ vfi = 0;
+ vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(req->vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+ /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+ vlinfo += 1;
+ vfi++;
+ }
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ size_t response_data_size;
+ struct _port_ei *rsp;
+ struct opa_port_error_info_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u64 port_mask;
+ u32 num_ports;
+ unsigned long port_num;
+ u8 num_pslm;
+ u64 reg;
+
+ req = (struct opa_port_error_info_msg *)pmp->data;
+ rsp = (struct _port_ei *)&(req->port[0]);
+
+ num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* Sanity check */
+ response_data_size = sizeof(struct opa_port_error_info_msg);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the port
+ * the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask));
+
+ if ((u8)port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* PortRcvErrorInfo */
+ rsp->port_rcv_ei.status_and_code =
+ dd->err_info_rcvport.status_and_code;
+ memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+ &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+ memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+ &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+ /* ExcessiverBufferOverrunInfo */
+ reg = read_csr(dd, RCV_ERR_INFO);
+ if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+ /* if the RcvExcessBufferOverrun bit is set, save SC of
+ * first pkt that encountered an excess buffer overrun */
+ u8 tmp = (u8)reg;
+
+ tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+ tmp <<= 2;
+ rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+ /* set the status bit */
+ rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+ }
+
+ rsp->port_xmit_constraint_ei.status =
+ dd->err_info_xmit_constraint.status;
+ rsp->port_xmit_constraint_ei.pkey =
+ cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+ rsp->port_xmit_constraint_ei.slid =
+ cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+ rsp->port_rcv_constraint_ei.status =
+ dd->err_info_rcv_constraint.status;
+ rsp->port_rcv_constraint_ei.pkey =
+ cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+ rsp->port_rcv_constraint_ei.slid =
+ cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+ /* UncorrectableErrorInfo */
+ rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+ /* FMConfigErrorInfo */
+ rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ struct opa_clear_port_status *req =
+ (struct opa_clear_port_status *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ u64 portn = be64_to_cpu(req->port_select_mask[3]);
+ u32 counter_select = be32_to_cpu(req->counter_select_mask);
+ u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+ unsigned long vl;
+
+ if ((nports != 1) || (portn != 1 << port)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+ /*
+ * only counters returned by pma_get_opa_portstatus() are
+ * handled, so when pma_get_opa_portstatus() gets a fix,
+ * the corresponding change should be made here as well.
+ */
+
+ if (counter_select & CS_PORT_XMIT_DATA)
+ write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_DATA)
+ write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_PKTS)
+ write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+ write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_WAIT)
+ write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_sw_portCongestion for HFIs */
+
+ if (counter_select & CS_PORT_RCV_FECN)
+ write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_BECN)
+ write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_port_xmit_time_cong for HFIs */
+ /* ignore cs_port_xmit_wasted_bw for now */
+ /* ignore cs_port_xmit_wait_data for now */
+ if (counter_select & CS_PORT_RCV_BUBBLE)
+ write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+ /* Only applicable for switch */
+ /*if (counter_select & CS_PORT_MARK_FECN)
+ write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/
+
+ if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+ write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_port_rcv_switch_relay_errors for HFIs */
+ if (counter_select & CS_PORT_XMIT_DISCARDS)
+ write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+ write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+ write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
+ write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+ }
+
+ if (counter_select & CS_LINK_ERROR_RECOVERY) {
+ write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL, 0);
+ }
+
+ if (counter_select & CS_PORT_RCV_ERRORS)
+ write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+ dd->rcv_ovfl_cnt = 0;
+ }
+
+ if (counter_select & CS_FM_CONFIG_ERRORS)
+ write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_LINK_DOWNED)
+ write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_UNCORRECTABLE_ERRORS)
+ write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+
+ if (counter_select & CS_PORT_XMIT_DATA)
+ write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_DATA)
+ write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_XMIT_PKTS)
+ write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_XMIT_WAIT)
+ write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+ /* sw_port_vl_congestion is 0 for HFIs */
+ if (counter_select & CS_PORT_RCV_FECN)
+ write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_BECN)
+ write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+ /* port_vl_xmit_time_cong is 0 for HFIs */
+ /* port_vl_xmit_wasted_bw ??? */
+ /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+ if (counter_select & CS_PORT_RCV_BUBBLE)
+ write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+ /*if (counter_select & CS_PORT_MARK_FECN)
+ write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+ */
+ /* port_vl_xmit_discards ??? */
+ }
+
+ if (resp_len)
+ *resp_len += sizeof(*req);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ struct _port_ei *rsp;
+ struct opa_port_error_info_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u64 port_mask;
+ u32 num_ports;
+ unsigned long port_num;
+ u8 num_pslm;
+ u32 error_info_select;
+
+ req = (struct opa_port_error_info_msg *)pmp->data;
+ rsp = (struct _port_ei *)&(req->port[0]);
+
+ num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the port
+ * the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask));
+
+ if ((u8)port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ error_info_select = be32_to_cpu(req->error_info_select_mask);
+
+ /* PortRcvErrorInfo */
+ if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+ /* ExcessiverBufferOverrunInfo */
+ if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+ /* status bit is essentially kept in the h/w - bit 5 of
+ * RCV_ERR_INFO */
+ write_csr(dd, RCV_ERR_INFO,
+ RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+ if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+ dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+ if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+ dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+ /* UncorrectableErrorInfo */
+ if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+ /* FMConfigErrorInfo */
+ if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+ if (resp_len)
+ *resp_len += sizeof(*req);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {
+ __be16 congestion_info;
+ u8 control_table_cap; /* Multiple of 64 entry unit CCTs */
+ u8 congestion_log_length;
+} __packed;
+
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_congestion_info_attr *p =
+ (struct opa_congestion_info_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ p->congestion_info = 0;
+ p->control_table_cap = ppd->cc_max_table_entries;
+ p->congestion_log_length = OPA_CONG_LOG_ELEMS;
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+ u8 *data,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ int i;
+ struct opa_congestion_setting_attr *p =
+ (struct opa_congestion_setting_attr *) data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_congestion_setting_entry_shadow *entries;
+ struct cc_state *cc_state;
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (cc_state == NULL) {
+ rcu_read_unlock();
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ entries = cc_state->cong_setting.entries;
+ p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+ p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ p->entries[i].ccti_increase = entries[i].ccti_increase;
+ p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+ p->entries[i].trigger_threshold =
+ entries[i].trigger_threshold;
+ p->entries[i].ccti_min = entries[i].ccti_min;
+ }
+
+ rcu_read_unlock();
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_congestion_setting_attr *p =
+ (struct opa_congestion_setting_attr *) data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_congestion_setting_entry_shadow *entries;
+ int i;
+
+ ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
+
+ entries = ppd->congestion_entries;
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ entries[i].ccti_increase = p->entries[i].ccti_increase;
+ entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
+ entries[i].trigger_threshold =
+ p->entries[i].trigger_threshold;
+ entries[i].ccti_min = p->entries[i].ccti_min;
+ }
+
+ return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+ resp_len);
+}
+
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+ s64 ts;
+ int i;
+
+ if (am != 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ spin_lock(&ppd->cc_log_lock);
+
+ cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+ cong_log->congestion_flags = 0;
+ cong_log->threshold_event_counter =
+ cpu_to_be16(ppd->threshold_event_counter);
+ memcpy(cong_log->threshold_cong_event_map,
+ ppd->threshold_cong_event_map,
+ sizeof(cong_log->threshold_cong_event_map));
+ /* keep timestamp in units of 1.024 usec */
+ ts = ktime_to_ns(ktime_get()) / 1024;
+ cong_log->current_time_stamp = cpu_to_be32(ts);
+ for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+ struct opa_hfi1_cong_log_event_internal *cce =
+ &ppd->cc_events[ppd->cc_mad_idx++];
+ if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+ ppd->cc_mad_idx = 0;
+ /*
+ * Entries which are older than twice the time
+ * required to wrap the counter are supposed to
+ * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+ */
+ if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
+ continue;
+ memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+ memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+ &cce->rqpn, 3);
+ cong_log->events[i].sl_svc_type_cn_entry =
+ ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+ cong_log->events[i].remote_lid_cn_entry =
+ cpu_to_be32(cce->rlid);
+ cong_log->events[i].timestamp_cn_entry =
+ cpu_to_be32(cce->timestamp);
+ }
+
+ /*
+ * Reset threshold_cong_event_map, and threshold_event_counter
+ * to 0 when log is read.
+ */
+ memset(ppd->threshold_cong_event_map, 0x0,
+ sizeof(ppd->threshold_cong_event_map));
+ ppd->threshold_event_counter = 0;
+
+ spin_unlock(&ppd->cc_log_lock);
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_hfi1_cong_log);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct ib_cc_table_attr *cc_table_attr =
+ (struct ib_cc_table_attr *) data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 start_block = OPA_AM_START_BLK(am);
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct ib_cc_table_entry_shadow *entries;
+ int i, j;
+ u32 sentry, eentry;
+ struct cc_state *cc_state;
+
+ /* sanity check n_blocks, start_block */
+ if (n_blocks == 0 ||
+ start_block + n_blocks > ppd->cc_max_table_entries) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (cc_state == NULL) {
+ rcu_read_unlock();
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ sentry = start_block * IB_CCT_ENTRIES;
+ eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
+
+ cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+ entries = cc_state->cct.entries;
+
+ /* return n_blocks, though the last block may not be full */
+ for (j = 0, i = sentry; i < eentry; j++, i++)
+ cc_table_attr->ccti_entries[j].entry =
+ cpu_to_be16(entries[i].entry);
+
+ rcu_read_unlock();
+
+ if (resp_len)
+ *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+void cc_state_reclaim(struct rcu_head *rcu)
+{
+ struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
+
+ kfree(cc_state);
+}
+
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 start_block = OPA_AM_START_BLK(am);
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct ib_cc_table_entry_shadow *entries;
+ int i, j;
+ u32 sentry, eentry;
+ u16 ccti_limit;
+ struct cc_state *old_cc_state, *new_cc_state;
+
+ /* sanity check n_blocks, start_block */
+ if (n_blocks == 0 ||
+ start_block + n_blocks > ppd->cc_max_table_entries) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ sentry = start_block * IB_CCT_ENTRIES;
+ eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
+ (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
+
+ /* sanity check ccti_limit */
+ ccti_limit = be16_to_cpu(p->ccti_limit);
+ if (ccti_limit + 1 > eentry) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+ if (new_cc_state == NULL)
+ goto getit;
+
+ spin_lock(&ppd->cc_state_lock);
+
+ old_cc_state = get_cc_state(ppd);
+
+ if (old_cc_state == NULL) {
+ spin_unlock(&ppd->cc_state_lock);
+ kfree(new_cc_state);
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ *new_cc_state = *old_cc_state;
+
+ new_cc_state->cct.ccti_limit = ccti_limit;
+
+ entries = ppd->ccti_entries;
+ ppd->total_cct_entry = ccti_limit + 1;
+
+ for (j = 0, i = sentry; i < eentry; j++, i++)
+ entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
+
+ memcpy(new_cc_state->cct.entries, entries,
+ eentry * sizeof(struct ib_cc_table_entry));
+
+ new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+ new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+ memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+ OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+ rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+ spin_unlock(&ppd->cc_state_lock);
+
+ call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+
+getit:
+ return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_led_info {
+ __be32 rsvd_led_mask;
+ __be32 rsvd;
+};
+
+#define OPA_LED_SHIFT 31
+#define OPA_LED_MASK (1 << OPA_LED_SHIFT)
+
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct opa_led_info *p = (struct opa_led_info *) data;
+ u32 nport = OPA_AM_NPORT(am);
+ u64 reg;
+
+ if (nport != 1 || OPA_AM_PORTNUM(am)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ reg = read_csr(dd, DCC_CFG_LED_CNTRL);
+ if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) &&
+ ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf))
+ p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK);
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_led_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct opa_led_info *p = (struct opa_led_info *) data;
+ u32 nport = OPA_AM_NPORT(am);
+ int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
+
+ if (nport != 1 || OPA_AM_PORTNUM(am)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ setextled(dd, on);
+
+ return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int ret;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ switch (attr_id) {
+ case IB_SMP_ATTR_NODE_DESC:
+ ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+ ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+ ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+ ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+ ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_PORT_STATE_INFO:
+ ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+ ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_CABLE_INFO:
+ ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_INFO:
+ ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+ ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+ ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+ ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_LED_INFO:
+ ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ if (ibp->port_cap_flags & IB_PORT_SM)
+ return IB_MAD_RESULT_SUCCESS;
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+ return ret;
+}
+
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int ret;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ switch (attr_id) {
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+ ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+ ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+ ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+ ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_PORT_STATE_INFO:
+ ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+ ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+ ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+ ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_LED_INFO:
+ ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ if (ibp->port_cap_flags & IB_PORT_SM)
+ return IB_MAD_RESULT_SUCCESS;
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+ return ret;
+}
+
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+ ag->err_reqlength |= cpu_to_be16(0x8000);
+}
+
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+ u8 *next_smp = opa_get_smp_data(smp);
+
+ if (num_attr < 1 || num_attr > 117) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < num_attr; i++) {
+ struct opa_aggregate *agg;
+ size_t agg_data_len;
+ size_t agg_size;
+ u32 am;
+
+ agg = (struct opa_aggregate *)next_smp;
+ agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+ agg_size = sizeof(*agg) + agg_data_len;
+ am = be32_to_cpu(agg->attr_mod);
+
+ *resp_len += agg_size;
+
+ if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* zero the payload for this segment */
+ memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+ (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+ ibdev, port, NULL);
+ if (smp->status & ~IB_SMP_DIRECTION) {
+ set_aggr_error(agg);
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ next_smp += agg_size;
+
+ }
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+ u8 *next_smp = opa_get_smp_data(smp);
+
+ if (num_attr < 1 || num_attr > 117) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < num_attr; i++) {
+ struct opa_aggregate *agg;
+ size_t agg_data_len;
+ size_t agg_size;
+ u32 am;
+
+ agg = (struct opa_aggregate *)next_smp;
+ agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+ agg_size = sizeof(*agg) + agg_data_len;
+ am = be32_to_cpu(agg->attr_mod);
+
+ *resp_len += agg_size;
+
+ if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+ ibdev, port, NULL);
+ if (smp->status & ~IB_SMP_DIRECTION) {
+ set_aggr_error(agg);
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ next_smp += agg_size;
+
+ }
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ * PortRcvErrors [*]
+ * LinkErrorRecovery
+ * LocalLinkIntegrityErrors
+ * ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+ /* PortRcvErrors */
+ write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+ dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+ /* LinkErrorRecovery */
+ write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+ /* LocalLinkIntegrityErrors */
+ write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+ /* ExcessiveBufferOverruns */
+ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+ dd->rcv_ovfl_cnt = 0;
+ dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() returns 1 if 'mad' is sent from, and destined to the
+ * local node, 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+ const struct ib_wc *in_wc)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ return (smp->hop_cnt == 0 &&
+ smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
+ smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
+ }
+
+ return (in_wc->slid == ppd->lid);
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
+ * otherwise.
+ *
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+ const struct ib_wc *in_wc)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 slid = in_wc->slid;
+ u16 pkey;
+
+ if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+ return 1;
+
+ pkey = ppd->pkeys[in_wc->pkey_index];
+ /*
+ * We need to do the "node-local" checks specified in OPAv1,
+ * rev 0.90, section 9.10.26, which are:
+ * - pkey is 0x7fff, or 0xffff
+ * - Source QPN == 0 || Destination QPN == 0
+ * - the MAD header's management class is either
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+ * IB_MGMT_CLASS_SUBN_LID_ROUTED
+ * - SLID != 0
+ *
+ * However, we know (and so don't need to check again) that,
+ * for local SMPs, the MAD stack passes MADs with:
+ * - Source QPN of 0
+ * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+ * our own port's lid
+ *
+ */
+ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+ return 0;
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct opa_mad *in_mad,
+ struct opa_mad *out_mad,
+ u32 *resp_len)
+{
+ struct opa_smp *smp = (struct opa_smp *)out_mad;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *data;
+ u32 am;
+ __be16 attr_id;
+ int ret;
+
+ *out_mad = *in_mad;
+ data = opa_get_smp_data(smp);
+
+ am = be32_to_cpu(smp->attr_mod);
+ attr_id = smp->attr_id;
+ if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_mad_hdr *)smp);
+ goto bail;
+ }
+ ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
+ smp->route.dr.dr_slid, smp->route.dr.return_path,
+ smp->hop_cnt);
+ if (ret) {
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ /*
+ * If this is a get/set portinfo, we already check the
+ * M_Key if the MAD is for another port and the M_Key
+ * is OK on the receiving port. This check is needed
+ * to increment the error counters when the M_Key
+ * fails to match on *both* ports.
+ */
+ if (attr_id == IB_SMP_ATTR_PORT_INFO &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET) &&
+ port_num && port_num <= ibdev->phys_port_cnt &&
+ port != port_num)
+ (void) check_mkey(to_iport(ibdev, port_num),
+ (struct ib_mad_hdr *)smp, 0,
+ smp->mkey, smp->route.dr.dr_slid,
+ smp->route.dr.return_path,
+ smp->hop_cnt);
+ ret = IB_MAD_RESULT_FAILURE;
+ goto bail;
+ }
+
+ *resp_len = opa_get_smp_header_size(smp);
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (attr_id) {
+ default:
+ clear_opa_smp_data(smp);
+ ret = subn_get_opa_sma(attr_id, smp, am, data,
+ ibdev, port, resp_len);
+ goto bail;
+ case OPA_ATTRIB_ID_AGGREGATE:
+ ret = subn_get_opa_aggregate(smp, ibdev, port,
+ resp_len);
+ goto bail;
+ }
+ case IB_MGMT_METHOD_SET:
+ switch (attr_id) {
+ default:
+ ret = subn_set_opa_sma(attr_id, smp, am, data,
+ ibdev, port, resp_len);
+ goto bail;
+ case OPA_ATTRIB_ID_AGGREGATE:
+ ret = subn_set_opa_aggregate(smp, ibdev, port,
+ resp_len);
+ goto bail;
+ }
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ default:
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_mad_hdr *)smp);
+ }
+
+bail:
+ return ret;
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_smp *smp = (struct ib_smp *)out_mad;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ int ret;
+
+ *out_mad = *in_mad;
+ if (smp->class_version != 1) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_mad_hdr *)smp);
+ goto bail;
+ }
+
+ ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
+ smp->mkey, (__force __be32)smp->dr_slid,
+ smp->return_path, smp->hop_cnt);
+ if (ret) {
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ /*
+ * If this is a get/set portinfo, we already check the
+ * M_Key if the MAD is for another port and the M_Key
+ * is OK on the receiving port. This check is needed
+ * to increment the error counters when the M_Key
+ * fails to match on *both* ports.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET) &&
+ port_num && port_num <= ibdev->phys_port_cnt &&
+ port != port_num)
+ (void) check_mkey(to_iport(ibdev, port_num),
+ (struct ib_mad_hdr *)smp, 0,
+ smp->mkey,
+ (__force __be32)smp->dr_slid,
+ smp->return_path, smp->hop_cnt);
+ ret = IB_MAD_RESULT_FAILURE;
+ goto bail;
+ }
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = subn_get_nodeinfo(smp, ibdev, port);
+ goto bail;
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ goto bail;
+ }
+ }
+
+bail:
+ return ret;
+}
+
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+ const struct opa_mad *in_mad,
+ struct opa_mad *out_mad, u32 *resp_len)
+{
+ struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+ int ret;
+
+ *out_mad = *in_mad;
+
+ if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ *resp_len = sizeof(pmp->mad_hdr);
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_CLASS_PORT_INFO:
+ ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+ goto bail;
+ case OPA_PM_ATTRIB_ID_PORT_STATUS:
+ ret = pma_get_opa_portstatus(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+ ret = pma_get_opa_datacounters(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+ ret = pma_get_opa_porterrors(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ case OPA_PM_ATTRIB_ID_ERROR_INFO:
+ ret = pma_get_opa_errorinfo(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (pmp->mad_hdr.attr_id) {
+ case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+ ret = pma_set_opa_portstatus(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ case OPA_PM_ATTRIB_ID_ERROR_INFO:
+ ret = pma_set_opa_errorinfo(pmp, ibdev, port,
+ resp_len);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ }
+
+bail:
+ return ret;
+}
+
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct opa_mad *in_mad,
+ struct opa_mad *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ int ret;
+ int pkey_idx;
+ u32 resp_len = 0;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+ if (pkey_idx < 0) {
+ pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+ hfi1_get_pkey(ibp, 1));
+ pkey_idx = 1;
+ }
+ *out_mad_pkey_index = (u16)pkey_idx;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ if (is_local_mad(ibp, in_mad, in_wc)) {
+ ret = opa_local_smp_check(ibp, in_wc);
+ if (ret)
+ return IB_MAD_RESULT_FAILURE;
+ }
+ ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+ out_mad, &resp_len);
+ goto bail;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+ &resp_len);
+ goto bail;
+
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ if (ret & IB_MAD_RESULT_REPLY)
+ *out_mad_size = round_up(resp_len, 8);
+ else if (ret & IB_MAD_RESULT_SUCCESS)
+ *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+ return ret;
+}
+
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ int ret;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+ goto bail;
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+ struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ switch (in_mad->base_version) {
+ case OPA_MGMT_BASE_VERSION:
+ if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+ dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+ return IB_MAD_RESULT_FAILURE;
+ }
+ return hfi1_process_opa_mad(ibdev, mad_flags, port,
+ in_wc, in_grh,
+ (struct opa_mad *)in_mad,
+ (struct opa_mad *)out_mad,
+ out_mad_size,
+ out_mad_pkey_index);
+ case IB_MGMT_BASE_VERSION:
+ return hfi1_process_ib_mad(ibdev, mad_flags, port,
+ in_wc, in_grh,
+ (const struct ib_mad *)in_mad,
+ (struct ib_mad *)out_mad);
+ default:
+ break;
+ }
+
+ return IB_MAD_RESULT_FAILURE;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int hfi1_create_agents(struct hfi1_ibdev *dev)
+{
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ struct ib_mad_agent *agent;
+ struct hfi1_ibport *ibp;
+ int p;
+ int ret;
+
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+
+ ibp->send_agent = agent;
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ if (ibp->send_agent) {
+ agent = ibp->send_agent;
+ ibp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ }
+
+ return ret;
+}
+
+void hfi1_free_agents(struct hfi1_ibdev *dev)
+{
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ struct ib_mad_agent *agent;
+ struct hfi1_ibport *ibp;
+ int p;
+
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ if (ibp->send_agent) {
+ agent = ibp->send_agent;
+ ibp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ if (ibp->sm_ah) {
+ ib_destroy_ah(&ibp->sm_ah->ibah);
+ ibp->sm_ah = NULL;
+ }
+ }
+}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
new file mode 100644
index 000000000000..47457501c044
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mad.h
@@ -0,0 +1,325 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#define USE_PI_LED_ENABLE 1 /* use led enabled bit in struct
+ * opa_port_states, if available */
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#ifndef PI_LED_ENABLE_SUP
+#define PI_LED_ENABLE_SUP 0
+#endif
+#include "opa_compat.h"
+
+
+
+#define IB_VLARB_LOWPRI_0_31 1
+#define IB_VLARB_LOWPRI_32_63 2
+#define IB_VLARB_HIGHPRI_0_31 3
+#define IB_VLARB_HIGHPRI_32_63 4
+
+#define OPA_MAX_PREEMPT_CAP 32
+#define OPA_VLARB_LOW_ELEMENTS 0
+#define OPA_VLARB_HIGH_ELEMENTS 1
+#define OPA_VLARB_PREEMPT_ELEMENTS 2
+#define OPA_VLARB_PREEMPT_MATRIX 3
+
+#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+ u8 reserved;
+ u8 reserved1;
+ __be16 port_check_rate;
+ __be16 symbol_error_counter;
+ u8 link_error_recovery_counter;
+ u8 link_downed_counter;
+ __be16 port_rcv_errors;
+ __be16 port_rcv_remphys_errors;
+ __be16 port_rcv_switch_relay_errors;
+ __be16 port_xmit_discards;
+ u8 port_xmit_constraint_errors;
+ u8 port_rcv_constraint_errors;
+ u8 reserved2;
+ u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+ __be16 reserved3;
+ __be16 vl15_dropped;
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_packets;
+ __be64 port_rcv_packets;
+ __be64 port_xmit_wait;
+ __be64 port_adr_events;
+} __packed;
+
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
+
+#define OPA_MAX_PREEMPT_CAP 32
+#define OPA_VLARB_LOW_ELEMENTS 0
+#define OPA_VLARB_HIGH_ELEMENTS 1
+#define OPA_VLARB_PREEMPT_ELEMENTS 2
+#define OPA_VLARB_PREEMPT_MATRIX 3
+
+#define HFI1_XMIT_RATE_UNSUPPORTED 0x0
+#define HFI1_XMIT_RATE_PICO 0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC
+
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI 2
+
+struct opa_hfi1_cong_log_event_internal {
+ u32 lqpn;
+ u32 rqpn;
+ u8 sl;
+ u8 svc_type;
+ u32 rlid;
+ s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
+struct opa_hfi1_cong_log_event {
+ u8 local_qp_cn_entry[3];
+ u8 remote_qp_number_cn_entry[3];
+ u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+ u8 reserved;
+ __be32 remote_lid_cn_entry;
+ __be32 timestamp_cn_entry;
+} __packed;
+
+#define OPA_CONG_LOG_ELEMS 96
+
+struct opa_hfi1_cong_log {
+ u8 log_type;
+ u8 congestion_flags;
+ __be16 threshold_event_counter;
+ __be32 current_time_stamp;
+ u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+ struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct opa_congestion_setting_entry {
+ u8 ccti_increase;
+ u8 reserved;
+ __be16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_entry_shadow {
+ u8 ccti_increase;
+ u8 reserved;
+ u16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_attr {
+ __be32 control_map;
+ __be16 port_control;
+ struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+struct opa_congestion_setting_attr_shadow {
+ u32 control_map;
+ u16 port_control;
+ struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+ __be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+ u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+ __be16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+ u16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+ (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+ u16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+ struct rcu_head rcu;
+ struct cc_table_shadow cct;
+ struct opa_congestion_setting_attr_shadow cong_setting;
+};
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/* attribute modifier macros */
+#define OPA_AM_NPORT_SHIFT 24
+#define OPA_AM_NPORT_MASK 0xff
+#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \
+ OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT 24
+#define OPA_AM_NBLK_MASK 0xff
+#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \
+ OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT 0
+#define OPA_AM_START_BLK_MASK 0xff
+#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
+ OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \
+ OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT 0
+#define OPA_AM_PORTNUM_MASK 0xff
+#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \
+ OPA_AM_PORTNUM_MASK)
+
+#define OPA_AM_ASYNC_SHIFT 12
+#define OPA_AM_ASYNC_MASK 0x1
+#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \
+ OPA_AM_ASYNC_MASK)
+
+#define OPA_AM_START_SM_CFG_SHIFT 9
+#define OPA_AM_START_SM_CFG_MASK 0x1
+#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \
+ OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+ & OPA_AM_START_SM_CFG_MASK)
+
+#define OPA_AM_CI_ADDR_SHIFT 19
+#define OPA_AM_CI_ADDR_MASK 0xfff
+#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+ OPA_AM_CI_ADDR_MASK)
+
+#define OPA_AM_CI_LEN_SHIFT 13
+#define OPA_AM_CI_LEN_MASK 0x3f
+#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \
+ OPA_AM_CI_LEN_MASK)
+
+/* error info macros */
+#define OPA_EI_STATUS_SMASK 0x80
+#define OPA_EI_CODE_SMASK 0x0f
+
+struct vl_limit {
+ __be16 dedicated;
+ __be16 shared;
+};
+
+struct buffer_control {
+ __be16 reserved;
+ __be16 overall_shared_limit;
+ struct vl_limit vl[OPA_MAX_VLS];
+};
+
+struct sc2vlnt {
+ u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+ cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
+
+#endif /* _HFI1_MAD_H */
diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/staging/rdma/hfi1/mmap.c
new file mode 100644
index 000000000000..5173b1c60b3d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mmap.c
@@ -0,0 +1,192 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct hfi1_mmap_info
+ */
+void hfi1_release_mmap_info(struct kref *ref)
+{
+ struct hfi1_mmap_info *ip =
+ container_of(ref, struct hfi1_mmap_info, ref);
+ struct hfi1_ibdev *dev = to_idev(ip->context->device);
+
+ spin_lock_irq(&dev->pending_lock);
+ list_del(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ vfree(ip->obj);
+ kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void hfi1_vma_open(struct vm_area_struct *vma)
+{
+ struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void hfi1_vma_close(struct vm_area_struct *vma)
+{
+ struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, hfi1_release_mmap_info);
+}
+
+static struct vm_operations_struct hfi1_vm_ops = {
+ .open = hfi1_vma_open,
+ .close = hfi1_vma_close,
+};
+
+/**
+ * hfi1_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct hfi1_ibdev *dev = to_idev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct hfi1_mmap_info *ip, *pp;
+ int ret = -EINVAL;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_irq(&dev->pending_lock);
+ list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+ pending_mmaps) {
+ /* Only the creator is allowed to mmap the object */
+ if (context != ip->context || (__u64) offset != ip->offset)
+ continue;
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->size)
+ break;
+
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret)
+ goto done;
+ vma->vm_ops = &hfi1_vm_ops;
+ vma->vm_private_data = ip;
+ hfi1_vma_open(vma);
+ goto done;
+ }
+ spin_unlock_irq(&dev->pending_lock);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for hfi1_mmap
+ */
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj) {
+ struct hfi1_mmap_info *ip;
+
+ ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+ if (!ip)
+ goto bail;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->size = size;
+ ip->context = context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+bail:
+ return ip;
+}
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+ u32 size, void *obj)
+{
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ ip->size = size;
+ ip->obj = obj;
+}
diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
new file mode 100644
index 000000000000..bd64e4f986f9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/mr.c
@@ -0,0 +1,551 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+
+/* Fast memory region */
+struct hfi1_fmr {
+ struct ib_fmr ibfmr;
+ struct hfi1_mregion mr; /* must be last */
+};
+
+static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct hfi1_fmr, ibfmr);
+}
+
+static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd,
+ int count)
+{
+ int m, i = 0;
+ int rval = 0;
+
+ m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+ for (; i < m; i++) {
+ mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+ if (!mr->map[i])
+ goto bail;
+ }
+ mr->mapsz = m;
+ init_completion(&mr->comp);
+ /* count returning the ptr to user */
+ atomic_set(&mr->refcount, 1);
+ mr->pd = pd;
+ mr->max_segs = count;
+out:
+ return rval;
+bail:
+ while (i)
+ kfree(mr->map[--i]);
+ rval = -ENOMEM;
+ goto out;
+}
+
+static void deinit_mregion(struct hfi1_mregion *mr)
+{
+ int i = mr->mapsz;
+
+ mr->mapsz = 0;
+ while (i)
+ kfree(mr->map[--i]);
+}
+
+
+/**
+ * hfi1_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see dma.c).
+ */
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct hfi1_mr *mr = NULL;
+ struct ib_mr *ret;
+ int rval;
+
+ if (to_ipd(pd)->user) {
+ ret = ERR_PTR(-EPERM);
+ goto bail;
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ rval = init_mregion(&mr->mr, pd, 0);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail;
+ }
+
+
+ rval = hfi1_alloc_lkey(&mr->mr, 1);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail_mregion;
+ }
+
+ mr->mr.access_flags = acc;
+ ret = &mr->ibmr;
+done:
+ return ret;
+
+bail_mregion:
+ deinit_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ goto done;
+}
+
+static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
+{
+ struct hfi1_mr *mr;
+ int rval = -ENOMEM;
+ int m;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+ mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
+ if (!mr)
+ goto bail;
+
+ rval = init_mregion(&mr->mr, pd, count);
+ if (rval)
+ goto bail;
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ rval = hfi1_alloc_lkey(&mr->mr, 0);
+ if (rval)
+ goto bail_mregion;
+ mr->ibmr.lkey = mr->mr.lkey;
+ mr->ibmr.rkey = mr->mr.lkey;
+done:
+ return mr;
+
+bail_mregion:
+ deinit_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ mr = ERR_PTR(rval);
+ goto done;
+}
+
+/**
+ * hfi1_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+ struct hfi1_mr *mr;
+ int n, m, i;
+ struct ib_mr *ret;
+
+ mr = alloc_mr(num_phys_buf, pd);
+ if (IS_ERR(mr)) {
+ ret = (struct ib_mr *)mr;
+ goto bail;
+ }
+
+ mr->mr.user_base = *iova_start;
+ mr->mr.iova = *iova_start;
+ mr->mr.access_flags = acc;
+
+ m = 0;
+ n = 0;
+ for (i = 0; i < num_phys_buf; i++) {
+ mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+ mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+ mr->mr.length += buffer_list[i].size;
+ n++;
+ if (n == HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct hfi1_mr *mr;
+ struct ib_umem *umem;
+ struct scatterlist *sg;
+ int n, m, entry;
+ struct ib_mr *ret;
+
+ if (length == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(umem))
+ return (void *) umem;
+
+ n = umem->nmap;
+
+ mr = alloc_mr(n, pd);
+ if (IS_ERR(mr)) {
+ ret = (struct ib_mr *)mr;
+ ib_umem_release(umem);
+ goto bail;
+ }
+
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = ib_umem_offset(umem);
+ mr->mr.access_flags = mr_access_flags;
+ mr->umem = umem;
+
+ if (is_power_of_2(umem->page_size))
+ mr->mr.page_shift = ilog2(umem->page_size);
+ m = 0;
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ void *vaddr;
+
+ vaddr = page_address(sg_page(sg));
+ if (!vaddr) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ mr->mr.map[m]->segs[n].vaddr = vaddr;
+ mr->mr.map[m]->segs[n].length = umem->page_size;
+ n++;
+ if (n == HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by hfi1_get_dma_mr()
+ * or hfi1_reg_user_mr().
+ */
+int hfi1_dereg_mr(struct ib_mr *ibmr)
+{
+ struct hfi1_mr *mr = to_imr(ibmr);
+ int ret = 0;
+ unsigned long timeout;
+
+ hfi1_free_lkey(&mr->mr);
+
+ hfi1_put_mr(&mr->mr); /* will set completion if last */
+ timeout = wait_for_completion_timeout(&mr->mr.comp,
+ 5 * HZ);
+ if (!timeout) {
+ dd_dev_err(
+ dd_from_ibdev(mr->mr.pd->device),
+ "hfi1_dereg_mr timeout mr %p pd %p refcount %u\n",
+ mr, mr->mr.pd, atomic_read(&mr->mr.refcount));
+ hfi1_get_mr(&mr->mr);
+ ret = -EBUSY;
+ goto out;
+ }
+ deinit_mregion(&mr->mr);
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+out:
+ return ret;
+}
+
+/*
+ * Allocate a memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ *
+ * Return the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct hfi1_mr *mr;
+
+ if (mr_type != IB_MR_TYPE_MEM_REG)
+ return ERR_PTR(-EINVAL);
+
+ mr = alloc_mr(max_num_sg, pd);
+ if (IS_ERR(mr))
+ return (struct ib_mr *)mr;
+
+ return &mr->ibmr;
+}
+
+struct ib_fast_reg_page_list *
+hfi1_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
+{
+ unsigned size = page_list_len * sizeof(u64);
+ struct ib_fast_reg_page_list *pl;
+
+ if (size > PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return ERR_PTR(-ENOMEM);
+
+ pl->page_list = kzalloc(size, GFP_KERNEL);
+ if (!pl->page_list)
+ goto err_free;
+
+ return pl;
+
+err_free:
+ kfree(pl);
+ return ERR_PTR(-ENOMEM);
+}
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+ kfree(pl->page_list);
+ kfree(pl);
+}
+
+/**
+ * hfi1_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct hfi1_fmr *fmr;
+ int m;
+ struct ib_fmr *ret;
+ int rval = -ENOMEM;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+ fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
+ if (!fmr)
+ goto bail;
+
+ rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages);
+ if (rval)
+ goto bail;
+
+ /*
+ * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+ * rkey.
+ */
+ rval = hfi1_alloc_lkey(&fmr->mr, 0);
+ if (rval)
+ goto bail_mregion;
+ fmr->ibfmr.rkey = fmr->mr.lkey;
+ fmr->ibfmr.lkey = fmr->mr.lkey;
+ /*
+ * Resources are allocated but no valid mapping (RKEY can't be
+ * used).
+ */
+ fmr->mr.access_flags = mr_access_flags;
+ fmr->mr.max_segs = fmr_attr->max_pages;
+ fmr->mr.page_shift = fmr_attr->page_shift;
+
+ ret = &fmr->ibfmr;
+done:
+ return ret;
+
+bail_mregion:
+ deinit_mregion(&fmr->mr);
+bail:
+ kfree(fmr);
+ ret = ERR_PTR(rval);
+ goto done;
+}
+
+/**
+ * hfi1_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+ struct hfi1_lkey_table *rkt;
+ unsigned long flags;
+ int m, n, i;
+ u32 ps;
+ int ret;
+
+ i = atomic_read(&fmr->mr.refcount);
+ if (i > 2)
+ return -EBUSY;
+
+ if (list_len > fmr->mr.max_segs) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ rkt = &to_idev(ibfmr->device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = iova;
+ fmr->mr.iova = iova;
+ ps = 1 << fmr->mr.page_shift;
+ fmr->mr.length = list_len * ps;
+ m = 0;
+ n = 0;
+ for (i = 0; i < list_len; i++) {
+ fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+ fmr->mr.map[m]->segs[n].length = ps;
+ if (++n == HFI1_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int hfi1_unmap_fmr(struct list_head *fmr_list)
+{
+ struct hfi1_fmr *fmr;
+ struct hfi1_lkey_table *rkt;
+ unsigned long flags;
+
+ list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+ rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * hfi1_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+ struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+ int ret = 0;
+ unsigned long timeout;
+
+ hfi1_free_lkey(&fmr->mr);
+ hfi1_put_mr(&fmr->mr); /* will set completion if last */
+ timeout = wait_for_completion_timeout(&fmr->mr.comp,
+ 5 * HZ);
+ if (!timeout) {
+ hfi1_get_mr(&fmr->mr);
+ ret = -EBUSY;
+ goto out;
+ }
+ deinit_mregion(&fmr->mr);
+ kfree(fmr);
+out:
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
new file mode 100644
index 000000000000..f64eec1c2951
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/opa_compat.h
@@ -0,0 +1,129 @@
+#ifndef _LINUX_H
+#define _LINUX_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs */
+#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044)
+
+/* OPA status codes */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100)
+
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+ return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
+}
+
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+ return ((ps->portphysstate_portstate &
+ OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ */
+enum opa_port_phys_state {
+ IB_PORTPHYSSTATE_NOP = 0,
+ /* 1 is reserved */
+ IB_PORTPHYSSTATE_POLLING = 2,
+ IB_PORTPHYSSTATE_DISABLED = 3,
+ IB_PORTPHYSSTATE_TRAINING = 4,
+ IB_PORTPHYSSTATE_LINKUP = 5,
+ IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+ IB_PORTPHYSSTATE_PHY_TEST = 7,
+ /* 8 is reserved */
+ OPA_PORTPHYSSTATE_OFFLINE = 9,
+ OPA_PORTPHYSSTATE_GANGED = 10,
+ OPA_PORTPHYSSTATE_TEST = 11,
+ OPA_PORTPHYSSTATE_MAX = 11,
+ /* values 12-15 are reserved/ignored */
+};
+
+/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */
+#define OPA_PORT_TYPE_UNKNOWN 0
+#define OPA_PORT_TYPE_DISCONNECTED 1
+/* port is not currently usable, CableInfo not available */
+#define OPA_PORT_TYPE_FIXED 2
+/* A fixed backplane port in a director class switch. All OPA ASICS */
+#define OPA_PORT_TYPE_VARIABLE 3
+/* A backplane port in a blade system, possibly mixed configuration */
+#define OPA_PORT_TYPE_STANDARD 4
+/* implies a SFF-8636 defined format for CableInfo (QSFP) */
+#define OPA_PORT_TYPE_SI_PHOTONICS 5
+/* A silicon photonics module implies TBD defined format for CableInfo
+ * as defined by Intel SFO group */
+/* 6 - 15 are reserved */
+
+#endif /* _LINUX_H */
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
new file mode 100644
index 000000000000..ac5653c0f65e
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pcie.c
@@ -0,0 +1,1253 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+
+/* link speed vector for Gen3 speed - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success. Therefore dd_dev_err() can't be used for error
+ * printing.
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ /*
+ * This can happen (in theory) iff:
+ * We did a chip reset, and then failed to reprogram the
+ * BAR, or the chip reset due to an internal error. We then
+ * unloaded the driver and reloaded it.
+ *
+ * Both reset cases set the BAR back to initial state. For
+ * the latter case, the AER sticky error bit at offset 0x718
+ * should be set, but the Linux kernel doesn't yet know
+ * about that, it appears. If the original BAR was retained
+ * in the kernel data structures, this may be OK.
+ */
+ hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+ -ret);
+ goto done;
+ }
+
+ ret = pci_request_regions(pdev, DRIVER_NAME);
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "pci_request_regions fails: err %d\n", -ret);
+ goto bail;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ /*
+ * If the 64 bit setup fails, try 32 bit. Some systems
+ * do not setup 64 bit maps on systems with 2GB or less
+ * memory installed.
+ */
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "Unable to set DMA mask: %d\n", ret);
+ goto bail;
+ }
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ } else
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "Unable to set DMA consistent mask: %d\n", ret);
+ goto bail;
+ }
+
+ pci_set_master(pdev);
+ ret = pci_enable_pcie_error_reporting(pdev);
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "Unable to enable pcie error reporting: %d\n",
+ ret);
+ ret = 0;
+ }
+ goto done;
+
+bail:
+ hfi1_pcie_cleanup(pdev);
+done:
+ return ret;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+ pci_disable_device(pdev);
+ /*
+ * Release regions should be called after the disable. OK to
+ * call if request regions has not been called or failed.
+ */
+ pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ unsigned long len;
+ resource_size_t addr;
+
+ dd->pcidev = pdev;
+ pci_set_drvdata(pdev, dd);
+
+ addr = pci_resource_start(pdev, 0);
+ len = pci_resource_len(pdev, 0);
+
+ /*
+ * The TXE PIO buffers are at the tail end of the chip space.
+ * Cut them off and map them separately.
+ */
+
+ /* sanity check vs expectations */
+ if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+ dd_dev_err(dd, "chip PIO range does not match\n");
+ return -EINVAL;
+ }
+
+ dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+ if (!dd->kregbase)
+ return -ENOMEM;
+
+ dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+ if (!dd->piobase) {
+ iounmap(dd->kregbase);
+ return -ENOMEM;
+ }
+
+ dd->flags |= HFI1_PRESENT; /* now register routines work */
+
+ dd->kregend = dd->kregbase + TXE_PIO_SEND;
+ dd->physaddr = addr; /* used for io_remap, etc. */
+
+ /*
+ * Re-map the chip's RcvArray as write-combining to allow us
+ * to write an entire cacheline worth of entries in one shot.
+ * If this re-map fails, just continue - the RcvArray programming
+ * function will handle both cases.
+ */
+ dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+ dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+ dd->chip_rcv_array_count * 8);
+ dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+ /*
+ * Save BARs and command to rewrite after device reset.
+ */
+ dd->pcibar0 = addr;
+ dd->pcibar1 = addr >> 32;
+ pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+ pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+ &dd->pcie_devctl2);
+ pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+ &dd->pci_lnkctl3);
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+ return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+ u64 __iomem *base = (void __iomem *) dd->kregbase;
+
+ dd->flags &= ~HFI1_PRESENT;
+ dd->kregbase = NULL;
+ iounmap(base);
+ if (dd->rcvarray_wc)
+ iounmap(dd->rcvarray_wc);
+ if (dd->piobase)
+ iounmap(dd->piobase);
+
+ pci_set_drvdata(dd->pcidev, NULL);
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+ int i;
+ u16 status;
+
+ /* no need to check for the capability - we know the device has it */
+
+ /* wait for Transaction Pending bit to clear, at most a few ms */
+ for (i = 0; i < 4; i++) {
+ if (i)
+ msleep((1 << (i - 1)) * 100);
+
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
+ if (!(status & PCI_EXP_DEVSTA_TRPND))
+ goto clear;
+ }
+
+ dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+clear:
+ pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+ PCI_EXP_DEVCTL_BCR_FLR);
+ /* PCIe spec requires the function to be back within 100ms */
+ msleep(100);
+}
+
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+ struct hfi1_msix_entry *hfi1_msix_entry)
+{
+ int ret;
+ int nvec = *msixcnt;
+ struct msix_entry *msix_entry;
+ int i;
+
+ /* We can't pass hfi1_msix_entry array to msix_setup
+ * so use a dummy msix_entry array and copy the allocated
+ * irq back to the hfi1_msix_entry array. */
+ msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
+ if (!msix_entry) {
+ ret = -ENOMEM;
+ goto do_intx;
+ }
+
+ for (i = 0; i < nvec; i++)
+ msix_entry[i] = hfi1_msix_entry[i].msix;
+
+ ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+ if (ret < 0)
+ goto free_msix_entry;
+ nvec = ret;
+
+ for (i = 0; i < nvec; i++)
+ hfi1_msix_entry[i].msix = msix_entry[i];
+
+ kfree(msix_entry);
+ *msixcnt = nvec;
+ return;
+
+free_msix_entry:
+ kfree(msix_entry);
+
+do_intx:
+ dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+ nvec, ret);
+ *msixcnt = 0;
+ hfi1_enable_intx(dd->pcidev);
+
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_speed(u16 linkstat)
+{
+ u32 speed;
+
+ switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+ default: /* not defined, assume Gen1 */
+ case PCI_EXP_LNKSTA_CLS_2_5GB:
+ speed = 2500; /* Gen 1, 2.5GHz */
+ break;
+ case PCI_EXP_LNKSTA_CLS_5_0GB:
+ speed = 5000; /* Gen 2, 5GHz */
+ break;
+ case GEN3_SPEED_VECTOR:
+ speed = 8000; /* Gen 3, 8GHz */
+ break;
+ }
+ return speed;
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+ return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+ u16 linkstat;
+
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+ dd->lbus_width = extract_width(linkstat);
+ dd->lbus_speed = extract_speed(linkstat);
+ snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+ "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Read in the current PCIe link width and speed. Find if the link is
+ * Gen3 capable.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+ u32 linkcap;
+
+ if (!pci_is_pcie(dd->pcidev)) {
+ dd_dev_err(dd, "Can't find PCI Express capability!\n");
+ return -EINVAL;
+ }
+
+ /* find if our max speed is Gen3 and parent supports Gen3 speeds */
+ dd->link_gen3_capable = 1;
+
+ pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+ if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
+ dd_dev_info(dd,
+ "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+ linkcap & PCI_EXP_LNKCAP_SLS);
+ dd->link_gen3_capable = 0;
+ }
+
+ /*
+ * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+ */
+ if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+ dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+ dd->link_gen3_capable = 0;
+ }
+
+ /* obtain the link width and current speed */
+ update_lbus_info(dd);
+
+ /* check against expected pcie width and complain if "wrong" */
+ if (dd->lbus_width < 16)
+ dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width);
+
+ return 0;
+}
+
+/*
+ * Returns in *nent:
+ * - actual number of interrupts allocated
+ * - 0 if fell back to INTx.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+ struct hfi1_msix_entry *entry)
+{
+ int pos;
+
+ pos = dd->pcidev->msix_cap;
+ if (*nent && pos) {
+ msix_setup(dd, pos, nent, entry);
+ /* did it, either MSI-X or INTx */
+ } else {
+ *nent = 0;
+ hfi1_enable_intx(dd->pcidev);
+ }
+
+ tune_pcie_caps(dd);
+}
+
+/*
+ * Disable MSI-X.
+ */
+void hfi1_nomsix(struct hfi1_devdata *dd)
+{
+ pci_disable_msix(dd->pcidev);
+}
+
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+ /* first, turn on INTx */
+ pci_intx(pdev, 1);
+ /* then turn off MSI-X */
+ pci_disable_msix(pdev);
+}
+
+/* restore command and BARs after a reset has wiped them out */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+ pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+ pci_write_config_dword(dd->pcidev,
+ PCI_BASE_ADDRESS_0, dd->pcibar0);
+ pci_write_config_dword(dd->pcidev,
+ PCI_BASE_ADDRESS_1, dd->pcibar1);
+ pci_write_config_dword(dd->pcidev,
+ PCI_ROM_ADDRESS, dd->pci_rom);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+ dd->pcie_devctl2);
+ pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+ dd->pci_lnkctl3);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent;
+ u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+ u16 rc_mrrs, ep_mrrs, max_mrrs;
+
+ /* Find out supported and configured values for parent (root) */
+ parent = dd->pcidev->bus->self;
+ if (!pci_is_root_bus(parent->bus)) {
+ dd_dev_info(dd, "Parent not root\n");
+ return;
+ }
+
+ if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+ return;
+ rc_mpss = parent->pcie_mpss;
+ rc_mps = ffs(pcie_get_mps(parent)) - 8;
+ /* Find out supported and configured values for endpoint (us) */
+ ep_mpss = dd->pcidev->pcie_mpss;
+ ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+ /* Find max payload supported by root, endpoint */
+ if (rc_mpss > ep_mpss)
+ rc_mpss = ep_mpss;
+
+ /* If Supported greater than limit in module param, limit it */
+ if (rc_mpss > (hfi1_pcie_caps & 7))
+ rc_mpss = hfi1_pcie_caps & 7;
+ /* If less than (allowed, supported), bump root payload */
+ if (rc_mpss > rc_mps) {
+ rc_mps = rc_mpss;
+ pcie_set_mps(parent, 128 << rc_mps);
+ }
+ /* If less than (allowed, supported), bump endpoint payload */
+ if (rc_mpss > ep_mps) {
+ ep_mps = rc_mpss;
+ pcie_set_mps(dd->pcidev, 128 << ep_mps);
+ }
+
+ /*
+ * Now the Read Request size.
+ * No field for max supported, but PCIe spec limits it to 4096,
+ * which is code '5' (log2(4096) - 7)
+ */
+ max_mrrs = 5;
+ if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
+ max_mrrs = (hfi1_pcie_caps >> 4) & 7;
+
+ max_mrrs = 128 << max_mrrs;
+ rc_mrrs = pcie_get_readrq(parent);
+ ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+ if (max_mrrs > rc_mrrs) {
+ rc_mrrs = max_mrrs;
+ pcie_set_readrq(parent, rc_mrrs);
+ }
+ if (max_mrrs > ep_mrrs) {
+ ep_mrrs = max_mrrs;
+ pcie_set_readrq(dd->pcidev, ep_mrrs);
+ }
+}
+/* End of PCIe capability tuning */
+
+/*
+ * From here through hfi1_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ switch (state) {
+ case pci_channel_io_normal:
+ dd_dev_info(dd, "State Normal, ignoring\n");
+ break;
+
+ case pci_channel_io_frozen:
+ dd_dev_info(dd, "State Frozen, requesting reset\n");
+ pci_disable_device(pdev);
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ break;
+
+ case pci_channel_io_perm_failure:
+ if (dd) {
+ dd_dev_info(dd, "State Permanent Failure, disabling\n");
+ /* no more register accesses! */
+ dd->flags &= ~HFI1_PRESENT;
+ hfi1_disable_after_error(dd);
+ }
+ /* else early, or other problem */
+ ret = PCI_ERS_RESULT_DISCONNECT;
+ break;
+
+ default: /* shouldn't happen */
+ dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+ state);
+ break;
+ }
+ return ret;
+}
+
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+ u64 words = 0U;
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ if (dd && dd->pport) {
+ words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+ if (words == ~0ULL)
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ dd_dev_info(dd,
+ "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+ words, ret);
+ }
+ return ret;
+}
+
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+pci_resume(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 resume function called\n");
+ pci_cleanup_aer_uncorrect_error_status(pdev);
+ /*
+ * Running jobs will fail, since it's asynchronous
+ * unlike sysfs-requested reset. Better than
+ * doing nothing.
+ */
+ hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers hfi1_pci_err_handler = {
+ .error_detected = pci_error_detected,
+ .mmio_enabled = pci_mmio_enabled,
+ .link_reset = pci_link_reset,
+ .slot_reset = pci_slot_reset,
+ .resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product. If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE 0x0 /* no error */
+#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */
+ /* or response data */
+#define DL_ERR_DISABLED 0x2 /* hfi disabled */
+#define DL_ERR_SECURITY 0x3 /* security check failed */
+#define DL_ERR_SBUS 0x4 /* SBus status error */
+#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000 /* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */
+#define DEFAULT_MCP_PSET 4 /* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
+
+/* equalization columns */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/* discrete silicon preliminary equalization values */
+static const u8 discrete_preliminary_eq[11][3] = {
+ /* prec attn post */
+ { 0x00, 0x00, 0x12 }, /* p0 */
+ { 0x00, 0x00, 0x0c }, /* p1 */
+ { 0x00, 0x00, 0x0f }, /* p2 */
+ { 0x00, 0x00, 0x09 }, /* p3 */
+ { 0x00, 0x00, 0x00 }, /* p4 */
+ { 0x06, 0x00, 0x00 }, /* p5 */
+ { 0x09, 0x00, 0x00 }, /* p6 */
+ { 0x06, 0x00, 0x0f }, /* p7 */
+ { 0x09, 0x00, 0x09 }, /* p8 */
+ { 0x0c, 0x00, 0x00 }, /* p9 */
+ { 0x00, 0x00, 0x18 }, /* p10 */
+};
+
+/* integrated silicon preliminary equalization values */
+static const u8 integrated_preliminary_eq[11][3] = {
+ /* prec attn post */
+ { 0x00, 0x1e, 0x07 }, /* p0 */
+ { 0x00, 0x1e, 0x05 }, /* p1 */
+ { 0x00, 0x1e, 0x06 }, /* p2 */
+ { 0x00, 0x1e, 0x04 }, /* p3 */
+ { 0x00, 0x1e, 0x00 }, /* p4 */
+ { 0x03, 0x1e, 0x00 }, /* p5 */
+ { 0x04, 0x1e, 0x00 }, /* p6 */
+ { 0x03, 0x1e, 0x06 }, /* p7 */
+ { 0x03, 0x1e, 0x04 }, /* p8 */
+ { 0x05, 0x1e, 0x00 }, /* p9 */
+ { 0x00, 0x1e, 0x0a }, /* p10 */
+};
+
+/* helper to format the value to write to hardware */
+#define eq_value(pre, curr, post) \
+ ((((u32)(pre)) << \
+ PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+ | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+ | (((u32)(post)) << \
+ PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+ u8 div)
+{
+ struct pci_dev *pdev = dd->pcidev;
+ u32 hit_error = 0;
+ u32 violation;
+ u32 i;
+ u8 c_minus1, c0, c_plus1;
+
+ for (i = 0; i < 11; i++) {
+ /* set index */
+ pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
+ /* write the value */
+ c_minus1 = eq[i][PREC] / div;
+ c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
+ c_plus1 = eq[i][POST] / div;
+ pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+ eq_value(c_minus1, c0, c_plus1));
+ /* check if these coefficients violate EQ rules */
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
+ &violation);
+ if (violation
+ & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
+ if (hit_error == 0) {
+ dd_dev_err(dd,
+ "Gen3 EQ Table Coefficient rule violations\n");
+ dd_dev_err(dd, " prec attn post\n");
+ }
+ dd_dev_err(dd, " p%02d: %02x %02x %02x\n",
+ i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]);
+ dd_dev_err(dd, " %02x %02x %02x\n",
+ (u32)c_minus1, (u32)c0, (u32)c_plus1);
+ hit_error = 1;
+ }
+ }
+ if (hit_error)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for the Pcie Gen3.
+ * The hardware mutex is already being held.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+ int i;
+
+ set_sbus_fast_mode(dd);
+ /*
+ * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+ * This avoids a spurious framing error that can otherwise be
+ * generated by the MAC layer.
+ *
+ * Use individual addresses since no broadcast is set up.
+ */
+ for (i = 0; i < NUM_PCIE_SERDES; i++) {
+ sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
+ 0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+ }
+
+ clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+ struct pci_dev *dev = dd->pcidev;
+ struct pci_dev *pdev;
+
+ /* need a parent */
+ if (!dev->bus->self) {
+ dd_dev_err(dd, "%s: no parent device\n", __func__);
+ return -ENOTTY;
+ }
+
+ /* should not be anyone else on the bus */
+ list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+ if (pdev != dev) {
+ dd_dev_err(dd,
+ "%s: another device is on the same bus\n",
+ __func__);
+ return -ENOTTY;
+ }
+
+ /*
+ * A secondary bus reset (SBR) issues a hot reset to our device.
+ * The following routine does a 1s wait after the reset is dropped
+ * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 -
+ * Conventional Reset, paragraph 3, line 35 also says that a 1s
+ * delay after a reset is required. Per spec requirements,
+ * the link is either working or not after that point.
+ */
+ pci_reset_bridge_secondary_bus(dev->bus->self);
+
+ return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+ u16 code, u16 data)
+{
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
+ (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT)
+ |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = (((u64)1 << dd->hfi1_id)
+ << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT)
+ | ((u64)pcie_serdes_broadcast[dd->hfi1_id]
+ << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT
+ | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK
+ | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK)
+ << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT
+ );
+ write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
+ /* read back to push the write */
+ read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent;
+ u64 fw_ctrl;
+ u64 reg, therm;
+ u32 reg32, fs, lf;
+ u32 status, err;
+ int ret;
+ int do_retry, retry_count = 0;
+ uint default_pset;
+ u16 target_vector, target_speed;
+ u16 lnkctl, lnkctl2, vendor;
+ u8 nsbr = 1;
+ u8 div;
+ const u8 (*eq)[3];
+ int return_error = 0;
+
+ /* PCIe Gen3 is for the ASIC only */
+ if (dd->icode != ICODE_RTL_SILICON)
+ return 0;
+
+ if (pcie_target == 1) { /* target Gen1 */
+ target_vector = GEN1_SPEED_VECTOR;
+ target_speed = 2500;
+ } else if (pcie_target == 2) { /* target Gen2 */
+ target_vector = GEN2_SPEED_VECTOR;
+ target_speed = 5000;
+ } else if (pcie_target == 3) { /* target Gen3 */
+ target_vector = GEN3_SPEED_VECTOR;
+ target_speed = 8000;
+ } else {
+ /* off or invalid target - skip */
+ dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+ return 0;
+ }
+
+ /* if already at target speed, done (unless forced) */
+ if (dd->lbus_speed == target_speed) {
+ dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+ pcie_target,
+ pcie_force ? "re-doing anyway" : "skipping");
+ if (!pcie_force)
+ return 0;
+ }
+
+ /*
+ * A0 needs an additional SBR
+ */
+ if (is_a0(dd))
+ nsbr++;
+
+ /*
+ * Do the Gen3 transition. Steps are those of the PCIe Gen3
+ * recipe.
+ */
+
+ /* step 1: pcie link working in gen1/gen2 */
+
+ /* step 2: if either side is not capable of Gen3, done */
+ if (pcie_target == 3 && !dd->link_gen3_capable) {
+ dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+ ret = -ENOSYS;
+ goto done_no_mutex;
+ }
+
+ /* hold the HW mutex across the firmware download and SBR */
+ ret = acquire_hw_mutex(dd);
+ if (ret)
+ return ret;
+
+ /* make sure thermal polling is not causing interrupts */
+ therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+ if (therm) {
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+ msleep(100);
+ dd_dev_info(dd, "%s: Disabled therm polling\n",
+ __func__);
+ }
+
+ /* step 3: download SBus Master firmware */
+ /* step 4: download PCIe Gen3 SerDes firmware */
+retry:
+ dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+ ret = load_pcie_firmware(dd);
+ if (ret)
+ goto done;
+
+ /* step 5: set up device parameter settings */
+ dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+ /*
+ * PcieCfgSpcie1 - Link Control 3
+ * Leave at reset value. No need to set PerfEq - link equalization
+ * will be performed automatically after the SBR when the target
+ * speed is 8GT/s.
+ */
+
+ /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+ /* step 5a: Set Synopsys Port Logic registers */
+
+ /*
+ * PcieCfgRegPl2 - Port Force Link
+ *
+ * Set the low power field to 0x10 to avoid unnecessary power
+ * management messages. All other fields are zero.
+ */
+ reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+ /*
+ * PcieCfgRegPl100 - Gen3 Control
+ *
+ * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+ * turn on PcieCfgRegPl100.EqEieosCnt (erratum)
+ * Everything else zero.
+ */
+ reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+ /*
+ * PcieCfgRegPl101 - Gen3 EQ FS and LF
+ * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+ * PcieCfgRegPl103 - Gen3 EQ Preset Index
+ * PcieCfgRegPl105 - Gen3 EQ Status
+ *
+ * Give initial EQ settings.
+ */
+ if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+ /* 1000mV, FS=24, LF = 8 */
+ fs = 24;
+ lf = 8;
+ div = 3;
+ eq = discrete_preliminary_eq;
+ default_pset = DEFAULT_DISCRETE_PSET;
+ } else {
+ /* 400mV, FS=29, LF = 9 */
+ fs = 29;
+ lf = 9;
+ div = 1;
+ eq = integrated_preliminary_eq;
+ default_pset = DEFAULT_MCP_PSET;
+ }
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+ (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT)
+ | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+ ret = load_eq_table(dd, eq, fs, div);
+ if (ret)
+ goto done;
+
+ /*
+ * PcieCfgRegPl106 - Gen3 EQ Control
+ *
+ * Set Gen3EqPsetReqVec, leave other fields 0.
+ */
+ if (pcie_pset == UNSET_PSET)
+ pcie_pset = default_pset;
+ if (pcie_pset > 10) { /* valid range is 0-10, inclusive */
+ dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+ __func__, pcie_pset, default_pset);
+ pcie_pset = default_pset;
+ }
+ dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+ ((1 << pcie_pset)
+ << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT)
+ | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK
+ | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+ /*
+ * step 5b: Do post firmware download steps via SBus
+ */
+ dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+ pcie_post_steps(dd);
+
+ /*
+ * step 5c: Program gasket interrupts
+ */
+ /* set the Rx Bit Rate to REFCLK ratio */
+ write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+ /* disable pCal for PCIe Gen3 RX equalization */
+ write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+ /*
+ * Enable iCal for PCIe Gen3 RX equalization, and set which
+ * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+ */
+ write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+ /* terminate list */
+ write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+
+ /*
+ * step 5d: program XMT margin
+ * Right now, leave the default alone. To change, do a
+ * read-modify-write of:
+ * CcePcieCtrl.XmtMargin
+ * CcePcieCtrl.XmitMarginOverwriteEnable
+ */
+
+ /* step 5e: disable active state power management (ASPM) */
+ dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl);
+ lnkctl &= ~PCI_EXP_LNKCTL_ASPMC;
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl);
+
+ /*
+ * step 5f: clear DirectSpeedChange
+ * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+ * change in the speed target from starting before we are ready.
+ * This field defaults to 0 and we are not changing it, so nothing
+ * needs to be done.
+ */
+
+ /* step 5g: Set target link speed */
+ /*
+ * Set target link speed to be target on both device and parent.
+ * On setting the parent: Some system BIOSs "helpfully" set the
+ * parent target speed to Gen2 to match the ASIC's initial speed.
+ * We can set the target Gen3 because we have already checked
+ * that it is Gen3 capable earlier.
+ */
+ dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+ parent = dd->pcidev->bus->self;
+ pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+ dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ /* only write to parent if target is not as high as ours */
+ if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+ lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+ lnkctl2 |= target_vector;
+ dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+ } else {
+ dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+ }
+
+ dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+ dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+ lnkctl2 |= target_vector;
+ dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+ /* step 5h: arm gasket logic */
+ /* hold DC in reset across the SBR */
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+ (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+ /* save firmware control across the SBR */
+ fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+ dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+ arm_gasket_logic(dd);
+
+ /*
+ * step 6: quiesce PCIe link
+ * The chip has already been reset, so there will be no traffic
+ * from the chip. Linux has no easy way to enforce that it will
+ * not try to access the device, so we just need to hope it doesn't
+ * do it while we are doing the reset.
+ */
+
+ /*
+ * step 7: initiate the secondary bus reset (SBR)
+ * step 8: hardware brings the links back up
+ * step 9: wait for link speed transition to be complete
+ */
+ dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+ ret = trigger_sbr(dd);
+ if (ret)
+ goto done;
+
+ /* step 10: decide what to do next */
+
+ /* check if we can read PCI space */
+ ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+ if (ret) {
+ dd_dev_info(dd,
+ "%s: read of VendorID failed after SBR, err %d\n",
+ __func__, ret);
+ return_error = 1;
+ goto done;
+ }
+ if (vendor == 0xffff) {
+ dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+ return_error = 1;
+ ret = -EIO;
+ goto done;
+ }
+
+ /* restore PCI space registers we know were reset */
+ dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+ restore_pci_variables(dd);
+ /* restore firmware control */
+ write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+ /*
+ * Check the gasket block status.
+ *
+ * This is the first CSR read after the SBR. If the read returns
+ * all 1s (fails), the link did not make it back.
+ *
+ * Once we're sure we can read and write, clear the DC reset after
+ * the SBR. Then check for any per-lane errors. Then look over
+ * the status.
+ */
+ reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+ dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+ if (reg == ~0ull) { /* PCIe read failed/timeout */
+ dd_dev_err(dd, "SBR failed - unable to read from device\n");
+ return_error = 1;
+ ret = -ENOSYS;
+ goto done;
+ }
+
+ /* clear the DC reset */
+ write_csr(dd, CCE_DC_CTRL, 0);
+ /* Set the LED off */
+ if (is_a0(dd))
+ setextled(dd, 0);
+
+ /* check for any per-lane errors */
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+ dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+ /* extract status, look for our HFI */
+ status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+ & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+ if ((status & (1 << dd->hfi1_id)) == 0) {
+ dd_dev_err(dd,
+ "%s: gasket status 0x%x, expecting 0x%x\n",
+ __func__, status, 1 << dd->hfi1_id);
+ ret = -EIO;
+ goto done;
+ }
+
+ /* extract error */
+ err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+ & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+ if (err) {
+ dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+ ret = -EIO;
+ goto done;
+ }
+
+ /* update our link information cache */
+ update_lbus_info(dd);
+ dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+ dd->lbus_info);
+
+ if (dd->lbus_speed != target_speed) { /* not target */
+ /* maybe retry */
+ do_retry = retry_count < pcie_retry;
+ dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+ pcie_target, do_retry ? ", retrying" : "");
+ retry_count++;
+ if (do_retry) {
+ msleep(100); /* allow time to settle */
+ goto retry;
+ }
+ ret = -EIO;
+ }
+
+done:
+ if (therm) {
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+ msleep(100);
+ dd_dev_info(dd, "%s: Re-enable therm polling\n",
+ __func__);
+ }
+ release_hw_mutex(dd);
+done_no_mutex:
+ /* return no error if it is OK to be at current speed */
+ if (ret && !return_error) {
+ dd_dev_err(dd, "Proceeding at current speed PCIe speed\n");
+ ret = 0;
+ }
+
+ dd_dev_info(dd, "%s: done\n", __func__);
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
new file mode 100644
index 000000000000..9991814a8f05
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.c
@@ -0,0 +1,1771 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name
+/*
+ * Send Context functions
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Set the CM reset bit and wait for it to clear. Use the provided
+ * sendctrl register. This routine has no locking.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+ write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+ while (1) {
+ udelay(1);
+ sendctrl = read_csr(dd, SEND_CTRL);
+ if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
+ break;
+ }
+}
+
+/* defined in header release 48 and higher */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+ << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/* global control of PIO send */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+ u64 reg, mask;
+ unsigned long flags;
+ int write = 1; /* write sendctrl back */
+ int flush = 0; /* re-read sendctrl to make sure it is flushed */
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+ reg = read_csr(dd, SEND_CTRL);
+ switch (op) {
+ case PSC_GLOBAL_ENABLE:
+ reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+ /* Fall through */
+ case PSC_DATA_VL_ENABLE:
+ /* Disallow sending on VLs not enabled */
+ mask = (((~0ull)<<num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK)<<
+ SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+ reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+ break;
+ case PSC_GLOBAL_DISABLE:
+ reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+ break;
+ case PSC_GLOBAL_VLARB_ENABLE:
+ reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+ break;
+ case PSC_GLOBAL_VLARB_DISABLE:
+ reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+ break;
+ case PSC_CM_RESET:
+ __cm_reset(dd, reg);
+ write = 0; /* CSR already written (and flushed) */
+ break;
+ case PSC_DATA_VL_DISABLE:
+ reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
+ flush = 1;
+ break;
+ default:
+ dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+ break;
+ }
+
+ if (write) {
+ write_csr(dd, SEND_CTRL, reg);
+ if (flush)
+ (void) read_csr(dd, SEND_CTRL); /* flush write */
+ }
+
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/* Send Context Size (SCS) wildcards */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+/* Send Context Count (SCC) wildcards */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU -2
+
+#define SCC_PER_KRCVQ -3
+#define SCC_ACK_CREDITS 32
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/* default send context sizes */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+ [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */
+ .count = SCC_PER_VL },/* one per NUMA */
+ [SC_ACK] = { .size = SCC_ACK_CREDITS,
+ .count = SCC_PER_KRCVQ },
+ [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */
+ .count = SCC_PER_CPU }, /* one per CPU */
+
+};
+
+/* send context memory pool configuration */
+struct mem_pool_config {
+ int centipercent; /* % of memory, in 100ths of 1% */
+ int absolute_blocks; /* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+ /* centi%, abs blocks */
+ { 10000, -1 }, /* pool 0 */
+ { 0, -1 }, /* pool 1 */
+};
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+ int centipercent; /* 100th of 1% of memory to use, -1 if blocks
+ already set */
+ int count; /* count of contexts in the pool */
+ int blocks; /* block size of the pool */
+ int size; /* context size, in blocks */
+};
+
+/*
+ * Convert a pool wildcard to a valid pool index. The wildcards
+ * start at -1 and increase negatively. Map them as:
+ * -1 => 0
+ * -2 => 1
+ * etc.
+ *
+ * Return -1 on non-wildcard input, otherwise convert to a pool number.
+ */
+static int wildcard_to_pool(int wc)
+{
+ if (wc >= 0)
+ return -1; /* non-wildcard */
+ return -wc - 1;
+}
+
+static const char *sc_type_names[SC_MAX] = {
+ "kernel",
+ "ack",
+ "user"
+};
+
+static const char *sc_type_name(int index)
+{
+ if (index < 0 || index >= SC_MAX)
+ return "unknown";
+ return sc_type_names[index];
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration. Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+ struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+ int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+ int total_contexts = 0;
+ int fixed_blocks;
+ int pool_blocks;
+ int used_blocks;
+ int cp_total; /* centipercent total */
+ int ab_total; /* absolute block total */
+ int extra;
+ int i;
+
+ /*
+ * Step 0:
+ * - copy the centipercents/absolute sizes from the pool config
+ * - sanity check these values
+ * - add up centipercents, then later check for full value
+ * - add up absolute blocks, then later check for over-commit
+ */
+ cp_total = 0;
+ ab_total = 0;
+ for (i = 0; i < NUM_SC_POOLS; i++) {
+ int cp = sc_mem_pool_config[i].centipercent;
+ int ab = sc_mem_pool_config[i].absolute_blocks;
+
+ /*
+ * A negative value is "unused" or "invalid". Both *can*
+ * be valid, but centipercent wins, so check that first
+ */
+ if (cp >= 0) { /* centipercent valid */
+ cp_total += cp;
+ } else if (ab >= 0) { /* absolute blocks valid */
+ ab_total += ab;
+ } else { /* neither valid */
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d: both the block count and centipercent are invalid\n",
+ i);
+ return -EINVAL;
+ }
+
+ mem_pool_info[i].centipercent = cp;
+ mem_pool_info[i].blocks = ab;
+ }
+
+ /* do not use both % and absolute blocks for different pools */
+ if (cp_total != 0 && ab_total != 0) {
+ dd_dev_err(
+ dd,
+ "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+ return -EINVAL;
+ }
+
+ /* if any percentages are present, they must add up to 100% x 100 */
+ if (cp_total != 0 && cp_total != 10000) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool centipercent is %d, expecting 10000\n",
+ cp_total);
+ return -EINVAL;
+ }
+
+ /* the absolute pool total cannot be more than the mem total */
+ if (ab_total > total_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool absolute block count %d is larger than the memory size %d\n",
+ ab_total, total_blocks);
+ return -EINVAL;
+ }
+
+ /*
+ * Step 2:
+ * - copy from the context size config
+ * - replace context type wildcard counts with real values
+ * - add up non-memory pool block sizes
+ * - add up memory pool user counts
+ */
+ fixed_blocks = 0;
+ for (i = 0; i < SC_MAX; i++) {
+ int count = sc_config_sizes[i].count;
+ int size = sc_config_sizes[i].size;
+ int pool;
+
+ /*
+ * Sanity check count: Either a positive value or
+ * one of the expected wildcards is valid. The positive
+ * value is checked later when we compare against total
+ * memory available.
+ */
+ if (i == SC_ACK) {
+ count = dd->n_krcv_queues;
+ } else if (i == SC_KERNEL) {
+ count = num_vls + 1 /* VL15 */;
+ } else if (count == SCC_PER_CPU) {
+ count = dd->num_rcv_contexts - dd->n_krcv_queues;
+ } else if (count < 0) {
+ dd_dev_err(
+ dd,
+ "%s send context invalid count wildcard %d\n",
+ sc_type_name(i), count);
+ return -EINVAL;
+ }
+ if (total_contexts + count > dd->chip_send_contexts)
+ count = dd->chip_send_contexts - total_contexts;
+
+ total_contexts += count;
+
+ /*
+ * Sanity check pool: The conversion will return a pool
+ * number or -1 if a fixed (non-negative) value. The fixed
+ * value is checked later when we compare against
+ * total memory available.
+ */
+ pool = wildcard_to_pool(size);
+ if (pool == -1) { /* non-wildcard */
+ fixed_blocks += size * count;
+ } else if (pool < NUM_SC_POOLS) { /* valid wildcard */
+ mem_pool_info[pool].count += count;
+ } else { /* invalid wildcard */
+ dd_dev_err(
+ dd,
+ "%s send context invalid pool wildcard %d\n",
+ sc_type_name(i), size);
+ return -EINVAL;
+ }
+
+ dd->sc_sizes[i].count = count;
+ dd->sc_sizes[i].size = size;
+ }
+ if (fixed_blocks > total_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context fixed block count, %u, larger than total block count %u\n",
+ fixed_blocks, total_blocks);
+ return -EINVAL;
+ }
+
+ /* step 3: calculate the blocks in the pools, and pool context sizes */
+ pool_blocks = total_blocks - fixed_blocks;
+ if (ab_total > pool_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context fixed pool sizes, %u, larger than pool block count %u\n",
+ ab_total, pool_blocks);
+ return -EINVAL;
+ }
+ /* subtract off the fixed pool blocks */
+ pool_blocks -= ab_total;
+
+ for (i = 0; i < NUM_SC_POOLS; i++) {
+ struct mem_pool_info *pi = &mem_pool_info[i];
+
+ /* % beats absolute blocks */
+ if (pi->centipercent >= 0)
+ pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+ if (pi->blocks == 0 && pi->count != 0) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d has %u contexts, but no blocks\n",
+ i, pi->count);
+ return -EINVAL;
+ }
+ if (pi->count == 0) {
+ /* warn about wasted blocks */
+ if (pi->blocks != 0)
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d has %u blocks, but zero contexts\n",
+ i, pi->blocks);
+ pi->size = 0;
+ } else {
+ pi->size = pi->blocks / pi->count;
+ }
+ }
+
+ /* step 4: fill in the context type sizes from the pool sizes */
+ used_blocks = 0;
+ for (i = 0; i < SC_MAX; i++) {
+ if (dd->sc_sizes[i].size < 0) {
+ unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+ WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+ dd->sc_sizes[i].size = mem_pool_info[pool].size;
+ }
+ /* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+ if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+ dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+ /* calculate our total usage */
+ used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+ }
+ extra = total_blocks - used_blocks;
+ if (extra != 0)
+ dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+ return total_contexts;
+}
+
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+ u16 base;
+ int ret, i, j, context;
+
+ ret = init_credit_return(dd);
+ if (ret)
+ return ret;
+
+ dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+ GFP_KERNEL);
+ dd->send_contexts = kcalloc(dd->num_send_contexts,
+ sizeof(struct send_context_info),
+ GFP_KERNEL);
+ if (!dd->send_contexts || !dd->hw_to_sw) {
+ dd_dev_err(dd, "Unable to allocate send context arrays\n");
+ kfree(dd->hw_to_sw);
+ kfree(dd->send_contexts);
+ free_credit_return(dd);
+ return -ENOMEM;
+ }
+
+ /* hardware context map starts with invalid send context indices */
+ for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+ dd->hw_to_sw[i] = INVALID_SCI;
+
+ /*
+ * All send contexts have their credit sizes. Allocate credits
+ * for each context one after another from the global space.
+ */
+ context = 0;
+ base = 1;
+ for (i = 0; i < SC_MAX; i++) {
+ struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+ for (j = 0; j < scs->count; j++) {
+ struct send_context_info *sci =
+ &dd->send_contexts[context];
+ sci->type = i;
+ sci->base = base;
+ sci->credits = scs->size;
+
+ context++;
+ base += scs->size;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a software index and hardware context of the given type.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+ u32 *hw_context)
+{
+ struct send_context_info *sci;
+ u32 index;
+ u32 context;
+
+ for (index = 0, sci = &dd->send_contexts[0];
+ index < dd->num_send_contexts; index++, sci++) {
+ if (sci->type == type && sci->allocated == 0) {
+ sci->allocated = 1;
+ /* use a 1:1 mapping, but make them non-equal */
+ context = dd->chip_send_contexts - index - 1;
+ dd->hw_to_sw[context] = index;
+ *sw_index = index;
+ *hw_context = context;
+ return 0; /* success */
+ }
+ }
+ dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+ return -ENOSPC;
+}
+
+/*
+ * Free the send context given by its software index.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+ struct send_context_info *sci;
+
+ sci = &dd->send_contexts[sw_index];
+ if (!sci->allocated) {
+ dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+ __func__, sw_index, hw_context);
+ }
+ sci->allocated = 0;
+ dd->hw_to_sw[hw_context] = INVALID_SCI;
+}
+
+/* return the base context of a context in a group */
+static inline u32 group_context(u32 context, u32 group)
+{
+ return (context >> group) << group;
+}
+
+/* return the size of a group */
+static inline u32 group_size(u32 group)
+{
+ return 1 << group;
+}
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return. One for each physical
+ * send context, per NUMA.
+ * o Each send context always looks in its relative location in a struct
+ * credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ * with the same value. Use the address of the first send context in the
+ * group.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+ u32 gc = group_context(sc->hw_context, sc->group);
+ u32 index = sc->hw_context & 0x7;
+
+ sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+ *pa = (unsigned long)
+ &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue function triggered in error interrupt routine for
+ * kernel contexts.
+ */
+static void sc_halted(struct work_struct *work)
+{
+ struct send_context *sc;
+
+ sc = container_of(work, struct send_context, halt_work);
+ sc_restart(sc);
+}
+
+/*
+ * Calculate PIO block threshold for this send context using the given MTU.
+ * Trigger a return when one MTU plus optional header of credits remain.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+ u32 release_credits;
+ u32 threshold;
+
+ /* add in the header size, then divide by the PIO block size */
+ mtu += hdrqentsize << 2;
+ release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
+
+ /* check against this context's credits */
+ if (sc->credits <= release_credits)
+ threshold = 1;
+ else
+ threshold = sc->credits - release_credits;
+
+ return threshold;
+}
+
+/*
+ * Calculate credit threshold in terms of percent of the allocated credits.
+ * Trigger when unreturned credits equal or exceed the percentage of the whole.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+ return (sc->credits * percent) / 100;
+}
+
+/*
+ * Set the credit return threshold.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+ unsigned long flags;
+ u32 old_threshold;
+ int force_return = 0;
+
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+
+ old_threshold = (sc->credit_ctrl >>
+ SC(CREDIT_CTRL_THRESHOLD_SHIFT))
+ & SC(CREDIT_CTRL_THRESHOLD_MASK);
+
+ if (new_threshold != old_threshold) {
+ sc->credit_ctrl =
+ (sc->credit_ctrl
+ & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
+ | ((new_threshold
+ & SC(CREDIT_CTRL_THRESHOLD_MASK))
+ << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+
+ /* force a credit return on change to avoid a possible stall */
+ force_return = 1;
+ }
+
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+
+ if (force_return)
+ sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for the send context 'sc'.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg = 0;
+ u32 hw_context = sc->hw_context;
+ int type = sc->type;
+
+ /*
+ * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
+ * we're snooping.
+ */
+ if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+ dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
+ reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
+
+ write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+ uint hdrqentsize, int numa)
+{
+ struct send_context_info *sci;
+ struct send_context *sc;
+ dma_addr_t pa;
+ unsigned long flags;
+ u64 reg;
+ u32 thresh;
+ u32 sw_index;
+ u32 hw_context;
+ int ret;
+ u8 opval, opmask;
+
+ /* do not allocate while frozen */
+ if (dd->flags & HFI1_FROZEN)
+ return NULL;
+
+ sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa);
+ if (!sc) {
+ dd_dev_err(dd, "Cannot allocate send context structure\n");
+ return NULL;
+ }
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+ if (ret) {
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+ kfree(sc);
+ return NULL;
+ }
+
+ sci = &dd->send_contexts[sw_index];
+ sci->sc = sc;
+
+ sc->dd = dd;
+ sc->node = numa;
+ sc->type = type;
+ spin_lock_init(&sc->alloc_lock);
+ spin_lock_init(&sc->release_lock);
+ spin_lock_init(&sc->credit_ctrl_lock);
+ INIT_LIST_HEAD(&sc->piowait);
+ INIT_WORK(&sc->halt_work, sc_halted);
+ atomic_set(&sc->buffers_allocated, 0);
+ init_waitqueue_head(&sc->halt_wait);
+
+ /* grouping is always single context for now */
+ sc->group = 0;
+
+ sc->sw_index = sw_index;
+ sc->hw_context = hw_context;
+ cr_group_addresses(sc, &pa);
+ sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+ sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+ << PIO_ADDR_CONTEXT_SHIFT);
+
+ /* set base and credits */
+ reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+ << SC(CTRL_CTXT_DEPTH_SHIFT))
+ | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+ << SC(CTRL_CTXT_BASE_SHIFT));
+ write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+ set_pio_integrity(sc);
+
+ /* unmask all errors */
+ write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+ /* set the default partition key */
+ write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+ (DEFAULT_PKEY &
+ SC(CHECK_PARTITION_KEY_VALUE_MASK))
+ << SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+ /* per context type checks */
+ if (type == SC_USER) {
+ opval = USER_OPCODE_CHECK_VAL;
+ opmask = USER_OPCODE_CHECK_MASK;
+ } else {
+ opval = OPCODE_CHECK_VAL_DISABLED;
+ opmask = OPCODE_CHECK_MASK_DISABLED;
+ }
+
+ /* set the send context check opcode mask and value */
+ write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+ ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+ ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+ /* set up credit return */
+ reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+ /*
+ * Calculate the initial credit return threshold.
+ *
+ * For Ack contexts, set a threshold for half the credits.
+ * For User contexts use the given percentage. This has been
+ * sanitized on driver start-up.
+ * For Kernel contexts, use the default MTU plus a header.
+ */
+ if (type == SC_ACK) {
+ thresh = sc_percent_to_threshold(sc, 50);
+ } else if (type == SC_USER) {
+ thresh = sc_percent_to_threshold(sc,
+ user_credit_return_threshold);
+ } else { /* kernel */
+ thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+ }
+ reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+ /* add in early return */
+ if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+ reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+ else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+ reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+ /* set up write-through credit_ctrl */
+ sc->credit_ctrl = reg;
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+ /* User send contexts should not allow sending on VL15 */
+ if (type == SC_USER) {
+ reg = 1ULL << 15;
+ write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+ }
+
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+ /*
+ * Allocate shadow ring to track outstanding PIO buffers _after_
+ * unlocking. We don't know the size until the lock is held and
+ * we can't allocate while the lock is held. No one is using
+ * the context yet, so allocate it now.
+ *
+ * User contexts do not get a shadow ring.
+ */
+ if (type != SC_USER) {
+ /*
+ * Size the shadow ring 1 larger than the number of credits
+ * so head == tail can mean empty.
+ */
+ sc->sr_size = sci->credits + 1;
+ sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+ sc->sr_size, GFP_KERNEL, numa);
+ if (!sc->sr) {
+ dd_dev_err(dd,
+ "Cannot allocate send context shadow ring structure\n");
+ sc_free(sc);
+ return NULL;
+ }
+ }
+
+ dd_dev_info(dd,
+ "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+ sw_index,
+ hw_context,
+ sc_type_name(type),
+ sc->group,
+ sc->credits,
+ sc->credit_ctrl,
+ thresh);
+
+ return sc;
+}
+
+/* free a per-NUMA send context structure */
+void sc_free(struct send_context *sc)
+{
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ u32 sw_index;
+ u32 hw_context;
+
+ if (!sc)
+ return;
+
+ sc->flags |= SCF_IN_FREE; /* ensure no restarts */
+ dd = sc->dd;
+ if (!list_empty(&sc->piowait))
+ dd_dev_err(dd, "piowait list not empty!\n");
+ sw_index = sc->sw_index;
+ hw_context = sc->hw_context;
+ sc_disable(sc); /* make sure the HW is disabled */
+ flush_work(&sc->halt_work);
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ dd->send_contexts[sw_index].sc = NULL;
+
+ /* clear/disable all registers set in sc_alloc */
+ write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+ write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+ /* release the index and context for re-use */
+ sc_hw_free(dd, sw_index, hw_context);
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+ kfree(sc->sr);
+ kfree(sc);
+}
+
+/* disable the context */
+void sc_disable(struct send_context *sc)
+{
+ u64 reg;
+ unsigned long flags;
+ struct pio_buf *pbuf;
+
+ if (!sc)
+ return;
+
+ /* do all steps, even if already disabled */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+ reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+ sc->flags &= ~SCF_ENABLED;
+ sc_wait_for_packet_egress(sc, 1);
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ /*
+ * Flush any waiters. Once the context is disabled,
+ * credit return interrupts are stopped (although there
+ * could be one in-process when the context is disabled).
+ * Wait one microsecond for any lingering interrupts, then
+ * proceed with the flush.
+ */
+ udelay(1);
+ spin_lock_irqsave(&sc->release_lock, flags);
+ if (sc->sr) { /* this context has a shadow ring */
+ while (sc->sr_tail != sc->sr_head) {
+ pbuf = &sc->sr[sc->sr_tail].pbuf;
+ if (pbuf->cb)
+ (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+ sc->sr_tail++;
+ if (sc->sr_tail >= sc->sr_size)
+ sc->sr_tail = 0;
+ }
+ }
+ spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* return SendEgressCtxtStatus.PacketOccupancy */
+#define packet_occupancy(r) \
+ (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+ >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+ ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packet egress, optionally pause for credit return */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg;
+ u32 loop = 0;
+
+ while (1) {
+ reg = read_csr(dd, sc->hw_context * 8 +
+ SEND_EGRESS_CTXT_STATUS);
+ /* done if egress is stopped */
+ if (egress_halted(reg))
+ break;
+ reg = packet_occupancy(reg);
+ if (reg == 0)
+ break;
+ if (loop > 100) {
+ dd_dev_err(dd,
+ "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n",
+ __func__, sc->sw_index,
+ sc->hw_context, (u32)reg);
+ break;
+ }
+ loop++;
+ udelay(1);
+ }
+
+ if (pause)
+ /* Add additional delay to ensure chip returns all credits */
+ pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ struct send_context *sc = dd->send_contexts[i].sc;
+
+ if (!sc)
+ continue;
+ sc_wait_for_packet_egress(sc, 0);
+ }
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * If the first step fails - wait for the halt to be asserted, return early.
+ * Otherwise complain about timeouts but keep going.
+ *
+ * It is expected that allocations (enabled flag bit) have been shut off
+ * already (only applies to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg;
+ u32 loop;
+ int count;
+
+ /* bounce off if not halted, or being free'd */
+ if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+ return -EINVAL;
+
+ dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+ sc->hw_context);
+
+ /*
+ * Step 1: Wait for the context to actually halt.
+ *
+ * The error interrupt is asynchronous to actually setting halt
+ * on the context.
+ */
+ loop = 0;
+ while (1) {
+ reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+ if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+ break;
+ if (loop > 100) {
+ dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+ __func__, sc->sw_index, sc->hw_context);
+ return -ETIME;
+ }
+ loop++;
+ udelay(1);
+ }
+
+ /*
+ * Step 2: Ensure no users are still trying to write to PIO.
+ *
+ * For kernel contexts, we have already turned off buffer allocation.
+ * Now wait for the buffer count to go to zero.
+ *
+ * For user contexts, the user handling code has cut off write access
+ * to the context's PIO pages before calling this routine and will
+ * restore write access after this routine returns.
+ */
+ if (sc->type != SC_USER) {
+ /* kernel context */
+ loop = 0;
+ while (1) {
+ count = atomic_read(&sc->buffers_allocated);
+ if (count == 0)
+ break;
+ if (loop > 100) {
+ dd_dev_err(dd,
+ "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+ __func__, sc->sw_index,
+ sc->hw_context, count);
+ }
+ loop++;
+ udelay(1);
+ }
+ }
+
+ /*
+ * Step 3: Wait for all packets to egress.
+ * This is done while disabling the send context
+ *
+ * Step 4: Disable the context
+ *
+ * This is a superset of the halt. After the disable, the
+ * errors can be cleared.
+ */
+ sc_disable(sc);
+
+ /*
+ * Step 5: Enable the context
+ *
+ * This enable will clear the halted flag and per-send context
+ * error flags.
+ */
+ return sc_enable(sc);
+}
+
+/*
+ * PIO freeze processing. To be called after the TXE block is fully frozen.
+ * Go through all frozen send contexts and disable them. The contexts are
+ * already stopped by the freeze.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+ struct send_context *sc;
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ /*
+ * Don't disable unallocated, unfrozen, or user send contexts.
+ * User send contexts will be disabled when the process
+ * calls into the driver to reset its context.
+ */
+ if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+ continue;
+
+ /* only need to disable, the context is already stopped */
+ sc_disable(sc);
+ }
+}
+
+/*
+ * Unfreeze PIO for kernel send contexts. The precondition for calling this
+ * is that all PIO send contexts have been disabled and the SPC freeze has
+ * been cleared. Now perform the last step and re-enable each kernel context.
+ * User (PSM) processing will occur when PSM calls into the kernel to
+ * acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+ struct send_context *sc;
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+ continue;
+
+ sc_enable(sc); /* will clear the sc frozen flag */
+ }
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ * -ETIMEDOUT - if we wait too long
+ * -EIO - if there was an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ int max, count = 0;
+
+ /* max is the longest possible HW init time / delay */
+ max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+ while (1) {
+ reg = read_csr(dd, SEND_PIO_INIT_CTXT);
+ if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+ break;
+ if (count >= max)
+ return -ETIMEDOUT;
+ udelay(5);
+ count++;
+ }
+
+ return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
+}
+
+/*
+ * Reset all of the send contexts to their power-on state. Used
+ * only during manual init - no lock against sc_enable needed.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ /* make sure the init engine is not busy */
+ ret = pio_init_wait_progress(dd);
+ /* ignore any timeout */
+ if (ret == -EIO) {
+ /* clear the error */
+ write_csr(dd, SEND_PIO_ERR_CLEAR,
+ SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+ }
+
+ /* reset init all */
+ write_csr(dd, SEND_PIO_INIT_CTXT,
+ SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+ udelay(2);
+ ret = pio_init_wait_progress(dd);
+ if (ret < 0) {
+ dd_dev_err(dd,
+ "PIO send context init %s while initializing all PIO blocks\n",
+ ret == -ETIMEDOUT ? "is stuck" : "had an error");
+ }
+}
+
+/* enable the context */
+int sc_enable(struct send_context *sc)
+{
+ u64 sc_ctrl, reg, pio;
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!sc)
+ return -EINVAL;
+ dd = sc->dd;
+
+ /*
+ * Obtain the allocator lock to guard against any allocation
+ * attempts (which should not happen prior to context being
+ * enabled). On the release/disable side we don't need to
+ * worry about locking since the releaser will not do anything
+ * if the context accounting values have not changed.
+ */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+ if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+ goto unlock; /* already enabled */
+
+ /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+ *sc->hw_free = 0;
+ sc->free = 0;
+ sc->alloc_free = 0;
+ sc->fill = 0;
+ sc->sr_head = 0;
+ sc->sr_tail = 0;
+ sc->flags = 0;
+ atomic_set(&sc->buffers_allocated, 0);
+
+ /*
+ * Clear all per-context errors. Some of these will be set when
+ * we are re-enabling after a context halt. Now that the context
+ * is disabled, the halt will not clear until after the PIO init
+ * engine runs below.
+ */
+ reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+ if (reg)
+ write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR),
+ reg);
+
+ /*
+ * The HW PIO initialization engine can handle only one init
+ * request at a time. Serialize access to each device's engine.
+ */
+ spin_lock(&dd->sc_init_lock);
+ /*
+ * Since access to this code block is serialized and
+ * each access waits for the initialization to complete
+ * before releasing the lock, the PIO initialization engine
+ * should not be in use, so we don't have to wait for the
+ * InProgress bit to go down.
+ */
+ pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+ SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+ SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+ write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+ /*
+ * Wait until the engine is done. Give the chip the required time
+ * so, hopefully, we read the register just once.
+ */
+ udelay(2);
+ ret = pio_init_wait_progress(dd);
+ spin_unlock(&dd->sc_init_lock);
+ if (ret) {
+ dd_dev_err(dd,
+ "sctxt%u(%u): Context not enabled due to init failure %d\n",
+ sc->sw_index, sc->hw_context, ret);
+ goto unlock;
+ }
+
+ /*
+ * All is well. Enable the context.
+ */
+ sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+ write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+ /*
+ * Read SendCtxtCtrl to force the write out and prevent a timing
+ * hazard where a PIO write may reach the context before the enable.
+ */
+ read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+ sc->flags |= SCF_ENABLED;
+
+unlock:
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ return ret;
+}
+
+/* force a credit return on the context */
+void sc_return_credits(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ /* a 0->1 transition schedules a credit return */
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
+ SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+ /*
+ * Ensure that the write is flushed and the credit return is
+ * scheduled. We care more about the 0 -> 1 transition.
+ */
+ read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
+ /* set back to 0 for next time */
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* allow all in-flight packets to drain on the context */
+void sc_flush(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ sc_wait_for_packet_egress(sc, 1);
+}
+
+/* drop all packets on the context, no waiting until they are sent */
+void sc_drop(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+ __func__, sc->sw_index, sc->hw_context);
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ * - mark the context as halted or frozen
+ * - stop buffer allocations
+ *
+ * Called from the error interrupt. Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+ unsigned long flags;
+
+ /* mark the context */
+ sc->flags |= flag;
+
+ /* stop buffer allocations */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc->flags &= ~SCF_ENABLED;
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ wake_up(&sc->halt_wait);
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32))
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @len: length of whole packet - including PBC - in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+ pio_release_cb cb, void *arg)
+{
+ struct pio_buf *pbuf = NULL;
+ unsigned long flags;
+ unsigned long avail;
+ unsigned long blocks = dwords_to_blocks(dw_len);
+ unsigned long start_fill;
+ int trycount = 0;
+ u32 head, next;
+
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ if (!(sc->flags & SCF_ENABLED)) {
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ goto done;
+ }
+
+retry:
+ avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+ if (blocks > avail) {
+ /* not enough room */
+ if (unlikely(trycount)) { /* already tried to get more room */
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ goto done;
+ }
+ /* copy from receiver cache line and recalculate */
+ sc->alloc_free = ACCESS_ONCE(sc->free);
+ avail =
+ (unsigned long)sc->credits -
+ (sc->fill - sc->alloc_free);
+ if (blocks > avail) {
+ /* still no room, actively update */
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ sc_release_update(sc);
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc->alloc_free = ACCESS_ONCE(sc->free);
+ trycount++;
+ goto retry;
+ }
+ }
+
+ /* there is enough room */
+
+ atomic_inc(&sc->buffers_allocated);
+
+ /* read this once */
+ head = sc->sr_head;
+
+ /* "allocate" the buffer */
+ start_fill = sc->fill;
+ sc->fill += blocks;
+
+ /*
+ * Fill the parts that the releaser looks at before moving the head.
+ * The only necessary piece is the sent_at field. The credits
+ * we have just allocated cannot have been returned yet, so the
+ * cb and arg will not be looked at for a "while". Put them
+ * on this side of the memory barrier anyway.
+ */
+ pbuf = &sc->sr[head].pbuf;
+ pbuf->sent_at = sc->fill;
+ pbuf->cb = cb;
+ pbuf->arg = arg;
+ pbuf->sc = sc; /* could be filled in at sc->sr init time */
+ /* make sure this is in memory before updating the head */
+
+ /* calculate next head index, do not store */
+ next = head + 1;
+ if (next >= sc->sr_size)
+ next = 0;
+ /* update the head - must be last! - the releaser can look at fields
+ in pbuf once we move the head */
+ smp_wmb();
+ sc->sr_head = next;
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ /* finish filling in the buffer outside the lock */
+ pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+ * PIO_BLOCK_SIZE);
+ pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+ pbuf->end = sc->base_addr + pbuf->size;
+ pbuf->block_count = blocks;
+ pbuf->qw_written = 0;
+ pbuf->carry_bytes = 0;
+ pbuf->carry.val64 = 0;
+done:
+ return pbuf;
+}
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap. Avoid problems by implementing
+ * a count scheme that is enforced by a lock. The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Start credit return interrupts. This is managed by a count. If already
+ * on, just increment the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+ unsigned long flags;
+
+ /* lock must surround both the count change and the CSR update */
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+ if (sc->credit_intr_count == 0) {
+ sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+ }
+ sc->credit_intr_count++;
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * Stop credit return interrupts. This is managed by a count. Decrement the
+ * count, if the last user, then turn the credit interrupts off.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+ unsigned long flags;
+
+ WARN_ON(sc->credit_intr_count == 0);
+
+ /* lock must surround both the count change and the CSR update */
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+ sc->credit_intr_count--;
+ if (sc->credit_intr_count == 0) {
+ sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+ }
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * The caller must be careful when calling this. All needint calls
+ * must be paired with !needint.
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+ if (needint)
+ sc_add_credit_return_intr(sc);
+ else
+ sc_del_credit_return_intr(sc);
+ trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+ if (needint) {
+ mmiowb();
+ sc_return_credits(sc);
+ }
+}
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ struct list_head *list;
+ struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE];
+ struct hfi1_qp *qp;
+ unsigned long flags;
+ unsigned i, n = 0;
+
+ if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+ return;
+ list = &sc->piowait;
+ /*
+ * Note: checking that the piowait list is empty and clearing
+ * the buffer available interrupt needs to be atomic or we
+ * could end up with QPs on the wait list with the interrupt
+ * disabled.
+ */
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ while (!list_empty(list)) {
+ struct iowait *wait;
+
+ if (n == ARRAY_SIZE(qps))
+ goto full;
+ wait = list_first_entry(list, struct iowait, list);
+ qp = container_of(wait, struct hfi1_qp, s_iowait);
+ list_del_init(&qp->s_iowait.list);
+ /* refcount held until actual wake up */
+ qps[n++] = qp;
+ }
+ /*
+ * Counting: only call wantpiobuf_intr() if there were waiters and they
+ * are now all gone.
+ */
+ if (n)
+ hfi1_sc_wantpiobuf_intr(sc, 0);
+full:
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+ for (i = 0; i < n; i++)
+ hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO);
+}
+
+/* translate a send credit update to a bit code of reasons */
+static inline int fill_code(u64 hw_free)
+{
+ int code = 0;
+
+ if (hw_free & CR_STATUS_SMASK)
+ code |= PRC_STATUS_ERR;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
+ code |= PRC_PBC;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
+ code |= PRC_THRESHOLD;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
+ code |= PRC_FILL_ERR;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
+ code |= PRC_SC_DISABLE;
+ return code;
+}
+
+/* use the jiffies compare to get the wrap right */
+#define sent_before(a, b) time_before(a, b) /* a < b */
+
+/*
+ * The send context buffer "releaser".
+ */
+void sc_release_update(struct send_context *sc)
+{
+ struct pio_buf *pbuf;
+ u64 hw_free;
+ u32 head, tail;
+ unsigned long old_free;
+ unsigned long extra;
+ unsigned long flags;
+ int code;
+
+ if (!sc)
+ return;
+
+ spin_lock_irqsave(&sc->release_lock, flags);
+ /* update free */
+ hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */
+ old_free = sc->free;
+ extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+ - (old_free & CR_COUNTER_MASK))
+ & CR_COUNTER_MASK;
+ sc->free = old_free + extra;
+ trace_hfi1_piofree(sc, extra);
+
+ /* call sent buffer callbacks */
+ code = -1; /* code not yet set */
+ head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */
+ tail = sc->sr_tail;
+ while (head != tail) {
+ pbuf = &sc->sr[tail].pbuf;
+
+ if (sent_before(sc->free, pbuf->sent_at)) {
+ /* not sent yet */
+ break;
+ }
+ if (pbuf->cb) {
+ if (code < 0) /* fill in code on first user */
+ code = fill_code(hw_free);
+ (*pbuf->cb)(pbuf->arg, code);
+ }
+
+ tail++;
+ if (tail >= sc->sr_size)
+ tail = 0;
+ }
+ /* update tail, in case we moved it */
+ sc->sr_tail = tail;
+ spin_unlock_irqrestore(&sc->release_lock, flags);
+ sc_piobufavail(sc);
+}
+
+/*
+ * Send context group releaser. Argument is the send context that caused
+ * the interrupt. Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler. Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+ struct send_context *sc;
+ u32 sw_index;
+ u32 gc, gc_end;
+
+ spin_lock(&dd->sc_lock);
+ sw_index = dd->hw_to_sw[hw_context];
+ if (unlikely(sw_index >= dd->num_send_contexts)) {
+ dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+ __func__, hw_context, sw_index);
+ goto done;
+ }
+ sc = dd->send_contexts[sw_index].sc;
+ if (unlikely(!sc))
+ goto done;
+
+ gc = group_context(hw_context, sc->group);
+ gc_end = gc + group_size(sc->group);
+ for (; gc < gc_end; gc++) {
+ sw_index = dd->hw_to_sw[gc];
+ if (unlikely(sw_index >= dd->num_send_contexts)) {
+ dd_dev_err(dd,
+ "%s: invalid hw (%u) to sw (%u) mapping\n",
+ __func__, hw_context, sw_index);
+ continue;
+ }
+ sc_release_update(dd->send_contexts[sw_index].sc);
+ }
+done:
+ spin_unlock(&dd->sc_lock);
+}
+
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+ int i;
+ u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */
+ u32 ctxt;
+
+ dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+ dd->rcd[0]->rcvhdrqentsize, dd->node);
+ if (!dd->vld[15].sc)
+ goto nomem;
+ hfi1_init_ctxt(dd->vld[15].sc);
+ dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+ for (i = 0; i < num_vls; i++) {
+ /*
+ * Since this function does not deal with a specific
+ * receive context but we need the RcvHdrQ entry size,
+ * use the size from rcd[0]. It is guaranteed to be
+ * valid at this point and will remain the same for all
+ * receive contexts.
+ */
+ dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+ dd->rcd[0]->rcvhdrqentsize, dd->node);
+ if (!dd->vld[i].sc)
+ goto nomem;
+
+ hfi1_init_ctxt(dd->vld[i].sc);
+
+ /* non VL15 start with the max MTU */
+ dd->vld[i].mtu = hfi1_max_mtu;
+ }
+ sc_enable(dd->vld[15].sc);
+ ctxt = dd->vld[15].sc->hw_context;
+ mask = all_vl_mask & ~(1LL << 15);
+ write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+ dd_dev_info(dd,
+ "Using send context %u(%u) for VL15\n",
+ dd->vld[15].sc->sw_index, ctxt);
+ for (i = 0; i < num_vls; i++) {
+ sc_enable(dd->vld[i].sc);
+ ctxt = dd->vld[i].sc->hw_context;
+ mask = all_vl_mask & ~(1LL << i);
+ write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+ }
+ return 0;
+nomem:
+ sc_free(dd->vld[15].sc);
+ for (i = 0; i < num_vls; i++)
+ sc_free(dd->vld[i].sc);
+ return -ENOMEM;
+}
+
+int init_credit_return(struct hfi1_devdata *dd)
+{
+ int ret;
+ int num_numa;
+ int i;
+
+ num_numa = num_online_nodes();
+ /* enforce the expectation that the numas are compact */
+ for (i = 0; i < num_numa; i++) {
+ if (!node_online(i)) {
+ dd_dev_err(dd, "NUMA nodes are not compact\n");
+ ret = -EINVAL;
+ goto done;
+ }
+ }
+
+ dd->cr_base = kcalloc(
+ num_numa,
+ sizeof(struct credit_return_base),
+ GFP_KERNEL);
+ if (!dd->cr_base) {
+ dd_dev_err(dd, "Unable to allocate credit return base\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ for (i = 0; i < num_numa; i++) {
+ int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+ set_dev_node(&dd->pcidev->dev, i);
+ dd->cr_base[i].va = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ bytes,
+ &dd->cr_base[i].pa,
+ GFP_KERNEL);
+ if (dd->cr_base[i].va == NULL) {
+ set_dev_node(&dd->pcidev->dev, dd->node);
+ dd_dev_err(dd,
+ "Unable to allocate credit return DMA range for NUMA %d\n",
+ i);
+ ret = -ENOMEM;
+ goto done;
+ }
+ }
+ set_dev_node(&dd->pcidev->dev, dd->node);
+
+ ret = 0;
+done:
+ return ret;
+}
+
+void free_credit_return(struct hfi1_devdata *dd)
+{
+ int num_numa;
+ int i;
+
+ if (!dd->cr_base)
+ return;
+
+ num_numa = num_online_nodes();
+ for (i = 0; i < num_numa; i++) {
+ if (dd->cr_base[i].va) {
+ dma_free_coherent(&dd->pcidev->dev,
+ TXE_NUM_CONTEXTS
+ * sizeof(struct credit_return),
+ dd->cr_base[i].va,
+ dd->cr_base[i].pa);
+ }
+ }
+ kfree(dd->cr_base);
+ dd->cr_base = NULL;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
new file mode 100644
index 000000000000..0bb885ca3cfb
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -0,0 +1,224 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_ACK 1
+#define SC_USER 2
+#define SC_MAX 3
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/* PIO buffer release callback function */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as there could more than one that apply */
+#define PRC_OK 0 /* no known error */
+#define PRC_STATUS_ERR 0x01 /* credit return due to status error */
+#define PRC_PBC 0x02 /* credit return due to PBC */
+#define PRC_THRESHOLD 0x04 /* credit return due to threshold */
+#define PRC_FILL_ERR 0x08 /* credit return due fill error */
+#define PRC_FORCE 0x10 /* credit return due credit force */
+#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */
+
+/* byte helper */
+union mix {
+ u64 val64;
+ u32 val32[2];
+ u8 val8[8];
+};
+
+/* an allocated PIO buffer */
+struct pio_buf {
+ struct send_context *sc;/* back pointer to owning send context */
+ pio_release_cb cb; /* called when the buffer is released */
+ void *arg; /* argument for cb */
+ void __iomem *start; /* buffer start address */
+ void __iomem *end; /* context end address */
+ unsigned long size; /* context size, in bytes */
+ unsigned long sent_at; /* buffer is sent when <= free */
+ u32 block_count; /* size of buffer, in blocks */
+ u32 qw_written; /* QW written so far */
+ u32 carry_bytes; /* number of valid bytes in carry */
+ union mix carry; /* pending unwritten bytes */
+};
+
+/* cache line aligned pio buffer array */
+union pio_shadow_ring {
+ struct pio_buf pbuf;
+ u64 unused[16]; /* cache line spacer */
+} ____cacheline_aligned;
+
+/* per-NUMA send context */
+struct send_context {
+ /* read-only after init */
+ struct hfi1_devdata *dd; /* device */
+ void __iomem *base_addr; /* start of PIO memory */
+ union pio_shadow_ring *sr; /* shadow ring */
+ volatile __le64 *hw_free; /* HW free counter */
+ struct work_struct halt_work; /* halted context work queue entry */
+ unsigned long flags; /* flags */
+ int node; /* context home node */
+ int type; /* context type */
+ u32 sw_index; /* software index number */
+ u32 hw_context; /* hardware context number */
+ u32 credits; /* number of blocks in context */
+ u32 sr_size; /* size of the shadow ring */
+ u32 group; /* credit return group */
+ /* allocator fields */
+ spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+ unsigned long fill; /* official alloc count */
+ unsigned long alloc_free; /* copy of free (less cache thrash) */
+ u32 sr_head; /* shadow ring head */
+ /* releaser fields */
+ spinlock_t release_lock ____cacheline_aligned_in_smp;
+ unsigned long free; /* official free count */
+ u32 sr_tail; /* shadow ring tail */
+ /* list for PIO waiters */
+ struct list_head piowait ____cacheline_aligned_in_smp;
+ spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+ u64 credit_ctrl; /* cache for credit control */
+ u32 credit_intr_count; /* count of credit intr users */
+ atomic_t buffers_allocated; /* count of buffers allocated */
+ wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED 0x04
+#define SCF_FROZEN 0x08
+
+struct send_context_info {
+ struct send_context *sc; /* allocated working context */
+ u16 allocated; /* has this been allocated? */
+ u16 type; /* context type */
+ u16 base; /* base in PIO array */
+ u16 credits; /* size in PIO array */
+};
+
+/* DMA credit return, index is always (context & 0x7) */
+struct credit_return {
+ volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array */
+struct credit_return_base {
+ struct credit_return *va;
+ dma_addr_t pa;
+};
+
+/* send context configuration sizes (one per type) */
+struct sc_config_sizes {
+ short int size;
+ short int count;
+};
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_credit_return(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+ uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+ pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
+
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
+
+#endif /* _PIO_H */
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
new file mode 100644
index 000000000000..8972bbc02038
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio_copy.c
@@ -0,0 +1,858 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64))
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full BLOCK_SIZE bytes blocks. The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t count)
+{
+ void __iomem *dest = pbuf->start + SOP_DISTANCE;
+ void __iomem *send = dest + PIO_BLOCK_SIZE;
+ void __iomem *dend; /* 8-byte data end */
+
+ /* write the PBC */
+ writeq(pbc, dest);
+ dest += sizeof(u64);
+
+ /* calculate where the QWORD data ends - in SOP=1 space */
+ dend = dest + ((count>>1) * sizeof(u64));
+
+ if (dend < send) {
+ /* all QWORD data is within the SOP block, does *not*
+ reach the end of the SOP block */
+
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /*
+ * No boundary checks are needed here:
+ * 0. We're not on the SOP block boundary
+ * 1. The possible DWORD dangle will still be within
+ * the SOP block
+ * 2. We cannot wrap except on a block boundary.
+ */
+ } else {
+ /* QWORD data extends _to_ or beyond the SOP block */
+
+ /* write 8-byte SOP chunk data */
+ while (dest < send) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /* drop out of the SOP range */
+ dest -= SOP_DISTANCE;
+ dend -= SOP_DISTANCE;
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written, but we will wrap in
+ * case there is a dangling DWORD.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ }
+ /* at this point we have wrapped if we are going to wrap */
+
+ /* write dangling u32, if any */
+ if (count & 1) {
+ union mix val;
+
+ val.val64 = 0;
+ val.val32[0] = *(u32 *)from;
+ writeq(val.val64, dest);
+ dest += sizeof(u64);
+ }
+ /* fill in rest of block, no need to check pbuf->end
+ as we only wrap on a block boundary */
+ while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+ writeq(0, dest);
+ dest += sizeof(u64);
+ }
+
+ /* finished with this buffer */
+ atomic_dec(&pbuf->sc->buffers_allocated);
+}
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the value the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes. Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8-(x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes. Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
+
+/*
+ * Read nbytes bytes from "from" and return them in the LSB bytes
+ * of pbuf->carry. Other bytes are zeroed. Any previous value
+ * pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned
+ * o nbytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+ unsigned int nbytes)
+{
+ unsigned long off;
+
+ if (nbytes == 0) {
+ pbuf->carry.val64 = 0;
+ } else {
+ /* align our pointer */
+ off = (unsigned long)from & 0x7;
+ from = (void *)((unsigned long)from & ~0x7l);
+ pbuf->carry.val64 = ((*(u64 *)from)
+ << zshift(nbytes + off))/* zero upper bytes */
+ >> zshift(nbytes); /* place at bottom */
+ }
+ pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry. Unused bytes are zeroed. It is expected that the extra
+ * read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+ const void *from, unsigned int nbytes)
+{
+ unsigned long off = (unsigned long)from & 0x7;
+ unsigned int room, xbytes;
+
+ /* align our pointer */
+ from = (void *)((unsigned long)from & ~0x7l);
+
+ /* check count first - don't read anything if count is zero */
+ while (nbytes) {
+ /* find the number of bytes in this u64 */
+ room = 8 - off; /* this u64 has room for this many bytes */
+ xbytes = nbytes > room ? room : nbytes;
+
+ /*
+ * shift down to zero lower bytes, shift up to zero upper
+ * bytes, shift back down to move into place
+ */
+ pbuf->carry.val64 |= (((*(u64 *)from)
+ >> mshift(off))
+ << zshift(xbytes))
+ >> zshift(xbytes+pbuf->carry_bytes);
+ off = 0;
+ pbuf->carry_bytes += xbytes;
+ nbytes -= xbytes;
+ from += sizeof(u64);
+ }
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+ unsigned int remaining;
+
+ if (zbytes == 0) /* nothing to do */
+ return;
+
+ remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
+
+ /* NOTE: zshift only guaranteed to work if remaining != 0 */
+ if (remaining)
+ pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
+ >> zshift(remaining);
+ else
+ pbuf->carry.val64 = 0;
+ pbuf->carry_bytes = remaining;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed..
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ */
+static inline void merge_write8(
+ struct pio_buf *pbuf,
+ void __iomem *dest,
+ const void *src)
+{
+ u64 new, temp;
+
+ new = *(u64 *)src;
+ temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+ writeq(temp, dest);
+ pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+ writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry. If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+ if (pbuf->carry_bytes) {
+ /* unused bytes are always kept zeroed, so just write */
+ writeq(pbuf->carry.val64, dest);
+ return 1;
+ }
+
+ return 0;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the value the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+ switch (n) {
+ case 7:
+ *dest++ = *src++;
+ case 6:
+ *dest++ = *src++;
+ case 5:
+ *dest++ = *src++;
+ case 4:
+ *dest++ = *src++;
+ case 3:
+ *dest++ = *src++;
+ case 2:
+ *dest++ = *src++;
+ case 1:
+ *dest++ = *src++;
+ }
+}
+
+/*
+ * Read nbytes from "from" and and place them in the low bytes
+ * of pbuf->carry. Other bytes are left as-is. Any previous
+ * value in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+ unsigned int nbytes)
+{
+ jcopy(&pbuf->carry.val8[0], from, nbytes);
+ pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+ const void *from, unsigned int nbytes)
+{
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+ pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+ pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ */
+static inline void merge_write8(
+ struct pio_buf *pbuf,
+ void *dest,
+ const void *src)
+{
+ u32 remainder = 8 - pbuf->carry_bytes;
+
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+ writeq(pbuf->carry.val64, dest);
+ jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void *dest)
+{
+ writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry. If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void *dest)
+{
+ if (pbuf->carry_bytes) {
+ u64 zero = 0;
+
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+ 8 - pbuf->carry_bytes);
+ writeq(pbuf->carry.val64, dest);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* USE_SHIFTS */
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + SOP_DISTANCE;
+ void __iomem *send = dest + PIO_BLOCK_SIZE;
+ void __iomem *dend; /* 8-byte data end */
+
+ writeq(pbc, dest);
+ dest += sizeof(u64);
+
+ /* calculate where the QWORD data ends - in SOP=1 space */
+ dend = dest + ((nbytes>>3) * sizeof(u64));
+
+ if (dend < send) {
+ /* all QWORD data is within the SOP block, does *not*
+ reach the end of the SOP block */
+
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /*
+ * No boundary checks are needed here:
+ * 0. We're not on the SOP block boundary
+ * 1. The possible DWORD dangle will still be within
+ * the SOP block
+ * 2. We cannot wrap except on a block boundary.
+ */
+ } else {
+ /* QWORD data extends _to_ or beyond the SOP block */
+
+ /* write 8-byte SOP chunk data */
+ while (dest < send) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /* drop out of the SOP range */
+ dest -= SOP_DISTANCE;
+ dend -= SOP_DISTANCE;
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written, but we will wrap in
+ * case there is a dangling DWORD.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ }
+ /* at this point we have wrapped if we are going to wrap */
+
+ /* ...but it doesn't matter as we're done writing */
+
+ /* save dangling bytes, if any */
+ read_low_bytes(pbuf, from, nbytes & 0x7);
+
+ pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+ void __iomem *dend; /* 8-byte data end */
+ unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+ unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+ /* calculate 8-byte data end */
+ dend = dest + (qw_to_write * sizeof(u64));
+
+ if (pbuf->qw_written < PIO_BLOCK_QWS) {
+ /*
+ * Still within SOP block. We don't need to check for
+ * wrap because we are still in the first block and
+ * can only wrap on block boundaries.
+ */
+ void __iomem *send; /* SOP end */
+ void __iomem *xend;
+
+ /* calculate the end of data or end of block, whichever
+ comes first */
+ send = pbuf->start + PIO_BLOCK_SIZE;
+ xend = send < dend ? send : dend;
+
+ /* shift up to SOP=1 space */
+ dest += SOP_DISTANCE;
+ xend += SOP_DISTANCE;
+
+ /* write 8-byte chunk data */
+ while (dest < xend) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* shift down to SOP=0 space */
+ dest -= SOP_DISTANCE;
+ }
+ /*
+ * At this point dest could be (either, both, or neither):
+ * - at dend
+ * - at the wrap
+ */
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If dest is at the wrap, we will fall into the if,
+ * not do the loop, when wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* adjust carry */
+ if (pbuf->carry_bytes < bytes_left) {
+ /* need to read more */
+ read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+ } else {
+ /* remove invalid bytes */
+ zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+ }
+
+ pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+ const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+ void __iomem *dend; /* 8-byte data end */
+
+ /* calculate 8-byte data end */
+ dend = dest + ((nbytes>>3) * sizeof(u64));
+
+ if (pbuf->qw_written < PIO_BLOCK_QWS) {
+ /*
+ * Still within SOP block. We don't need to check for
+ * wrap because we are still in the first block and
+ * can only wrap on block boundaries.
+ */
+ void __iomem *send; /* SOP end */
+ void __iomem *xend;
+
+ /* calculate the end of data or end of block, whichever
+ comes first */
+ send = pbuf->start + PIO_BLOCK_SIZE;
+ xend = send < dend ? send : dend;
+
+ /* shift up to SOP=1 space */
+ dest += SOP_DISTANCE;
+ xend += SOP_DISTANCE;
+
+ /* write 8-byte chunk data */
+ while (dest < xend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* shift down to SOP=0 space */
+ dest -= SOP_DISTANCE;
+ }
+ /*
+ * At this point dest could be (either, both, or neither):
+ * - at dend
+ * - at the wrap
+ */
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If dest is at the wrap, we will fall into the if,
+ * not do the loop, when wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* we know carry_bytes was zero on entry to this routine */
+ read_low_bytes(pbuf, from, nbytes & 0x7);
+
+ pbuf->qw_written += nbytes>>3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle any aligned tail and any aligned source with any byte count.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+ unsigned long from_align = (unsigned long)from & 0x7;
+
+ if (pbuf->carry_bytes + nbytes < 8) {
+ /* not enough bytes to fill a QW */
+ read_extra_bytes(pbuf, from, nbytes);
+ return;
+ }
+
+ if (from_align) {
+ /* misaligned source pointer - align it */
+ unsigned long to_align;
+
+ /* bytes to read to align "from" */
+ to_align = 8 - from_align;
+
+ /*
+ * In the advance-to-alignment logic below, we do not need
+ * to check if we are using more than nbytes. This is because
+ * if we are here, we already know that carry+nbytes will
+ * fill at least one QW.
+ */
+ if (pbuf->carry_bytes + to_align < 8) {
+ /* not enough align bytes to fill a QW */
+ read_extra_bytes(pbuf, from, to_align);
+ from += to_align;
+ nbytes -= to_align;
+ } else {
+ /* bytes to fill carry */
+ unsigned long to_fill = 8 - pbuf->carry_bytes;
+ /* bytes left over to be read */
+ unsigned long extra = to_align - to_fill;
+ void __iomem *dest;
+
+ /* fill carry... */
+ read_extra_bytes(pbuf, from, to_fill);
+ from += to_fill;
+ nbytes -= to_fill;
+
+ /* ...now write carry */
+ dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+ /*
+ * The two checks immediately below cannot both be
+ * true, hence the else. If we have wrapped, we
+ * cannot still be within the first block.
+ * Conversely, if we are still in the first block, we
+ * cannot have wrapped. We do the wrap check first
+ * as that is more likely.
+ */
+ /* adjust if we've wrapped */
+ if (dest >= pbuf->end)
+ dest -= pbuf->size;
+ /* jump to SOP range if within the first block */
+ else if (pbuf->qw_written < PIO_BLOCK_QWS)
+ dest += SOP_DISTANCE;
+
+ carry8_write8(pbuf->carry, dest);
+ pbuf->qw_written++;
+
+ /* read any extra bytes to do final alignment */
+ /* this will overwrite anything in pbuf->carry */
+ read_low_bytes(pbuf, from, extra);
+ from += extra;
+ nbytes -= extra;
+ }
+
+ /* at this point, from is QW aligned */
+ }
+
+ if (pbuf->carry_bytes)
+ mid_copy_mix(pbuf, from, nbytes);
+ else
+ mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+ /*
+ * The two checks immediately below cannot both be true, hence the
+ * else. If we have wrapped, we cannot still be within the first
+ * block. Conversely, if we are still in the first block, we
+ * cannot have wrapped. We do the wrap check first as that is
+ * more likely.
+ */
+ /* adjust if we have wrapped */
+ if (dest >= pbuf->end)
+ dest -= pbuf->size;
+ /* jump to the SOP range if within the first block */
+ else if (pbuf->qw_written < PIO_BLOCK_QWS)
+ dest += SOP_DISTANCE;
+
+ /* write final bytes, if any */
+ if (carry_write8(pbuf, dest)) {
+ dest += sizeof(u64);
+ /*
+ * NOTE: We do not need to recalculate whether dest needs
+ * SOP_DISTANCE or not.
+ *
+ * If we are in the first block and the dangle write
+ * keeps us in the same block, dest will need
+ * to retain SOP_DISTANCE in the loop below.
+ *
+ * If we are in the first block and the dangle write pushes
+ * us to the next block, then loop below will not run
+ * and dest is not used. Hence we do not need to update
+ * it.
+ *
+ * If we are past the first block, then SOP_DISTANCE
+ * was never added, so there is nothing to do.
+ */
+ }
+
+ /* fill in rest of block */
+ while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+ writeq(0, dest);
+ dest += sizeof(u64);
+ }
+
+ /* finished with this buffer */
+ atomic_dec(&pbuf->sc->buffers_allocated);
+}
diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform_config.h
new file mode 100644
index 000000000000..8a94a8342052
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/platform_config.h
@@ -0,0 +1,286 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_CONFIG_H
+#define __PLATFORM_CONFIG_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT 0
+#define METADATA_TABLE_FIELD_START_LEN_BITS 15
+#define METADATA_TABLE_FIELD_LEN_SHIFT 16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4
+
+enum platform_config_table_type_encoding {
+ PLATFORM_CONFIG_TABLE_RESERVED,
+ PLATFORM_CONFIG_SYSTEM_TABLE,
+ PLATFORM_CONFIG_PORT_TABLE,
+ PLATFORM_CONFIG_RX_PRESET_TABLE,
+ PLATFORM_CONFIG_TX_PRESET_TABLE,
+ PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+ PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+ PLATFORM_CONFIG_TABLE_MAX
+};
+
+enum platform_config_system_table_fields {
+ SYSTEM_TABLE_RESERVED,
+ SYSTEM_TABLE_NODE_STRING,
+ SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+ SYSTEM_TABLE_NODE_GUID,
+ SYSTEM_TABLE_REVISION,
+ SYSTEM_TABLE_VENDOR_OUI,
+ SYSTEM_TABLE_META_VERSION,
+ SYSTEM_TABLE_DEVICE_ID,
+ SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+ SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+ SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+ SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+ SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+ SYSTEM_TABLE_MAX
+};
+
+enum platform_config_port_table_fields {
+ PORT_TABLE_RESERVED,
+ PORT_TABLE_PORT_TYPE,
+ PORT_TABLE_ATTENUATION_12G,
+ PORT_TABLE_ATTENUATION_25G,
+ PORT_TABLE_LINK_SPEED_SUPPORTED,
+ PORT_TABLE_LINK_WIDTH_SUPPORTED,
+ PORT_TABLE_VL_CAP,
+ PORT_TABLE_MTU_CAP,
+ PORT_TABLE_TX_LANE_ENABLE_MASK,
+ PORT_TABLE_LOCAL_MAX_TIMEOUT,
+ PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+ PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+ PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+ PORT_TABLE_RX_PRESET_IDX,
+ PORT_TABLE_CABLE_REACH_CLASS,
+ PORT_TABLE_MAX
+};
+
+enum platform_config_rx_preset_table_fields {
+ RX_PRESET_TABLE_RESERVED,
+ RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_EQ_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_CDR,
+ RX_PRESET_TABLE_QSFP_RX_EQ,
+ RX_PRESET_TABLE_QSFP_RX_AMP,
+ RX_PRESET_TABLE_MAX
+};
+
+enum platform_config_tx_preset_table_fields {
+ TX_PRESET_TABLE_RESERVED,
+ TX_PRESET_TABLE_PRECUR,
+ TX_PRESET_TABLE_ATTN,
+ TX_PRESET_TABLE_POSTCUR,
+ TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+ TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+ TX_PRESET_TABLE_QSFP_TX_CDR,
+ TX_PRESET_TABLE_QSFP_TX_EQ,
+ TX_PRESET_TABLE_MAX
+};
+
+enum platform_config_qsfp_attn_table_fields {
+ QSFP_ATTEN_TABLE_RESERVED,
+ QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+ QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+ QSFP_ATTEN_TABLE_MAX
+};
+
+enum platform_config_variable_settings_table_fields {
+ VARIABLE_SETTINGS_TABLE_RESERVED,
+ VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+ VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+ VARIABLE_SETTINGS_TABLE_MAX
+};
+
+struct platform_config_data {
+ u32 *table;
+ u32 *table_metadata;
+ u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ */
+struct platform_config_cache {
+ u8 cache_valid;
+ struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+ 0,
+ SYSTEM_TABLE_MAX,
+ PORT_TABLE_MAX,
+ RX_PRESET_TABLE_MAX,
+ TX_PRESET_TABLE_MAX,
+ QSFP_ATTEN_TABLE_MAX,
+ VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*=====================================================
+ * System table encodings
+ *====================================================*/
+#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ */
+enum platform_config_qsfp_power_class_encoding {
+ QSFP_POWER_CLASS_1 = 1,
+ QSFP_POWER_CLASS_2,
+ QSFP_POWER_CLASS_3,
+ QSFP_POWER_CLASS_4,
+ QSFP_POWER_CLASS_5,
+ QSFP_POWER_CLASS_6,
+ QSFP_POWER_CLASS_7
+};
+
+
+/*=====================================================
+ * Port table encodings
+ *==================================================== */
+enum platform_config_port_type_encoding {
+ PORT_TYPE_RESERVED,
+ PORT_TYPE_DISCONNECTED,
+ PORT_TYPE_FIXED,
+ PORT_TYPE_VARIABLE,
+ PORT_TYPE_QSFP,
+ PORT_TYPE_MAX
+};
+
+enum platform_config_link_speed_supported_encoding {
+ LINK_SPEED_SUPP_12G = 1,
+ LINK_SPEED_SUPP_25G,
+ LINK_SPEED_SUPP_12G_25G,
+ LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ */
+enum platform_config_link_width_supported_encoding {
+ LINK_WIDTH_SUPP_1X = 1,
+ LINK_WIDTH_SUPP_2X,
+ LINK_WIDTH_SUPP_2X_1X,
+ LINK_WIDTH_SUPP_3X,
+ LINK_WIDTH_SUPP_3X_1X,
+ LINK_WIDTH_SUPP_3X_2X,
+ LINK_WIDTH_SUPP_3X_2X_1X,
+ LINK_WIDTH_SUPP_4X,
+ LINK_WIDTH_SUPP_4X_1X,
+ LINK_WIDTH_SUPP_4X_2X,
+ LINK_WIDTH_SUPP_4X_2X_1X,
+ LINK_WIDTH_SUPP_4X_3X,
+ LINK_WIDTH_SUPP_4X_3X_1X,
+ LINK_WIDTH_SUPP_4X_3X_2X,
+ LINK_WIDTH_SUPP_4X_3X_2X_1X,
+ LINK_WIDTH_SUPP_MAX
+};
+
+enum platform_config_virtual_lane_capability_encoding {
+ VL_CAP_VL0 = 1,
+ VL_CAP_VL0_1,
+ VL_CAP_VL0_2,
+ VL_CAP_VL0_3,
+ VL_CAP_VL0_4,
+ VL_CAP_VL0_5,
+ VL_CAP_VL0_6,
+ VL_CAP_VL0_7,
+ VL_CAP_VL0_8,
+ VL_CAP_VL0_9,
+ VL_CAP_VL0_10,
+ VL_CAP_VL0_11,
+ VL_CAP_VL0_12,
+ VL_CAP_VL0_13,
+ VL_CAP_VL0_14,
+ VL_CAP_MAX
+};
+
+/* Max MTU */
+enum platform_config_mtu_capability_encoding {
+ MTU_CAP_256 = 1,
+ MTU_CAP_512 = 2,
+ MTU_CAP_1024 = 3,
+ MTU_CAP_2048 = 4,
+ MTU_CAP_4096 = 5,
+ MTU_CAP_8192 = 6,
+ MTU_CAP_10240 = 7
+};
+
+enum platform_config_local_max_timeout_encoding {
+ LOCAL_MAX_TIMEOUT_10_MS = 1,
+ LOCAL_MAX_TIMEOUT_100_MS,
+ LOCAL_MAX_TIMEOUT_1_S,
+ LOCAL_MAX_TIMEOUT_10_S,
+ LOCAL_MAX_TIMEOUT_100_S,
+ LOCAL_MAX_TIMEOUT_1000_S
+};
+
+#endif /*__PLATFORM_CONFIG_H*/
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
new file mode 100644
index 000000000000..df1fa56eaf85
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -0,0 +1,1687 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "sdma.h"
+
+#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+
+static unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct hfi1_qp *qp);
+static int iowait_sleep(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *stx,
+ unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+
+static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt,
+ struct qpn_map *map, unsigned off)
+{
+ return (map - qpt->map) * BITS_PER_PAGE + off;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
+
+static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map)
+{
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+ /*
+ * Free the page if someone raced with us installing it.
+ */
+
+ spin_lock(&qpt->lock);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock(&qpt->lock);
+}
+
+/*
+ * Allocate the next available QPN or
+ * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
+ */
+static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt,
+ enum ib_qp_type type, u8 port)
+{
+ u32 i, offset, max_scan, qpn;
+ struct qpn_map *map;
+ u32 ret;
+
+ if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+ unsigned n;
+
+ ret = type == IB_QPT_GSI;
+ n = 1 << (ret + 2 * (port - 1));
+ spin_lock(&qpt->lock);
+ if (qpt->flags & n)
+ ret = -EINVAL;
+ else
+ qpt->flags |= n;
+ spin_unlock(&qpt->lock);
+ goto bail;
+ }
+
+ qpn = qpt->last + qpt->incr;
+ if (qpn >= QPN_MAX)
+ qpn = qpt->incr | ((qpt->last & 1) ^ 1);
+ /* offset carries bit 0 */
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page))
+ break;
+ }
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ qpt->last = qpn;
+ ret = qpn;
+ goto bail;
+ }
+ offset += qpt->incr;
+ /*
+ * This qpn might be bogus if offset >= BITS_PER_PAGE.
+ * That is OK. It gets re-assigned below
+ */
+ qpn = mk_qpn(qpt, map, offset);
+ } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+ /*
+ * In order to keep the number of pages allocated to a
+ * minimum, we scan the all existing pages before increasing
+ * the size of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ /* start at incr with current bit 0 */
+ offset = qpt->incr | (offset & 1);
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ /* start at incr with current bit 0 */
+ offset = qpt->incr | (offset & 1);
+ } else {
+ map = &qpt->map[0];
+ /* wrap to first map page, invert bit 0 */
+ offset = qpt->incr | ((offset & 1) ^ 1);
+ }
+ /* there can be no bits at shift and below */
+ WARN_ON(offset & (dd->qos_shift - 1));
+ qpn = mk_qpn(qpt, map, offset);
+ }
+
+ ret = -ENOMEM;
+
+bail:
+ return ret;
+}
+
+static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn)
+{
+ struct qpn_map *map;
+
+ map = qpt->map + qpn / BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+}
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ unsigned long flags;
+
+ atomic_inc(&qp->refcount);
+ spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+ if (qp->ibqp.qp_num <= 1) {
+ rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp);
+ } else {
+ u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+
+ qp->next = dev->qp_dev->qp_table[n];
+ rcu_assign_pointer(dev->qp_dev->qp_table[n], qp);
+ trace_hfi1_qpinsert(qp, n);
+ }
+
+ spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+ unsigned long flags;
+ int removed = 1;
+
+ spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+ if (rcu_dereference_protected(ibp->qp[0],
+ lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(ibp->qp[0], NULL);
+ } else if (rcu_dereference_protected(ibp->qp[1],
+ lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(ibp->qp[1], NULL);
+ } else {
+ struct hfi1_qp *q;
+ struct hfi1_qp __rcu **qpp;
+
+ removed = 0;
+ qpp = &dev->qp_dev->qp_table[n];
+ for (; (q = rcu_dereference_protected(*qpp,
+ lockdep_is_held(&dev->qp_dev->qpt_lock)))
+ != NULL;
+ qpp = &q->next)
+ if (q == qp) {
+ RCU_INIT_POINTER(*qpp,
+ rcu_dereference_protected(qp->next,
+ lockdep_is_held(&dev->qp_dev->qpt_lock)));
+ removed = 1;
+ trace_hfi1_qpremove(qp, n);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+ if (removed) {
+ synchronize_rcu();
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+/**
+ * free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+static unsigned free_all_qps(struct hfi1_devdata *dd)
+{
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ unsigned long flags;
+ struct hfi1_qp *qp;
+ unsigned n, qp_inuse = 0;
+
+ for (n = 0; n < dd->num_pports; n++) {
+ struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
+
+ if (!hfi1_mcast_tree_empty(ibp))
+ qp_inuse++;
+ rcu_read_lock();
+ if (rcu_dereference(ibp->qp[0]))
+ qp_inuse++;
+ if (rcu_dereference(ibp->qp[1]))
+ qp_inuse++;
+ rcu_read_unlock();
+ }
+
+ if (!dev->qp_dev)
+ goto bail;
+ spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+ for (n = 0; n < dev->qp_dev->qp_table_size; n++) {
+ qp = rcu_dereference_protected(dev->qp_dev->qp_table[n],
+ lockdep_is_held(&dev->qp_dev->qpt_lock));
+ RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL);
+
+ for (; qp; qp = rcu_dereference_protected(qp->next,
+ lockdep_is_held(&dev->qp_dev->qpt_lock)))
+ qp_inuse++;
+ }
+ spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+ synchronize_rcu();
+bail:
+ return qp_inuse;
+}
+
+/**
+ * reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ iowait_init(
+ &qp->s_iowait,
+ 1,
+ hfi1_do_send,
+ iowait_sleep,
+ iowait_wakeup);
+ qp->s_flags &= HFI1_S_SIGNAL_REQ_WR;
+ qp->s_hdrwords = 0;
+ qp->s_wqe = NULL;
+ qp->s_draining = 0;
+ qp->s_next_psn = 0;
+ qp->s_last_psn = 0;
+ qp->s_sending_psn = 0;
+ qp->s_sending_hpsn = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ qp->r_msn = 0;
+ if (type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->r_nak_state = 0;
+ qp->r_aflags = 0;
+ qp->r_flags = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_acked = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ clear_ahg(qp);
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+ qp->r_head_ack_queue = 0;
+ qp->s_tail_ack_queue = 0;
+ qp->s_num_rd_atomic = 0;
+ if (qp->r_rq.wq) {
+ qp->r_rq.wq->head = 0;
+ qp->r_rq.wq->tail = 0;
+ }
+ qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends)
+{
+ unsigned n;
+
+ if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+ hfi1_put_ss(&qp->s_rdma_read_sge);
+
+ hfi1_put_ss(&qp->r_sge);
+
+ if (clr_sends) {
+ while (qp->s_last != qp->s_head) {
+ struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+ unsigned i;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct hfi1_sge *sge = &wqe->sg_list[i];
+
+ hfi1_put_mr(sge->mr);
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ }
+ if (qp->s_rdma_mr) {
+ hfi1_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ }
+
+ if (qp->ibqp.qp_type != IB_QPT_RC)
+ return;
+
+ for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+ struct hfi1_ack_entry *e = &qp->s_ack_queue[n];
+
+ if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ e->rdma_sge.mr) {
+ hfi1_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ }
+}
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ int ret = 0;
+
+ if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+ goto bail;
+
+ qp->state = IB_QPS_ERR;
+
+ if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+ qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ if (qp->s_flags & HFI1_S_ANY_WAIT_SEND)
+ qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND;
+
+ write_seqlock(&dev->iowait_lock);
+ if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) {
+ qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+ list_del_init(&qp->s_iowait.list);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ write_sequnlock(&dev->iowait_lock);
+
+ if (!(qp->s_flags & HFI1_S_BUSY)) {
+ qp->s_hdrwords = 0;
+ if (qp->s_rdma_mr) {
+ hfi1_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ flush_tx_list(qp);
+ }
+
+ /* Schedule the sending tasklet to drain the send work queue. */
+ if (qp->s_last != qp->s_head)
+ hfi1_schedule_send(qp);
+
+ clear_mr_refs(qp, 0);
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+
+ if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) {
+ wc.wr_id = qp->r_wr_id;
+ wc.status = err;
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wc.status = IB_WC_WR_FLUSH_ERR;
+
+ if (qp->r_rq.wq) {
+ struct hfi1_rwq *wq;
+ u32 head;
+ u32 tail;
+
+ spin_lock(&qp->r_rq.lock);
+
+ /* sanity check pointers before trusting them */
+ wq = qp->r_rq.wq;
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ while (tail != head) {
+ wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+ if (++tail >= qp->r_rq.size)
+ tail = 0;
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wq->tail = tail;
+
+ spin_unlock(&qp->r_rq.lock);
+ } else if (qp->ibqp.event_handler)
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+static void flush_tx_list(struct hfi1_qp *qp)
+{
+ while (!list_empty(&qp->s_iowait.tx_head)) {
+ struct sdma_txreq *tx;
+
+ tx = list_first_entry(
+ &qp->s_iowait.tx_head,
+ struct sdma_txreq,
+ list);
+ list_del_init(&tx->list);
+ hfi1_put_txreq(
+ container_of(tx, struct verbs_txreq, txreq));
+ }
+}
+
+static void flush_iowait(struct hfi1_qp *qp)
+{
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ if (!list_empty(&qp->s_iowait.list)) {
+ list_del_init(&qp->s_iowait.list);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+ switch (mtu) {
+ case OPA_MTU_8192: return 8192;
+ case OPA_MTU_10240: return 10240;
+ default: return -1;
+ }
+}
+
+/**
+ * This function is what we would push to the core layer if we wanted to be a
+ * "first class citizen". Instead we hide this here and rely on Verbs ULPs
+ * to blindly pass the MTU enum value from the PathRecord to us.
+ *
+ * The actual flag used to determine "8k MTU" will change and is currently
+ * unknown.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+ int val = opa_mtu_enum_to_int((int)mtu);
+
+ if (val > 0)
+ return val;
+ return ib_mtu_enum_to_int(mtu);
+}
+
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct hfi1_ibdev *dev = to_idev(ibqp->device);
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ struct ib_event ev;
+ int lastwqe = 0;
+ int mig = 0;
+ int ret;
+ u32 pmtu = 0; /* for gcc warning only */
+ struct hfi1_devdata *dd;
+
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ?
+ attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+ goto inval;
+
+ if (attr_mask & IB_QP_AV) {
+ if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+ goto inval;
+ if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+ goto inval;
+ if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+ goto inval;
+ if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+ goto inval;
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ if (attr->min_rnr_timer > 31)
+ goto inval;
+
+ if (attr_mask & IB_QP_PORT)
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI ||
+ attr->port_num == 0 ||
+ attr->port_num > ibqp->device->phys_port_cnt)
+ goto inval;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ if (attr->dest_qp_num > HFI1_QPN_MASK)
+ goto inval;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ if (attr->retry_cnt > 7)
+ goto inval;
+
+ if (attr_mask & IB_QP_RNR_RETRY)
+ if (attr->rnr_retry > 7)
+ goto inval;
+
+ /*
+ * Don't allow invalid path_mtu values. OK to set greater
+ * than the active mtu (or even the max_cap, if we have tuned
+ * that to a small mtu. We'll set qp->path_mtu
+ * to the lesser of requested attribute mtu and active,
+ * for packetizing messages.
+ * Note that the QP port has to be set in INIT and MTU in RTR.
+ */
+ if (attr_mask & IB_QP_PATH_MTU) {
+ int mtu, pidx = qp->port_num - 1;
+
+ dd = dd_from_dev(dev);
+ mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu);
+ if (mtu == -1)
+ goto inval;
+
+ if (mtu > dd->pport[pidx].ibmtu)
+ pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
+ else
+ pmtu = attr->path_mtu;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ if (attr->path_mig_state == IB_MIG_REARM) {
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ goto inval;
+ if (new_state != IB_QPS_RTS)
+ goto inval;
+ } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+ if (qp->s_mig_state == IB_MIG_REARM)
+ goto inval;
+ if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+ goto inval;
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ mig = 1;
+ } else
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC)
+ goto inval;
+
+ switch (new_state) {
+ case IB_QPS_RESET:
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ flush_iowait(qp);
+ qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+ /* Stop the sending work queue and retry timer */
+ cancel_work_sync(&qp->s_iowait.iowork);
+ del_timer_sync(&qp->s_timer);
+ iowait_sdma_drain(&qp->s_iowait);
+ flush_tx_list(qp);
+ remove_qp(dev, qp);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+ clear_mr_refs(qp, 1);
+ clear_ahg(qp);
+ reset_qp(qp, ibqp->qp_type);
+ }
+ break;
+
+ case IB_QPS_RTR:
+ /* Allow event to re-trigger if QP set to RTR more than once */
+ qp->r_flags &= ~HFI1_R_COMM_EST;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQD:
+ qp->s_draining = qp->s_last != qp->s_cur;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQE:
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ goto inval;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_ERR:
+ lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ qp->state = new_state;
+ break;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ if (attr_mask & IB_QP_PORT)
+ qp->port_num = attr->port_num;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK;
+ qp->s_psn = qp->s_next_psn;
+ qp->s_sending_psn = qp->s_next_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ qp->s_sending_hpsn = qp->s_last_psn;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV) {
+ qp->remote_ah_attr = attr->ah_attr;
+ qp->s_srate = attr->ah_attr.static_rate;
+ qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ qp->alt_ah_attr = attr->alt_ah_attr;
+ qp->s_alt_pkey_index = attr->alt_pkey_index;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ qp->s_mig_state = attr->path_mig_state;
+ if (mig) {
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = qp->alt_ah_attr.port_num;
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+ qp->s_flags |= HFI1_S_AHG_CLEAR;
+ }
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU) {
+ struct hfi1_ibport *ibp;
+ u8 sc, vl;
+ u32 mtu;
+
+ dd = dd_from_dev(dev);
+ ibp = &dd->pport[qp->port_num - 1].ibport_data;
+
+ sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ vl = sc_to_vlt(dd, sc);
+
+ mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu);
+ if (vl < PER_VL_SEND_CONTEXTS)
+ mtu = min_t(u32, mtu, dd->vld[vl].mtu);
+ pmtu = mtu_to_enum(mtu, OPA_MTU_8192);
+
+ qp->path_mtu = pmtu;
+ qp->pmtu = mtu;
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp->s_retry_cnt = attr->retry_cnt;
+ qp->s_retry = attr->retry_cnt;
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry_cnt = attr->rnr_retry;
+ qp->s_rnr_retry = attr->rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp->timeout = attr->timeout;
+ qp->timeout_jiffies =
+ usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+ 1000UL);
+ }
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ insert_qp(dev, qp);
+
+ if (lastwqe) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ if (mig) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ ret = 0;
+ goto bail;
+
+inval:
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+
+ attr->qp_state = qp->state;
+ attr->cur_qp_state = attr->qp_state;
+ attr->path_mtu = qp->path_mtu;
+ attr->path_mig_state = qp->s_mig_state;
+ attr->qkey = qp->qkey;
+ attr->rq_psn = mask_psn(qp->r_psn);
+ attr->sq_psn = mask_psn(qp->s_next_psn);
+ attr->dest_qp_num = qp->remote_qpn;
+ attr->qp_access_flags = qp->qp_access_flags;
+ attr->cap.max_send_wr = qp->s_size - 1;
+ attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+ attr->cap.max_send_sge = qp->s_max_sge;
+ attr->cap.max_recv_sge = qp->r_rq.max_sge;
+ attr->cap.max_inline_data = 0;
+ attr->ah_attr = qp->remote_ah_attr;
+ attr->alt_ah_attr = qp->alt_ah_attr;
+ attr->pkey_index = qp->s_pkey_index;
+ attr->alt_pkey_index = qp->s_alt_pkey_index;
+ attr->en_sqd_async_notify = 0;
+ attr->sq_draining = qp->s_draining;
+ attr->max_rd_atomic = qp->s_max_rd_atomic;
+ attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+ attr->min_rnr_timer = qp->r_min_rnr_timer;
+ attr->port_num = qp->port_num;
+ attr->timeout = qp->timeout;
+ attr->retry_cnt = qp->s_retry_cnt;
+ attr->rnr_retry = qp->s_rnr_retry_cnt;
+ attr->alt_port_num = qp->alt_ah_attr.port_num;
+ attr->alt_timeout = qp->alt_timeout;
+
+ init_attr->event_handler = qp->ibqp.event_handler;
+ init_attr->qp_context = qp->ibqp.qp_context;
+ init_attr->send_cq = qp->ibqp.send_cq;
+ init_attr->recv_cq = qp->ibqp.recv_cq;
+ init_attr->srq = qp->ibqp.srq;
+ init_attr->cap = attr->cap;
+ if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR)
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ else
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->qp_type = qp->ibqp.qp_type;
+ init_attr->port_num = qp->port_num;
+ return 0;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp)
+{
+ u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ struct hfi1_rwq *wq = qp->r_rq.wq;
+ u32 head;
+ u32 tail;
+
+ /* sanity check pointers before trusting them */
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * There is a small chance that the pair of reads are
+ * not atomic, which is OK, since the fuzziness is
+ * resolved as further ACKs go out.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits)
+ max = x;
+ else if (min == x)
+ break;
+ else
+ min = x;
+ }
+ aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct hfi1_qp *qp;
+ int err;
+ struct hfi1_swqe *swq = NULL;
+ struct hfi1_ibdev *dev;
+ struct hfi1_devdata *dd;
+ size_t sz;
+ size_t sg_list_sz;
+ struct ib_qp *ret;
+
+ if (init_attr->cap.max_send_sge > hfi1_max_sges ||
+ init_attr->cap.max_send_wr > hfi1_max_qp_wrs ||
+ init_attr->create_flags) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ /* Check receive queue parameters if no SRQ is specified. */
+ if (!init_attr->srq) {
+ if (init_attr->cap.max_recv_sge > hfi1_max_sges ||
+ init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ if (init_attr->cap.max_send_sge +
+ init_attr->cap.max_send_wr +
+ init_attr->cap.max_recv_sge +
+ init_attr->cap.max_recv_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ }
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (init_attr->port_num == 0 ||
+ init_attr->port_num > ibpd->device->phys_port_cnt) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ case IB_QPT_UD:
+ sz = sizeof(struct hfi1_sge) *
+ init_attr->cap.max_send_sge +
+ sizeof(struct hfi1_swqe);
+ swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ if (swq == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ sz = sizeof(*qp);
+ sg_list_sz = 0;
+ if (init_attr->srq) {
+ struct hfi1_srq *srq = to_isrq(init_attr->srq);
+
+ if (srq->rq.max_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (srq->rq.max_sge - 1);
+ } else if (init_attr->cap.max_recv_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (init_attr->cap.max_recv_sge - 1);
+ qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+ if (!qp) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_swq;
+ }
+ RCU_INIT_POINTER(qp->next, NULL);
+ qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+ if (!qp->s_hdr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ qp->timeout_jiffies =
+ usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+ 1000UL);
+ if (init_attr->srq)
+ sz = 0;
+ else {
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+ sizeof(struct hfi1_rwqe);
+ qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) +
+ qp->r_rq.size * sz);
+ if (!qp->r_rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->r_lock);
+ spin_lock_init(&qp->s_lock);
+ spin_lock_init(&qp->r_rq.lock);
+ atomic_set(&qp->refcount, 0);
+ init_waitqueue_head(&qp->wait);
+ init_timer(&qp->s_timer);
+ qp->s_timer.data = (unsigned long)qp;
+ INIT_LIST_HEAD(&qp->rspwait);
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
+ qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+ qp->s_flags = HFI1_S_SIGNAL_REQ_WR;
+ dev = to_idev(ibpd->device);
+ dd = dd_from_dev(dev);
+ err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type,
+ init_attr->port_num);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ vfree(qp->r_rq.wq);
+ goto bail_qp;
+ }
+ qp->ibqp.qp_num = err;
+ qp->port_num = init_attr->port_num;
+ reset_qp(qp, init_attr->qp_type);
+
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ ret = ERR_PTR(-ENOSYS);
+ goto bail;
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See hfi1_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ if (!qp->r_rq.wq) {
+ __u64 offset = 0;
+
+ err = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else {
+ u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz;
+
+ qp->ip = hfi1_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ qp->r_rq.wq);
+ if (!qp->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ err = ib_copy_to_udata(udata, &(qp->ip->offset),
+ sizeof(qp->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ }
+ }
+
+ spin_lock(&dev->n_qps_lock);
+ if (dev->n_qps_allocated == hfi1_max_qps) {
+ spin_unlock(&dev->n_qps_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_qps_allocated++;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &qp->ibqp;
+
+ /*
+ * We have our QP and its good, now keep track of what types of opcodes
+ * can be processed on this QP. We do this by keeping track of what the
+ * 3 high order bits of the opcode are.
+ */
+ switch (init_attr->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK;
+ break;
+ case IB_QPT_RC:
+ qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK;
+ break;
+ case IB_QPT_UC:
+ qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK;
+ break;
+ default:
+ ret = ERR_PTR(-EINVAL);
+ goto bail_ip;
+ }
+
+ goto bail;
+
+bail_ip:
+ if (qp->ip)
+ kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+bail_qp:
+ kfree(qp->s_hdr);
+ kfree(qp);
+bail_swq:
+ vfree(swq);
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ struct hfi1_ibdev *dev = to_idev(ibqp->device);
+
+ /* Make sure HW and driver activity is stopped. */
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ flush_iowait(qp);
+ qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+ cancel_work_sync(&qp->s_iowait.iowork);
+ del_timer_sync(&qp->s_timer);
+ iowait_sdma_drain(&qp->s_iowait);
+ flush_tx_list(qp);
+ remove_qp(dev, qp);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+ clear_mr_refs(qp, 1);
+ clear_ahg(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+
+ /* all user's cleaned up, mark it available */
+ free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+ spin_lock(&dev->n_qps_lock);
+ dev->n_qps_allocated--;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip)
+ kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ vfree(qp->s_wq);
+ kfree(qp->s_hdr);
+ kfree(qp);
+ return 0;
+}
+
+/**
+ * init_qpn_table - initialize the QP number table for a device
+ * @qpt: the QPN table
+ */
+static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt)
+{
+ u32 offset, qpn, i;
+ struct qpn_map *map;
+ int ret = 0;
+
+ spin_lock_init(&qpt->lock);
+
+ qpt->last = 0;
+ qpt->incr = 1 << dd->qos_shift;
+
+ /* insure we don't assign QPs from KDETH 64K window */
+ qpn = kdeth_qp << 16;
+ qpt->nmaps = qpn / BITS_PER_PAGE;
+ /* This should always be zero */
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpt->nmaps];
+ dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n",
+ qpn, qpn + 65535);
+ for (i = 0; i < 65536; i++) {
+ if (!map->page) {
+ get_map_page(qpt, map);
+ if (!map->page) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ set_bit(offset, map->page);
+ offset++;
+ if (offset == BITS_PER_PAGE) {
+ /* next page */
+ qpt->nmaps++;
+ map++;
+ offset = 0;
+ }
+ }
+ return ret;
+}
+
+/**
+ * free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+static void free_qpn_table(struct hfi1_qpn_table *qpt)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+ free_page((unsigned long) qpt->map[i].page);
+}
+
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == HFI1_AETH_CREDIT_INVAL) {
+ if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+ qp->s_flags |= HFI1_S_UNLIMITED_CREDIT;
+ if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+ hfi1_schedule_send(qp);
+ }
+ }
+ } else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+ if (cmp_msn(credit, qp->s_lsn) > 0) {
+ qp->s_lsn = credit;
+ if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+ hfi1_schedule_send(qp);
+ }
+ }
+ }
+}
+
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & flag) {
+ qp->s_flags &= ~flag;
+ trace_hfi1_qpwakeup(qp, flag);
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ /* Notify hfi1_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static int iowait_sleep(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *stx,
+ unsigned seq)
+{
+ struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+ struct hfi1_qp *qp;
+ unsigned long flags;
+ int ret = 0;
+ struct hfi1_ibdev *dev;
+
+ qp = tx->qp;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ /* Make a common routine? */
+ dev = &sde->dd->verbs_dev;
+ list_add_tail(&stx->list, &wait->tx_head);
+ write_seqlock(&dev->iowait_lock);
+ if (sdma_progress(sde, seq, stx))
+ goto eagain;
+ if (list_empty(&qp->s_iowait.list)) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+
+ ibp->n_dmawait++;
+ qp->s_flags |= HFI1_S_WAIT_DMA_DESC;
+ list_add_tail(&qp->s_iowait.list, &sde->dmawait);
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC);
+ atomic_inc(&qp->refcount);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~HFI1_S_BUSY;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ret = -EBUSY;
+ } else {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ hfi1_put_txreq(tx);
+ }
+ return ret;
+eagain:
+ write_sequnlock(&dev->iowait_lock);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ list_del_init(&stx->list);
+ return -EAGAIN;
+}
+
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+ struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+
+ WARN_ON(reason != SDMA_AVAIL_REASON);
+ hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC);
+}
+
+int hfi1_qp_init(struct hfi1_ibdev *dev)
+{
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ int i;
+ int ret = -ENOMEM;
+
+ /* allocate parent object */
+ dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL);
+ if (!dev->qp_dev)
+ goto nomem;
+ /* allocate hash table */
+ dev->qp_dev->qp_table_size = hfi1_qp_table_size;
+ dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size);
+ dev->qp_dev->qp_table =
+ kmalloc(dev->qp_dev->qp_table_size *
+ sizeof(*dev->qp_dev->qp_table),
+ GFP_KERNEL);
+ if (!dev->qp_dev->qp_table)
+ goto nomem;
+ for (i = 0; i < dev->qp_dev->qp_table_size; i++)
+ RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL);
+ spin_lock_init(&dev->qp_dev->qpt_lock);
+ /* initialize qpn map */
+ ret = init_qpn_table(dd, &dev->qp_dev->qpn_table);
+ if (ret)
+ goto nomem;
+ return ret;
+nomem:
+ if (dev->qp_dev) {
+ kfree(dev->qp_dev->qp_table);
+ free_qpn_table(&dev->qp_dev->qpn_table);
+ kfree(dev->qp_dev);
+ }
+ return ret;
+}
+
+void hfi1_qp_exit(struct hfi1_ibdev *dev)
+{
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ u32 qps_inuse;
+
+ qps_inuse = free_all_qps(dd);
+ if (qps_inuse)
+ dd_dev_err(dd, "QP memory leak! %u still in use\n",
+ qps_inuse);
+ if (dev->qp_dev) {
+ kfree(dev->qp_dev->qp_table);
+ free_qpn_table(&dev->qp_dev->qpn_table);
+ kfree(dev->qp_dev);
+ }
+}
+
+/**
+ *
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send engine for the qp or NULL for SMI type qp.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct sdma_engine *sde;
+
+ if (!(dd->flags & HFI1_HAS_SEND_DMA))
+ return NULL;
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ break;
+ case IB_QPT_SMI:
+ return NULL;
+ default:
+ break;
+ }
+ sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
+ return sde;
+}
+
+struct qp_iter {
+ struct hfi1_ibdev *dev;
+ struct hfi1_qp *qp;
+ int specials;
+ int n;
+};
+
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+ struct qp_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ iter->specials = dev->ibdev.phys_port_cnt * 2;
+ if (qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int qp_iter_next(struct qp_iter *iter)
+{
+ struct hfi1_ibdev *dev = iter->dev;
+ int n = iter->n;
+ int ret = 1;
+ struct hfi1_qp *pqp = iter->qp;
+ struct hfi1_qp *qp;
+
+ /*
+ * The approach is to consider the special qps
+ * as an additional table entries before the
+ * real hash table. Since the qp code sets
+ * the qp->next hash link to NULL, this works just fine.
+ *
+ * iter->specials is 2 * # ports
+ *
+ * n = 0..iter->specials is the special qp indices
+ *
+ * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials are
+ * the potential hash bucket entries
+ *
+ */
+ for (; n < dev->qp_dev->qp_table_size + iter->specials; n++) {
+ if (pqp) {
+ qp = rcu_dereference(pqp->next);
+ } else {
+ if (n < iter->specials) {
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ int pidx;
+
+ pidx = n % dev->ibdev.phys_port_cnt;
+ ppd = &dd_from_dev(dev)->pport[pidx];
+ ibp = &ppd->ibport_data;
+
+ if (!(n & 1))
+ qp = rcu_dereference(ibp->qp[0]);
+ else
+ qp = rcu_dereference(ibp->qp[1]);
+ } else {
+ qp = rcu_dereference(
+ dev->qp_dev->qp_table[
+ (n - iter->specials)]);
+ }
+ }
+ pqp = qp;
+ if (qp) {
+ iter->qp = qp;
+ iter->n = n;
+ return 0;
+ }
+ }
+ return ret;
+}
+
+static const char * const qp_type_str[] = {
+ "SMI", "GSI", "RC", "UC", "UD",
+};
+
+static int qp_idle(struct hfi1_qp *qp)
+{
+ return
+ qp->s_last == qp->s_acked &&
+ qp->s_acked == qp->s_cur &&
+ qp->s_cur == qp->s_tail &&
+ qp->s_tail == qp->s_head;
+}
+
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+ struct hfi1_swqe *wqe;
+ struct hfi1_qp *qp = iter->qp;
+ struct sdma_engine *sde;
+
+ sde = qp_to_sdma_engine(qp, qp->s_sc);
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ seq_printf(s,
+ "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n",
+ iter->n,
+ qp_idle(qp) ? "I" : "B",
+ qp->ibqp.qp_num,
+ atomic_read(&qp->refcount),
+ qp_type_str[qp->ibqp.qp_type],
+ qp->state,
+ wqe ? wqe->wr.opcode : 0,
+ qp->s_hdrwords,
+ qp->s_flags,
+ atomic_read(&qp->s_iowait.sdma_busy),
+ !list_empty(&qp->s_iowait.list),
+ qp->timeout,
+ wqe ? wqe->ssn : 0,
+ qp->s_lsn,
+ qp->s_last_psn,
+ qp->s_psn, qp->s_next_psn,
+ qp->s_sending_psn, qp->s_sending_hpsn,
+ qp->s_last, qp->s_acked, qp->s_cur,
+ qp->s_tail, qp->s_head, qp->s_size,
+ qp->remote_qpn,
+ qp->remote_ah_attr.dlid,
+ qp->remote_ah_attr.sl,
+ qp->pmtu,
+ qp->s_retry_cnt,
+ qp->timeout,
+ qp->s_rnr_retry_cnt,
+ sde,
+ sde ? sde->this_idx : 0);
+}
+
+void qp_comm_est(struct hfi1_qp *qp)
+{
+ qp->r_flags |= HFI1_R_COMM_EST;
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_COMM_EST;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
new file mode 100644
index 000000000000..6b505859b59c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -0,0 +1,235 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include "verbs.h"
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+ void *page;
+};
+
+struct hfi1_qpn_table {
+ spinlock_t lock; /* protect changes in this struct */
+ unsigned flags; /* flags for QP0/1 allocated for each port */
+ u32 last; /* last QP number allocated */
+ u32 nmaps; /* size of the map table */
+ u16 limit;
+ u8 incr;
+ /* bit map of free QP numbers other than 0/1 */
+ struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct hfi1_qp_ibdev {
+ u32 qp_table_size;
+ u32 qp_table_bits;
+ struct hfi1_qp __rcu **qp_table;
+ spinlock_t qpt_lock;
+ struct hfi1_qpn_table qpn_table;
+};
+
+static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn)
+{
+ return hash_32(qpn, dev->qp_table_bits);
+}
+
+/**
+ * hfi1_lookup_qpn - return the QP with the given QPN
+ * @ibp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * The caller must hold the rcu_read_lock(), and keep the lock until
+ * the returned qp is no longer in use.
+ */
+static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp,
+ u32 qpn) __must_hold(RCU)
+{
+ struct hfi1_qp *qp = NULL;
+
+ if (unlikely(qpn <= 1)) {
+ qp = rcu_dereference(ibp->qp[qpn]);
+ } else {
+ struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
+ u32 n = qpn_hash(dev->qp_dev, qpn);
+
+ for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp;
+ qp = rcu_dereference(qp->next))
+ if (qp->ibqp.qp_num == qpn)
+ break;
+ }
+ return qp;
+}
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err);
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp);
+
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_init - allocate QP tables
+ * @dev: a pointer to the hfi1_ibdev
+ */
+int hfi1_qp_init(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_exit - free the QP related structures
+ * @dev: a pointer to the hfi1_ibdev
+ */
+void hfi1_qp_exit(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_waitup - wake up on the indicated event
+ * @qp: the QP
+ * @flag: flag the qp on which the qp is stalled
+ */
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - wake up on the indicated event
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - wakeup on the indicated event
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_next - wake up on the indicated event
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void qp_comm_est(struct hfi1_qp *qp);
+
+#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
new file mode 100644
index 000000000000..3138936157db
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -0,0 +1,546 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
+ * in twsi.c
+ */
+#define I2C_MAX_RETRY 4
+
+/*
+ * Unlocked i2c write. Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int ret, cnt;
+ u8 *buff = bp;
+
+ /* Make sure TWSI bus is in sane state. */
+ ret = hfi1_twsi_reset(dd, target);
+ if (ret) {
+ hfi1_dev_porterr(dd, ppd->port,
+ "I2C interface Reset for write failed\n");
+ return -EIO;
+ }
+
+ cnt = 0;
+ while (cnt < len) {
+ int wlen = len - cnt;
+
+ ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
+ buff + cnt, wlen);
+ if (ret) {
+ /* hfi1_twsi_blk_wr() 1 for error, else 0 */
+ return -EIO;
+ }
+ offset += wlen;
+ cnt += wlen;
+ }
+
+ /* Must wait min 20us between qsfp i2c transactions */
+ udelay(20);
+
+ return cnt;
+}
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+ void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+ if (!ret) {
+ ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+ mutex_unlock(&dd->qsfp_i2c_mutex);
+ }
+
+ return ret;
+}
+
+/*
+ * Unlocked i2c read. Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int ret, cnt, pass = 0;
+ int stuck = 0;
+ u8 *buff = bp;
+
+ /* Make sure TWSI bus is in sane state. */
+ ret = hfi1_twsi_reset(dd, target);
+ if (ret) {
+ hfi1_dev_porterr(dd, ppd->port,
+ "I2C interface Reset for read failed\n");
+ ret = -EIO;
+ stuck = 1;
+ goto exit;
+ }
+
+ cnt = 0;
+ while (cnt < len) {
+ int rlen = len - cnt;
+
+ ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
+ buff + cnt, rlen);
+ /* Some QSFP's fail first try. Retry as experiment */
+ if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
+ continue;
+ if (ret) {
+ /* hfi1_twsi_blk_rd() 1 for error, else 0 */
+ ret = -EIO;
+ goto exit;
+ }
+ offset += rlen;
+ cnt += rlen;
+ }
+
+ ret = cnt;
+
+exit:
+ if (stuck)
+ dd_dev_err(dd, "I2C interface bus stuck non-idle\n");
+
+ if (pass >= I2C_MAX_RETRY && ret)
+ hfi1_dev_porterr(dd, ppd->port,
+ "I2C failed even retrying\n");
+ else if (pass)
+ hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass);
+
+ /* Must wait min 20us between qsfp i2c transactions */
+ udelay(20);
+
+ return ret;
+}
+
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+ void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+ if (!ret) {
+ ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+ mutex_unlock(&dd->qsfp_i2c_mutex);
+ }
+
+ return ret;
+}
+
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ int count = 0;
+ int offset;
+ int nwrite;
+ int ret;
+ u8 page;
+
+ ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+ if (ret)
+ return ret;
+
+ while (count < len) {
+ /*
+ * Set the qsfp page based on a zero-based addresss
+ * and a page size of QSFP_PAGESIZE bytes.
+ */
+ page = (u8)(addr / QSFP_PAGESIZE);
+
+ ret = __i2c_write(ppd, target, QSFP_DEV,
+ QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+ if (ret != 1) {
+ hfi1_dev_porterr(
+ ppd->dd,
+ ppd->port,
+ "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+ ret = -EIO;
+ break;
+ }
+
+ /* truncate write to end of page if crossing page boundary */
+ offset = addr % QSFP_PAGESIZE;
+ nwrite = len - count;
+ if ((offset + nwrite) > QSFP_PAGESIZE)
+ nwrite = QSFP_PAGESIZE - offset;
+
+ ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count,
+ nwrite);
+ if (ret <= 0) /* stop on error or nothing read */
+ break;
+
+ count += ret;
+ addr += ret;
+ }
+
+ mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ int count = 0;
+ int offset;
+ int nread;
+ int ret;
+ u8 page;
+
+ ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+ if (ret)
+ return ret;
+
+ while (count < len) {
+ /*
+ * Set the qsfp page based on a zero-based address
+ * and a page size of QSFP_PAGESIZE bytes.
+ */
+ page = (u8)(addr / QSFP_PAGESIZE);
+ ret = __i2c_write(ppd, target, QSFP_DEV,
+ QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+ if (ret != 1) {
+ hfi1_dev_porterr(
+ ppd->dd,
+ ppd->port,
+ "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+ ret = -EIO;
+ break;
+ }
+
+ /* truncate read to end of page if crossing page boundary */
+ offset = addr % QSFP_PAGESIZE;
+ nread = len - count;
+ if ((offset + nread) > QSFP_PAGESIZE)
+ nread = QSFP_PAGESIZE - offset;
+
+ ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count,
+ nread);
+ if (ret <= 0) /* stop on error or nothing read */
+ break;
+
+ count += ret;
+ addr += ret;
+ }
+
+ mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if existing) rather than byte 0 from lower page 00H.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+ u32 target = ppd->dd->hfi1_id;
+ int ret;
+ unsigned long flags;
+ u8 *cache = &cp->cache[0];
+
+ /* ensure sane contents on invalid reads, for cable swaps */
+ memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+ dd_dev_info(ppd->dd, "%s: called\n", __func__);
+ if (!qsfp_mod_present(ppd)) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ ret = qsfp_read(ppd, target, 0, cache, 256);
+ if (ret != 256) {
+ dd_dev_info(ppd->dd,
+ "%s: Read of pages 00H failed, expected 256, got %d\n",
+ __func__, ret);
+ goto bail;
+ }
+
+ if (cache[0] != 0x0C && cache[0] != 0x0D)
+ goto bail;
+
+ /* Is paging enabled? */
+ if (!(cache[2] & 4)) {
+
+ /* Paging enabled, page 03 required */
+ if ((cache[195] & 0xC0) == 0xC0) {
+ /* all */
+ ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ } else if ((cache[195] & 0x80) == 0x80) {
+ /* only page 2 and 3 */
+ ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ } else if ((cache[195] & 0x40) == 0x40) {
+ /* only page 1 and 3 */
+ ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ } else {
+ /* only page 3 */
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+ goto bail;
+ }
+ }
+ }
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.cache_valid = 1;
+ ppd->qsfp_info.cache_refresh_required = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+ return 0;
+
+bail:
+ memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+ return ret;
+}
+
+const char * const hfi1_qsfp_devtech[16] = {
+ "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+ "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+ "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+ "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+ if (HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+
+ reg = read_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+ return !(reg & QSFP_HFI0_MODPRST_N);
+ }
+ /* always return cable present */
+ return 1;
+}
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, it returns the invalid range of data buffer
+ * set to 0.
+ * For upper pages that are optional, if they are not valid, returns the
+ * particular range of bytes in the data buffer set to 0.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+ u8 *data)
+{
+ struct hfi1_pportdata *ppd;
+ u32 excess_len = 0;
+ int ret = 0;
+
+ if (port_num > dd->num_pports || port_num < 1) {
+ dd_dev_info(dd, "%s: Invalid port number %d\n",
+ __func__, port_num);
+ ret = -EINVAL;
+ goto set_zeroes;
+ }
+
+ ppd = dd->pport + (port_num - 1);
+ if (!qsfp_mod_present(ppd)) {
+ ret = -ENODEV;
+ goto set_zeroes;
+ }
+
+ if (!ppd->qsfp_info.cache_valid) {
+ ret = -EINVAL;
+ goto set_zeroes;
+ }
+
+ if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
+ ret = -ERANGE;
+ goto set_zeroes;
+ }
+
+ if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
+ excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
+ memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+ data += (len - excess_len);
+ goto set_zeroes;
+ }
+
+ memcpy(data, &ppd->qsfp_info.cache[addr], len);
+ return 0;
+
+set_zeroes:
+ memset(data, 0, excess_len);
+ return ret;
+}
+
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+ u8 *cache = &ppd->qsfp_info.cache[0];
+ u8 bin_buff[QSFP_DUMP_CHUNK];
+ char lenstr[6];
+ int sofar, ret;
+ int bidx = 0;
+ u8 *atten = &cache[QSFP_ATTEN_OFFS];
+ u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+
+ sofar = 0;
+ lenstr[0] = ' ';
+ lenstr[1] = '\0';
+
+ if (ppd->qsfp_info.cache_valid) {
+
+ if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+ sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+ pwr_codes +
+ (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+ lenstr,
+ hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+ QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+ QSFP_OUI(vendor_oui));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+ QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+ QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+ if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+ sofar += scnprintf(buf + sofar, len - sofar,
+ "Atten:%d, %d\n",
+ QSFP_ATTEN_SDR(atten),
+ QSFP_ATTEN_DDR(atten));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+ QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+ QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+ QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+ while (bidx < QSFP_DEFAULT_HDR_CNT) {
+ int iidx;
+
+ memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
+ for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
+ sofar += scnprintf(buf + sofar, len-sofar,
+ " %02X", bin_buff[iidx]);
+ }
+ sofar += scnprintf(buf + sofar, len - sofar, "\n");
+ bidx += QSFP_DUMP_CHUNK;
+ }
+ }
+ ret = sofar;
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
new file mode 100644
index 000000000000..d30c2a6baa0b
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -0,0 +1,222 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES 5
+
+/*
+ * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK (1 << 0)
+#define QSFP_HFI0_I2CDAT (1 << 1)
+#define QSFP_HFI0_RESET_N (1 << 2)
+#define QSFP_HFI0_INT_N (1 << 3)
+#define QSFP_HFI0_MODPRST_N (1 << 4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */
+/*
+ * Rest of first 128 not used, although 127 is reserved for page select
+ * if module is not "Flat memory".
+ */
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
+ * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+ oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR 0x009065
+#define QSFP_OUI_GORE 0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ * If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY 0x01
+
+#define QSFP_HIGH_TEMP_ALARM 0x80
+#define QSFP_LOW_TEMP_ALARM 0x40
+#define QSFP_HIGH_TEMP_WARNING 0x20
+#define QSFP_LOW_TEMP_WARNING 0x10
+
+#define QSFP_HIGH_VCC_ALARM 0x80
+#define QSFP_LOW_VCC_ALARM 0x40
+#define QSFP_HIGH_VCC_WARNING 0x20
+#define QSFP_LOW_VCC_WARNING 0x10
+
+#define QSFP_HIGH_POWER_ALARM 0x88
+#define QSFP_LOW_POWER_ALARM 0x44
+#define QSFP_HIGH_POWER_WARNING 0x22
+#define QSFP_LOW_POWER_WARNING 0x11
+
+#define QSFP_HIGH_BIAS_ALARM 0x88
+#define QSFP_LOW_BIAS_ALARM 0x44
+#define QSFP_HIGH_BIAS_WARNING 0x22
+#define QSFP_LOW_BIAS_WARNING 0x11
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+struct qsfp_data {
+ /* Helps to find our way */
+ struct hfi1_pportdata *ppd;
+ struct work_struct qsfp_work;
+ u8 cache[QSFP_MAX_NUM_PAGES*128];
+ spinlock_t qsfp_lock;
+ u8 check_interrupt_flags;
+ u8 qsfp_interrupt_functional;
+ u8 cache_valid;
+ u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+ struct qsfp_data *cp);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+ u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
new file mode 100644
index 000000000000..632dd5ba7dfd
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -0,0 +1,2426 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "sdma.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static void rc_timeout(unsigned long arg);
+
+static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ hfi1_skip_sge(ss, len, 0);
+ return wqe->length - len;
+}
+
+static void start_timer(struct hfi1_qp *qp)
+{
+ qp->s_flags |= HFI1_S_TIMER;
+ qp->s_timer.function = rc_timeout;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ qp->s_timer.expires = jiffies + qp->timeout_jiffies;
+ add_timer(&qp->s_timer);
+}
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp,
+ struct hfi1_other_headers *ohdr, u32 pmtu)
+{
+ struct hfi1_ack_entry *e;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+ int middle = 0;
+
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ switch (qp->s_ack_state) {
+ case OP(RDMA_READ_RESPONSE_LAST):
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->rdma_sge.mr) {
+ hfi1_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ /* FALLTHROUGH */
+ case OP(ATOMIC_ACKNOWLEDGE):
+ /*
+ * We can increment the tail pointer now that the last
+ * response has been sent instead of only being
+ * constructed.
+ */
+ if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+ qp->s_tail_ack_queue = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_ONLY):
+ case OP(ACKNOWLEDGE):
+ /* Check for no next entry in the queue. */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+ if (qp->s_flags & HFI1_S_ACK_PENDING)
+ goto normal;
+ goto bail;
+ }
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST)) {
+ /*
+ * If a RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ qp->s_rdma_mr = e->rdma_sge.mr;
+ if (qp->s_rdma_mr)
+ hfi1_get_mr(qp->s_rdma_mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+ } else {
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+ e->sent = 1;
+ }
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_rdma_psn = e->psn;
+ bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else {
+ /* COMPARE_SWAP or FETCH_ADD */
+ qp->s_cur_sge = NULL;
+ len = 0;
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth[0] =
+ cpu_to_be32(e->atomic_data >> 32);
+ ohdr->u.at.atomic_ack_eth[1] =
+ cpu_to_be32(e->atomic_data);
+ hwords += sizeof(ohdr->u.at) / sizeof(u32);
+ bth2 = mask_psn(e->psn);
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+ if (qp->s_rdma_mr)
+ hfi1_get_mr(qp->s_rdma_mr);
+ len = qp->s_ack_rdma_sge.sge.sge_length;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ } else {
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ break;
+
+ default:
+normal:
+ /*
+ * Send a regular ACK.
+ * Set the s_ack_state so we wait until after sending
+ * the ACK before setting s_ack_state to ACKNOWLEDGE
+ * (see above).
+ */
+ qp->s_ack_state = OP(SEND_ONLY);
+ qp->s_flags &= ~HFI1_S_ACK_PENDING;
+ qp->s_cur_sge = NULL;
+ if (qp->s_nak_state)
+ ohdr->u.aeth =
+ cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+ (qp->s_nak_state <<
+ HFI1_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ len = 0;
+ bth0 = OP(ACKNOWLEDGE) << 24;
+ bth2 = mask_psn(qp->s_ack_psn);
+ }
+ qp->s_rdma_ack_cnt++;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle);
+ return 1;
+
+bail:
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+ * HFI1_S_RESP_PENDING
+ */
+ smp_wmb();
+ qp->s_flags &= ~(HFI1_S_RESP_PENDING
+ | HFI1_S_ACK_PENDING
+ | HFI1_S_AHG_VALID);
+ return 0;
+}
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_rc_req(struct hfi1_qp *qp)
+{
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_sge_state *ss;
+ struct hfi1_swqe *wqe;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len;
+ u32 bth0 = 0;
+ u32 bth2;
+ u32 pmtu = qp->pmtu;
+ char newreq;
+ unsigned long flags;
+ int ret = 0;
+ int middle = 0;
+ int delta;
+
+ ohdr = &qp->s_hdr->ibh.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+ /*
+ * The lock is needed to synchronize between the sending tasklet,
+ * the receive interrupt handler, and timeout re-sends.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Sending responses has higher priority over sending requests. */
+ if ((qp->s_flags & HFI1_S_RESP_PENDING) &&
+ make_rc_ack(dev, qp, ohdr, pmtu))
+ goto done;
+
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_iowait.sdma_busy)) {
+ qp->s_flags |= HFI1_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done;
+ }
+
+ if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK))
+ goto bail;
+
+ if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+ if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+ qp->s_flags |= HFI1_S_WAIT_PSN;
+ goto bail;
+ }
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ }
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head) {
+ clear_ahg(qp);
+ goto bail;
+ }
+ /*
+ * If a fence is requested, wait for previous
+ * RDMA read and atomic operations to finish.
+ */
+ if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ qp->s_num_rd_atomic) {
+ qp->s_flags |= HFI1_S_WAIT_FENCE;
+ goto bail;
+ }
+ wqe->psn = qp->s_next_psn;
+ newreq = 1;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = mask_psn(qp->s_psn);
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ /* If no credit, return. */
+ if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+ cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+ cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / sizeof(u32);
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= HFI1_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ /*
+ * Adjust s_next_psn to count the
+ * expected number of responses.
+ */
+ if (len > pmtu)
+ qp->s_next_psn += (len - 1) / pmtu;
+ wqe->lpsn = qp->s_next_psn++;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= HFI1_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ wqe->lpsn = wqe->psn;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_state = OP(COMPARE_SWAP);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.swap);
+ ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ } else {
+ qp->s_state = OP(FETCH_ADD);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ ohdr->u.atomic_eth.compare_data = 0;
+ }
+ ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr >> 32);
+ ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr);
+ ohdr->u.atomic_eth.rkey = cpu_to_be32(
+ wqe->wr.wr.atomic.rkey);
+ hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ if (newreq) {
+ qp->s_tail++;
+ if (qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_psn = wqe->lpsn + 1;
+ else {
+ qp->s_psn++;
+ if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ }
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+ * thread to indicate a SEND needs to be restarted from an
+ * earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ bth2 = mask_psn(qp->s_psn++);
+ if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+ * thread to indicate a RDMA write needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ bth2 = mask_psn(qp->s_psn++);
+ if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+ * thread to indicate a RDMA read needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ len = 0;
+ qp->s_cur++;
+ if (qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_sending_hpsn = bth2;
+ delta = delta_psn(bth2, wqe->psn);
+ if (delta && delta % HFI1_PSN_CREDIT == 0)
+ bth2 |= IB_BTH_REQ_ACK;
+ if (qp->s_flags & HFI1_S_SEND_ONE) {
+ qp->s_flags &= ~HFI1_S_SEND_ONE;
+ qp->s_flags |= HFI1_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(
+ qp,
+ ohdr,
+ bth0 | (qp->s_state << 24),
+ bth2,
+ middle);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp,
+ int is_fecn)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 pbc, pbc_flags = 0;
+ u16 lrh0;
+ u16 sc5;
+ u32 bth0;
+ u32 hwords;
+ u32 vl, plen;
+ struct send_context *sc;
+ struct pio_buf *pbuf;
+ struct hfi1_ib_header hdr;
+ struct hfi1_other_headers *ohdr;
+
+ /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+ if (qp->s_flags & HFI1_S_RESP_PENDING)
+ goto queue_ack;
+
+ /* Ensure s_rdma_ack_cnt changes are committed */
+ smp_read_barrier_depends();
+ if (qp->s_rdma_ack_cnt)
+ goto queue_ack;
+
+ /* Construct the header */
+ /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+ hwords = 6;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+ &qp->remote_ah_attr.grh, hwords, 0);
+ ohdr = &hdr.u.l.oth;
+ lrh0 = HFI1_LRH_GRH;
+ } else {
+ ohdr = &hdr.u.oth;
+ lrh0 = HFI1_LRH_BTH;
+ }
+ /* read pkey_index w/o lock (its atomic) */
+ bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ if (qp->r_nak_state)
+ ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+ (qp->r_nak_state <<
+ HFI1_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+ lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+ ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+ /* Don't try to send ACKs if the link isn't ACTIVE */
+ if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+ return;
+
+ sc = rcd->sc;
+ plen = 2 /* PBC */ + hwords;
+ vl = sc_to_vlt(ppd->dd, sc5);
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+ pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+ if (!pbuf) {
+ /*
+ * We have no room to send at the moment. Pass
+ * responsibility for sending the ACK to the send tasklet
+ * so that when enough buffer space becomes available,
+ * the ACK is sent ahead of other outgoing packets.
+ */
+ goto queue_ack;
+ }
+
+ trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+ /* write the pbc and data */
+ ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+ return;
+
+queue_ack:
+ this_cpu_inc(*ibp->rc_qacks);
+ spin_lock(&qp->s_lock);
+ qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING;
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ if (is_fecn)
+ qp->s_flags |= HFI1_S_ECN;
+
+ /* Schedule the send tasklet. */
+ hfi1_schedule_send(qp);
+ spin_unlock(&qp->s_lock);
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct hfi1_qp *qp, u32 psn)
+{
+ u32 n = qp->s_acked;
+ struct hfi1_swqe *wqe = get_swqe_ptr(qp, n);
+ u32 opcode;
+
+ qp->s_cur = n;
+
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (cmp_psn(psn, wqe->psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+
+ /* Find the work request opcode corresponding to the given PSN. */
+ opcode = wqe->wr.opcode;
+ for (;;) {
+ int diff;
+
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, n);
+ diff = cmp_psn(psn, wqe->psn);
+ if (diff < 0)
+ break;
+ qp->s_cur = n;
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (diff == 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+ opcode = wqe->wr.opcode;
+ }
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See hfi1_make_rc_req().
+ */
+ switch (opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ break;
+
+ default:
+ /*
+ * This case shouldn't happen since its only
+ * one PSN per req.
+ */
+ qp->s_state = OP(SEND_LAST);
+ }
+done:
+ qp->s_psn = psn;
+ /*
+ * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer
+ * asynchronously before the send tasklet can get scheduled.
+ * Doing it in hfi1_make_rc_req() is too late.
+ */
+ if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+ (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+ qp->s_flags |= HFI1_S_WAIT_PSN;
+ qp->s_flags &= ~HFI1_S_AHG_VALID;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait)
+{
+ struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
+ struct hfi1_ibport *ibp;
+
+ if (qp->s_retry == 0) {
+ if (qp->s_mig_state == IB_MIG_ARMED) {
+ hfi1_migrate_qp(qp);
+ qp->s_retry = qp->s_retry_cnt;
+ } else if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ return;
+ } else /* need to handle delayed completion */
+ return;
+ } else
+ qp->s_retry--;
+
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ ibp->n_rc_resends++;
+ else
+ ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+ qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR |
+ HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN |
+ HFI1_S_WAIT_ACK);
+ if (wait)
+ qp->s_flags |= HFI1_S_SEND_ONE;
+ reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rc_timeout(unsigned long arg)
+{
+ struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+ struct hfi1_ibport *ibp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qp->s_flags & HFI1_S_TIMER) {
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ibp->n_rc_timeouts++;
+ qp->s_flags &= ~HFI1_S_TIMER;
+ del_timer(&qp->s_timer);
+ restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+ struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & HFI1_S_WAIT_RNR) {
+ qp->s_flags &= ~HFI1_S_WAIT_RNR;
+ del_timer(&qp->s_timer);
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct hfi1_qp *qp, u32 psn)
+{
+ struct hfi1_swqe *wqe;
+ u32 n = qp->s_last;
+
+ /* Find the work request corresponding to the given PSN. */
+ for (;;) {
+ wqe = get_swqe_ptr(qp, n);
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_sending_psn = wqe->lpsn + 1;
+ else
+ qp->s_sending_psn = psn + 1;
+ break;
+ }
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ }
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr)
+{
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_swqe *wqe;
+ struct ib_wc wc;
+ unsigned i;
+ u32 opcode;
+ u32 psn;
+
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ /* Find out where the BTH is */
+ if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ WARN_ON(!qp->s_rdma_ack_cnt);
+ qp->s_rdma_ack_cnt--;
+ return;
+ }
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ reset_sending_psn(qp, psn);
+
+ /*
+ * Start timer after a packet requesting an ACK has been sent and
+ * there are still requests that haven't been acked.
+ */
+ if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ !(qp->s_flags &
+ (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) &&
+ (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+ start_timer(qp);
+
+ while (qp->s_last != qp->s_acked) {
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+ break;
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct hfi1_sge *sge = &wqe->sg_list[i];
+
+ hfi1_put_mr(sge->mr);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ }
+ /*
+ * If we were waiting for sends to complete before re-sending,
+ * and they are now complete, restart sending.
+ */
+ trace_hfi1_rc_sendcomplete(qp, psn);
+ if (qp->s_flags & HFI1_S_WAIT_PSN &&
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ qp->s_flags &= ~HFI1_S_WAIT_PSN;
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ hfi1_schedule_send(qp);
+ }
+}
+
+static inline void update_last_psn(struct hfi1_qp *qp, u32 psn)
+{
+ qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp,
+ struct hfi1_swqe *wqe,
+ struct hfi1_ibport *ibp)
+{
+ struct ib_wc wc;
+ unsigned i;
+
+ /*
+ * Don't decrement refcount and don't generate a
+ * completion if the SWQE is being resent until the send
+ * is finished.
+ */
+ if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct hfi1_sge *sge = &wqe->sg_list[i];
+
+ hfi1_put_mr(sge->mr);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ } else {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ this_cpu_inc(*ibp->rc_delayed_comp);
+ /*
+ * If send progress not running attempt to progress
+ * SDMA queue.
+ */
+ if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+ struct sdma_engine *engine;
+ u8 sc5;
+
+ /* For now use sc to find engine */
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ engine = qp_to_sdma_engine(qp, sc5);
+ sdma_engine_progress_schedule(engine);
+ }
+ }
+
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, wqe->lpsn);
+
+ /*
+ * If we are completing a request which is in the process of
+ * being resent, we can stop re-sending it since we know the
+ * responder has already seen it.
+ */
+ if (qp->s_acked == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_acked = qp->s_cur;
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ if (qp->s_acked != qp->s_tail) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = wqe->psn;
+ }
+ } else {
+ if (++qp->s_acked >= qp->s_size)
+ qp->s_acked = 0;
+ if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+ qp->s_draining = 0;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ }
+ return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_ibport *ibp;
+ enum ib_wc_status status;
+ struct hfi1_swqe *wqe;
+ int ret = 0;
+ u32 ack_psn;
+ int diff;
+
+ /* Remove QP from retry timer */
+ if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+ qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request. The MSN won't include the NAK'ed
+ * request but will include an ACK'ed request(s).
+ */
+ ack_psn = psn;
+ if (aeth >> 29)
+ ack_psn--;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+ /*
+ * RDMA_READ_RESPONSE_ONLY is a special case since
+ * we want to generate completion events for everything
+ * before the RDMA read, copy the data, then generate
+ * the completion for the read.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+ diff == 0) {
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * If this request is a RDMA read or atomic, and the ACK is
+ * for a later operation, this ACK NAKs the RDMA read or
+ * atomic. In other words, only a RDMA_READ_LAST or ONLY
+ * can ACK a RDMA read and likewise for atomic ops. Note
+ * that the NAK case can only happen if relaxed ordering is
+ * used and requests are sent after an RDMA read or atomic
+ * is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+ /* Retry this request. */
+ if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) {
+ qp->r_flags |= HFI1_R_RDMAR_SEQ;
+ restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ goto bail;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ u64 *vaddr = wqe->sg_list[0].vaddr;
+ *vaddr = val;
+ }
+ if (qp->s_num_rd_atomic &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+ qp->s_num_rd_atomic--;
+ /* Restart sending task if fence is complete */
+ if ((qp->s_flags & HFI1_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(HFI1_S_WAIT_FENCE |
+ HFI1_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ } else if (qp->s_flags & HFI1_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(HFI1_S_WAIT_RDMAR |
+ HFI1_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ }
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ this_cpu_inc(*ibp->rc_acks);
+ if (qp->s_acked != qp->s_tail) {
+ /*
+ * We are expecting more ACKs so
+ * reset the re-transmit timer.
+ */
+ start_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets and
+ * continue with the next packet the receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ } else if (cmp_psn(qp->s_psn, psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = psn + 1;
+ }
+ if (qp->s_flags & HFI1_S_WAIT_ACK) {
+ qp->s_flags &= ~HFI1_S_WAIT_ACK;
+ hfi1_schedule_send(qp);
+ }
+ hfi1_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, psn);
+ ret = 1;
+ goto bail;
+
+ case 1: /* RNR NAK */
+ ibp->n_rnr_naks++;
+ if (qp->s_acked == qp->s_tail)
+ goto bail;
+ if (qp->s_flags & HFI1_S_WAIT_RNR)
+ goto bail;
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+
+ ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+ reset_psn(qp, psn);
+
+ qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK);
+ qp->s_flags |= HFI1_S_WAIT_RNR;
+ qp->s_timer.function = hfi1_rc_rnr_retry;
+ qp->s_timer.expires = jiffies + usecs_to_jiffies(
+ ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+ HFI1_AETH_CREDIT_MASK]);
+ add_timer(&qp->s_timer);
+ goto bail;
+
+ case 3: /* NAK */
+ if (qp->s_acked == qp->s_tail)
+ goto bail;
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+ switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+ HFI1_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ ibp->n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * Note that we might get a NAK in the middle of an
+ * RDMA READ response which terminates the RDMA
+ * READ.
+ */
+ restart_rc(qp, psn, 0);
+ hfi1_schedule_send(qp);
+ break;
+
+ case 1: /* Invalid Request */
+ status = IB_WC_REM_INV_REQ_ERR;
+ ibp->n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ status = IB_WC_REM_ACCESS_ERR;
+ ibp->n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ status = IB_WC_REM_OP_ERR;
+ ibp->n_other_naks++;
+class_b:
+ if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, status);
+ hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ goto bail;
+
+ default: /* 2: reserved */
+reserved:
+ /* Ignore reserved NAK codes. */
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+ struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_swqe *wqe;
+
+ /* Remove QP from retry timer */
+ if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+ qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+
+ while (cmp_psn(psn, wqe->lpsn) > 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+ break;
+ wqe = do_rc_completion(qp, wqe, ibp);
+ }
+
+ ibp->n_rdma_seq++;
+ qp->r_flags |= HFI1_R_RDMAR_SEQ;
+ restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+ struct hfi1_other_headers *ohdr,
+ void *data, u32 tlen, struct hfi1_qp *qp,
+ u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+ struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_swqe *wqe;
+ enum ib_wc_status status;
+ unsigned long flags;
+ int diff;
+ u32 pad;
+ u32 aeth;
+ u64 val;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses. */
+ if (cmp_psn(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ diff = cmp_psn(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if ((aeth >> 29) == 0)
+ hfi1_get_credit(qp, aeth);
+ }
+ goto ack_done;
+ }
+
+ /*
+ * Skip everything other than the PSN we expect, if we are waiting
+ * for a reply to a restarted RDMA read or atomic op.
+ */
+ if (qp->r_flags & HFI1_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~HFI1_R_RDMAR_SEQ;
+ }
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ status = IB_WC_SUCCESS;
+
+ switch (opcode) {
+ case OP(ACKNOWLEDGE):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+ __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+ val = ((u64) be32_to_cpu(p[0]) << 32) |
+ be32_to_cpu(p[1]);
+ } else
+ val = 0;
+ if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+ opcode != OP(RDMA_READ_RESPONSE_FIRST))
+ goto ack_done;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_middle;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /* no AETH, no ACK */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+read_middle:
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_len_err;
+ if (unlikely(pmtu >= qp->s_rdma_read_len))
+ goto ack_len_err;
+
+ /*
+ * We got a response so update the timeout.
+ * 4.096 usec. * (1 << qp->timeout)
+ */
+ qp->s_flags |= HFI1_S_TIMER;
+ mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+ if (qp->s_flags & HFI1_S_WAIT_ACK) {
+ qp->s_flags &= ~HFI1_S_WAIT_ACK;
+ hfi1_schedule_send(qp);
+ }
+
+ if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+ qp->s_retry = qp->s_retry_cnt;
+
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ */
+ qp->s_rdma_read_len -= pmtu;
+ update_last_psn(qp, psn);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+ goto bail;
+
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 0 && <= pmtu.
+ * Remember to account for ICRC (4).
+ */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto ack_len_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_last;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /* ACKs READ req. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 4)))
+ goto ack_len_err;
+read_last:
+ tlen -= hdrsize + pad + 4;
+ if (unlikely(tlen != qp->s_rdma_read_len))
+ goto ack_len_err;
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+ WARN_ON(qp->s_rdma_read_sge.num_sge);
+ (void) do_rc_ack(qp, aeth, psn,
+ OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+ goto ack_done;
+ }
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ goto ack_err;
+
+ack_seq_err:
+ rdma_seq_err(qp, ibp, psn, rcd);
+ goto ack_done;
+
+ack_len_err:
+ status = IB_WC_LOC_LEN_ERR;
+ack_err:
+ if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, status);
+ hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+ return;
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+ struct hfi1_qp *qp, u32 opcode, u32 psn, int diff,
+ struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ack_entry *e;
+ unsigned long flags;
+ u8 i, prev;
+ int old_req;
+
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if we already sent one.
+ */
+ if (!qp->r_nak_state) {
+ ibp->n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence NAK until all packets
+ * in the receive queue have been processed.
+ * Otherwise, we end up propagating congestion.
+ */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+ goto done;
+ }
+
+ /*
+ * Handle a duplicate request. Don't re-execute SEND, RDMA
+ * write or atomic op. Don't NAK errors, just silently drop
+ * the duplicate request. Note that r_sge, r_len, and
+ * r_rcv_len may be in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN but we
+ * can coalesce an outstanding duplicate ACK. We have to
+ * send the earliest so that RDMA reads can be restarted at
+ * the requester's expected PSN.
+ *
+ * First, find where this duplicate PSN falls within the
+ * ACKs previously sent.
+ * old_req is true if there is an older response that is scheduled
+ * to be sent before sending this one.
+ */
+ e = NULL;
+ old_req = 1;
+ ibp->n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ for (i = qp->r_head_ack_queue; ; i = prev) {
+ if (i == qp->s_tail_ack_queue)
+ old_req = 0;
+ if (i)
+ prev = i - 1;
+ else
+ prev = HFI1_MAX_RDMA_ATOMIC;
+ if (prev == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[prev];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (prev == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ old_req = 0;
+ break;
+ }
+ }
+ switch (opcode) {
+ case OP(RDMA_READ_REQUEST): {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+
+ /*
+ * If we didn't find the RDMA read request in the ack queue,
+ * we can ignore this request.
+ */
+ if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+ goto unlock_done;
+ /* RETH comes after BTH */
+ reth = &ohdr->u.rc.reth;
+ /*
+ * Address range must be a subset of the original
+ * request and start on pmtu boundaries.
+ * We reuse the old ack_queue slot since the requester
+ * should not back up and request an earlier PSN for the
+ * same request.
+ */
+ offset = delta_psn(psn, e->psn) * qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (unlikely(offset + len != e->rdma_sge.sge_length))
+ goto unlock_done;
+ if (e->rdma_sge.mr) {
+ hfi1_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ if (len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock_done;
+ } else {
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->psn = psn;
+ if (old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ /*
+ * If we didn't find the atomic request in the ack queue
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != (u8) opcode || old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ default:
+ /*
+ * Ignore this operation if it doesn't request an ACK
+ * or an earlier RDMA read or atomic is going to be resent.
+ */
+ if (!(psn & IB_BTH_REQ_ACK) || old_req)
+ goto unlock_done;
+ /*
+ * Resend the most recent ACK if this request is
+ * after all the previous RDMA reads and atomics.
+ */
+ if (i == qp->r_head_ack_queue) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->r_psn - 1;
+ goto send_ack;
+ }
+
+ /*
+ * Resend the RDMA read or atomic op which
+ * ACKs this duplicate request.
+ */
+ qp->s_tail_ack_queue = i;
+ break;
+ }
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_flags |= HFI1_S_RESP_PENDING;
+ qp->r_nak_state = 0;
+ hfi1_schedule_send(qp);
+
+unlock_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+
+send_ack:
+ return 0;
+}
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = hfi1_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n)
+{
+ unsigned next;
+
+ next = n + 1;
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+ u32 lqpn, u32 rqpn, u8 svc_type)
+{
+ struct opa_hfi1_cong_log_event_internal *cc_event;
+
+ if (sl >= OPA_MAX_SLS)
+ return;
+
+ spin_lock(&ppd->cc_log_lock);
+
+ ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8);
+ ppd->threshold_event_counter++;
+
+ cc_event = &ppd->cc_events[ppd->cc_log_idx++];
+ if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+ ppd->cc_log_idx = 0;
+ cc_event->lqpn = lqpn & HFI1_QPN_MASK;
+ cc_event->rqpn = rqpn & HFI1_QPN_MASK;
+ cc_event->sl = sl;
+ cc_event->svc_type = svc_type;
+ cc_event->rlid = rlid;
+ /* keep timestamp in units of 1.024 usec */
+ cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+ spin_unlock(&ppd->cc_log_lock);
+}
+
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+ u32 rqpn, u8 svc_type)
+{
+ struct cca_timer *cca_timer;
+ u16 ccti, ccti_incr, ccti_timer, ccti_limit;
+ u8 trigger_threshold;
+ struct cc_state *cc_state;
+
+ if (sl >= OPA_MAX_SLS)
+ return;
+
+ cca_timer = &ppd->cca_timer[sl];
+
+ cc_state = get_cc_state(ppd);
+
+ if (cc_state == NULL)
+ return;
+
+ /*
+ * 1) increase CCTI (for this SL)
+ * 2) select IPG (i.e., call set_link_ipg())
+ * 3) start timer
+ */
+ ccti_limit = cc_state->cct.ccti_limit;
+ ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
+ ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+ trigger_threshold =
+ cc_state->cong_setting.entries[sl].trigger_threshold;
+
+ spin_lock(&ppd->cca_timer_lock);
+
+ if (cca_timer->ccti < ccti_limit) {
+ if (cca_timer->ccti + ccti_incr <= ccti_limit)
+ cca_timer->ccti += ccti_incr;
+ else
+ cca_timer->ccti = ccti_limit;
+ set_link_ipg(ppd);
+ }
+
+ spin_unlock(&ppd->cca_timer_lock);
+
+ ccti = cca_timer->ccti;
+
+ if (!hrtimer_active(&cca_timer->hrtimer)) {
+ /* ccti_timer is in units of 1.024 usec */
+ unsigned long nsec = 1024 * ccti_timer;
+
+ hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
+ HRTIMER_MODE_REL);
+ }
+
+ if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
+ log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @rcd: the context pointer
+ * @hdr: the header of this packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct hfi1_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ u32 bth0, opcode;
+ u32 hdrsize = packet->hlen;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ int diff;
+ struct ib_reth *reth;
+ unsigned long flags;
+ u32 bth1;
+ int ret, is_fecn = 0;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+ return;
+
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+ if (bth1 & HFI1_BECN_SMASK) {
+ u16 rlid = qp->remote_ah_attr.dlid;
+ u32 lqpn, rqpn;
+
+ lqpn = qp->ibqp.qp_num;
+ rqpn = qp->remote_qpn;
+ process_becn(
+ ppd,
+ qp->remote_ah_attr.sl,
+ rlid, lqpn, rqpn,
+ IB_CC_SVCTYPE_RC);
+ }
+ is_fecn = bth1 & HFI1_FECN_SMASK;
+ }
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode = bth0 >> 24;
+
+ /*
+ * Process responses (ACKs) before anything else. Note that the
+ * packet sequence number will be for something in the send work
+ * queue rather than the expected receive packet sequence number.
+ * In other words, this QP is the requester.
+ */
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+ hdrsize, pmtu, rcd);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ /* Compute 24 bits worth of difference. */
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+ return;
+ goto send_ack;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ default:
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ goto nack_inv;
+ /*
+ * Note that it is up to the requester to not send a new
+ * RDMA read or atomic operation before receiving an ACK
+ * for the previous operation.
+ */
+ break;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+ qp_comm_est(qp);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto nack_inv;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto nack_inv;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ /* consume RWQE */
+ ret = hfi1_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST):
+ case OP(RDMA_WRITE_LAST):
+no_immediate_data:
+ wc.wc_flags = 0;
+ wc.ex.imm_data = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (bth0 >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto nack_inv;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto nack_inv;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+ hfi1_put_ss(&qp->r_sge);
+ qp->r_msn++;
+ if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ /*
+ * It seems that IB mandates the presence of an SL in a
+ * work completion only for the UD transport (see section
+ * 11.4.2 of IBTA Vol. 1).
+ *
+ * However, the way the SL is chosen below is consistent
+ * with the way that IB/qib works and is trying avoid
+ * introducing incompatibilities.
+ *
+ * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+ */
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (bth0 & IB_BTH_SOLICITED) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+ /* consume RWQE */
+ reth = &ohdr->u.rc.reth;
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto nack_acc;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_FIRST))
+ goto send_middle;
+ else if (opcode == OP(RDMA_WRITE_ONLY))
+ goto no_immediate_data;
+ ret = hfi1_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+
+ case OP(RDMA_READ_REQUEST): {
+ struct hfi1_ack_entry *e;
+ u32 len;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ hfi1_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ reth = &ohdr->u.rc.reth;
+ len = be32_to_cpu(reth->length);
+ if (len) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto nack_acc_unlck;
+ /*
+ * Update the next expected PSN. We add 1 later
+ * below, so only add the remainder here.
+ */
+ if (len > pmtu)
+ qp->r_psn += (len - 1) / pmtu;
+ } else {
+ e->rdma_sge.mr = NULL;
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= HFI1_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ struct ib_atomic_eth *ateth;
+ struct hfi1_ack_entry *e;
+ u64 vaddr;
+ atomic64_t *maddr;
+ u64 sdata;
+ u32 rkey;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ hfi1_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ ateth = &ohdr->u.atomic_eth;
+ vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+ be32_to_cpu(ateth->vaddr[1]);
+ if (unlikely(vaddr & (sizeof(u64) - 1)))
+ goto nack_inv_unlck;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc_unlck;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = be64_to_cpu(ateth->swap_data);
+ e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ be64_to_cpu(ateth->compare_data),
+ sdata);
+ hfi1_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = psn;
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= HFI1_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ default:
+ /* NAK unknown opcodes. */
+ goto nack_inv;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_ack_psn = psn;
+ qp->r_nak_state = 0;
+ /* Send an ACK if requested or required. */
+ if (psn & (1 << 31))
+ goto send_ack;
+ return;
+
+rnr_nak:
+ qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue RNR NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_op_err:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_inv_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_acc_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+ hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
+
+void hfi1_rc_hdrerr(
+ struct hfi1_ctxtdata *rcd,
+ struct hfi1_ib_header *hdr,
+ u32 rcv_flags,
+ struct hfi1_qp *qp)
+{
+ int has_grh = rcv_flags & HFI1_HAS_GRH;
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ int diff;
+ u8 opcode;
+ u32 psn;
+
+ /* Check for GRH */
+ ohdr = &hdr->u.oth;
+ if (has_grh)
+ ohdr = &hdr->u.l.oth;
+
+ opcode = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+ return;
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode >>= 24;
+
+ /* Only deal with RDMA Writes for now */
+ if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+ diff = delta_psn(psn, qp->r_psn);
+ if (!qp->r_nak_state && diff >= 0) {
+ ibp->n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence
+ * NAK until all packets
+ * in the receive queue have
+ * been processed.
+ * Otherwise, we end up
+ * propagating congestion.
+ */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= HFI1_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ } /* Out of sequence NAK */
+ } /* QP Request NAKs */
+}
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
new file mode 100644
index 000000000000..a4115288db66
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -0,0 +1,948 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "sdma.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+ 655360, /* 00: 655.36 */
+ 10, /* 01: .01 */
+ 20, /* 02 .02 */
+ 30, /* 03: .03 */
+ 40, /* 04: .04 */
+ 60, /* 05: .06 */
+ 80, /* 06: .08 */
+ 120, /* 07: .12 */
+ 160, /* 08: .16 */
+ 240, /* 09: .24 */
+ 320, /* 0A: .32 */
+ 480, /* 0B: .48 */
+ 640, /* 0C: .64 */
+ 960, /* 0D: .96 */
+ 1280, /* 0E: 1.28 */
+ 1920, /* 0F: 1.92 */
+ 2560, /* 10: 2.56 */
+ 3840, /* 11: 3.84 */
+ 5120, /* 12: 5.12 */
+ 7680, /* 13: 7.68 */
+ 10240, /* 14: 10.24 */
+ 15360, /* 15: 15.36 */
+ 20480, /* 16: 20.48 */
+ 30720, /* 17: 30.72 */
+ 40960, /* 18: 40.96 */
+ 61440, /* 19: 61.44 */
+ 81920, /* 1A: 81.92 */
+ 122880, /* 1B: 122.88 */
+ 163840, /* 1C: 163.84 */
+ 245760, /* 1D: 245.76 */
+ 327680, /* 1E: 327.68 */
+ 491520 /* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+ struct hfi1_lkey_table *rkt;
+ struct hfi1_pd *pd;
+ struct hfi1_sge_state *ss;
+
+ rkt = &to_idev(qp->ibqp.device)->lk_table;
+ pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+ ss = &qp->r_sge;
+ ss->sg_list = qp->r_sg_list;
+ qp->r_len = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ goto bad_lkey;
+ qp->r_len += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ss->total_len = qp->r_len;
+ ret = 1;
+ goto bail;
+
+bad_lkey:
+ while (j) {
+ struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+ hfi1_put_mr(sge->mr);
+ }
+ ss->num_sge = 0;
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only)
+{
+ unsigned long flags;
+ struct hfi1_rq *rq;
+ struct hfi1_rwq *wq;
+ struct hfi1_srq *srq;
+ struct hfi1_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ int ret;
+
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ if (unlikely(tail == wq->head)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after head index is read. */
+ smp_rmb();
+ wqe = get_rwqe_ptr(rq, tail);
+ /*
+ * Even though we update the tail index in memory, the verbs
+ * consumer is not supposed to post more entries until a
+ * completion is generated.
+ */
+ if (++tail >= rq->size)
+ tail = 0;
+ wq->tail = tail;
+ if (!wr_id_only && !init_sge(qp, wqe)) {
+ ret = -1;
+ goto unlock;
+ }
+ qp->r_wr_id = wqe->wr_id;
+
+ ret = 1;
+ set_bit(HFI1_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ u32 n;
+
+ /*
+ * Validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+ return ret;
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct hfi1_qp *qp)
+{
+ struct ib_event ev;
+
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = qp->alt_ah_attr.port_num;
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+ qp->s_flags |= HFI1_S_AHG_CLEAR;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+ if (!index) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ return cpu_to_be64(ppd->guid);
+ }
+ return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+ return (gid->global.interface_id == id &&
+ (gid->global.subnet_prefix == gid_prefix ||
+ gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+ int has_grh, struct hfi1_qp *qp, u32 bth0)
+{
+ __be64 guid;
+ unsigned long flags;
+ u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+ if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+ if (!has_grh) {
+ if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+ goto err;
+ if (!gid_ok(&hdr->u.l.grh.sgid,
+ qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->alt_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+ sc5, be16_to_cpu(hdr->lrh[3])))) {
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+ goto err;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_migrate_qp(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else {
+ if (!has_grh) {
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp,
+ qp->remote_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+ goto err;
+ if (!gid_ok(&hdr->u.l.grh.sgid,
+ qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->remote_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+ sc5, be16_to_cpu(hdr->lrh[3])))) {
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->port_num)
+ goto err;
+ if (qp->s_mig_state == IB_MIG_REARM &&
+ !(bth0 & IB_BTH_MIG_REQ))
+ qp->s_mig_state = IB_MIG_ARMED;
+ }
+
+ return 0;
+
+err:
+ return 1;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to
+ * forward a WQE addressed to the same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ruc_loopback(struct hfi1_qp *sqp)
+{
+ struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct hfi1_qp *qp;
+ struct hfi1_swqe *wqe;
+ struct hfi1_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+ int release;
+ int ret;
+
+ rcu_read_lock();
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+ qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) ||
+ !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= HFI1_S_BUSY;
+
+again:
+ if (sqp->s_last == sqp->s_head)
+ goto clr_busy;
+ wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+ /* Return if it is not OK to start a new work request. */
+ if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) ||
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+ ibp->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ if (sqp->ibqp.qp_type == IB_QPT_RC)
+ send_status = IB_WC_RETRY_EXC_ERR;
+ else
+ send_status = IB_WC_SUCCESS;
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof(wc));
+ send_status = IB_WC_SUCCESS;
+
+ release = 1;
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ ret = hfi1_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.num_sge = 1;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ release = 0;
+ sqp->s_sge.sg_list = NULL;
+ sqp->s_sge.num_sge = 1;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ wqe->wr.wr.atomic.remote_addr,
+ wqe->wr.wr.atomic.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = wqe->wr.wr.atomic.compare_add;
+ *(u64 *) sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ sdata, wqe->wr.wr.atomic.swap);
+ hfi1_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (!release)
+ hfi1_put_mr(sge->mr);
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= HFI1_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+ if (release)
+ hfi1_put_ss(&qp->r_sge);
+
+ if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ ibp->n_loop_pkts++;
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ hfi1_send_complete(sqp, wqe, send_status);
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ ibp->n_rnr_naks++;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK))
+ goto clr_busy;
+ sqp->s_flags |= HFI1_S_WAIT_RNR;
+ sqp->s_timer.function = hfi1_rc_rnr_retry;
+ sqp->s_timer.expires = jiffies +
+ usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]);
+ add_timer(&sqp->s_timer);
+ goto clr_busy;
+
+op_err:
+ send_status = IB_WC_REM_OP_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+inv_err:
+ send_status = IB_WC_REM_INV_REQ_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ hfi1_rc_error(qp, wc.status);
+
+serr:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ hfi1_send_complete(sqp, wqe, send_status);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+ sqp->s_flags &= ~HFI1_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+ hdr->version_tclass_flow =
+ cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+ (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+ (grh->flow_label << IB_GRH_FLOW_SHIFT));
+ hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+ /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+ hdr->next_hdr = IB_GRH_NEXT_HDR;
+ hdr->hop_limit = grh->hop_limit;
+ /* The SGID is 32-bit aligned. */
+ hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
+ hdr->sgid.global.interface_id =
+ grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
+ ibp->guids[grh->sgid_index - 1] :
+ cpu_to_be64(ppd_from_ibp(ibp)->guid);
+ hdr->dgid = grh->dgid;
+
+ /* GRH header size in 32-bit words. */
+ return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+/*
+ * free_ahg - clear ahg from QP
+ */
+void clear_ahg(struct hfi1_qp *qp)
+{
+ qp->s_hdr->ahgcount = 0;
+ qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR);
+ if (qp->s_sde)
+ sdma_ahg_free(qp->s_sde, qp->s_ahgidx);
+ qp->s_ahgidx = -1;
+ qp->s_sde = NULL;
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_hdr
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * This routine handles the AHG by allocating an ahg entry and causing the
+ * copy of the first middle.
+ *
+ * Subsequent middles use the copied entry, editing the
+ * PSN with 1 or 2 edits.
+ */
+static inline void build_ahg(struct hfi1_qp *qp, u32 npsn)
+{
+ if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR))
+ clear_ahg(qp);
+ if (!(qp->s_flags & HFI1_S_AHG_VALID)) {
+ /* first middle that needs copy */
+ if (qp->s_ahgidx < 0) {
+ if (!qp->s_sde)
+ qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
+ qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde);
+ }
+ if (qp->s_ahgidx >= 0) {
+ qp->s_ahgpsn = npsn;
+ qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+ /* save to protect a change in another thread */
+ qp->s_hdr->sde = qp->s_sde;
+ qp->s_hdr->ahgidx = qp->s_ahgidx;
+ qp->s_flags |= HFI1_S_AHG_VALID;
+ }
+ } else {
+ /* subsequent middle after valid */
+ if (qp->s_ahgidx >= 0) {
+ qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+ qp->s_hdr->ahgidx = qp->s_ahgidx;
+ qp->s_hdr->ahgcount++;
+ qp->s_hdr->ahgdesc[0] =
+ sdma_build_ahg_descriptor(
+ (__force u16)cpu_to_be16((u16)npsn),
+ BTH2_OFFSET,
+ 16,
+ 16);
+ if ((npsn & 0xffff0000) !=
+ (qp->s_ahgpsn & 0xffff0000)) {
+ qp->s_hdr->ahgcount++;
+ qp->s_hdr->ahgdesc[1] =
+ sdma_build_ahg_descriptor(
+ (__force u16)cpu_to_be16(
+ (u16)(npsn >> 16)),
+ BTH2_OFFSET,
+ 0,
+ 16);
+ }
+ }
+ }
+}
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+ u32 bth0, u32 bth2, int middle)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ u16 lrh0;
+ u32 nwords;
+ u32 extra_bytes;
+ u8 sc5;
+ u32 bth1;
+
+ /* Construct the header. */
+ extra_bytes = -qp->s_cur_size & 3;
+ nwords = (qp->s_cur_size + extra_bytes) >> 2;
+ lrh0 = HFI1_LRH_BTH;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = HFI1_LRH_GRH;
+ middle = 0;
+ }
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+ qp->s_sc = sc5;
+ /*
+ * reset s_hdr/AHG fields
+ *
+ * This insures that the ahgentry/ahgcount
+ * are at a non-AHG default to protect
+ * build_verbs_tx_desc() from using
+ * an include ahgidx.
+ *
+ * build_ahg() will modify as appropriate
+ * to use the AHG feature.
+ */
+ qp->s_hdr->tx_flags = 0;
+ qp->s_hdr->ahgcount = 0;
+ qp->s_hdr->ahgidx = 0;
+ qp->s_hdr->sde = NULL;
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ else
+ middle = 0;
+ if (middle)
+ build_ahg(qp, bth2);
+ else
+ qp->s_flags &= ~HFI1_S_AHG_VALID;
+ qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr->ibh.lrh[2] =
+ cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+ qp->remote_ah_attr.src_path_bits);
+ bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ bth1 = qp->remote_qpn;
+ if (qp->s_flags & HFI1_S_ECN) {
+ qp->s_flags &= ~HFI1_S_ECN;
+ /* we recently received a FECN, so return a BECN */
+ bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+ }
+ ohdr->bth[1] = cpu_to_be32(bth1);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted. Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void hfi1_do_send(struct work_struct *work)
+{
+ struct iowait *wait = container_of(work, struct iowait, iowork);
+ struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ int (*make_req)(struct hfi1_qp *qp);
+ unsigned long flags;
+
+ if ((qp->ibqp.qp_type == IB_QPT_RC ||
+ qp->ibqp.qp_type == IB_QPT_UC) &&
+ !loopback &&
+ (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
+ ruc_loopback(qp);
+ return;
+ }
+
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ make_req = hfi1_make_rc_req;
+ else if (qp->ibqp.qp_type == IB_QPT_UC)
+ make_req = hfi1_make_uc_req;
+ else
+ make_req = hfi1_make_ud_req;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_ok(qp)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+ }
+
+ qp->s_flags |= HFI1_S_BUSY;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
+ qp->s_cur_sge, qp->s_cur_size))
+ break;
+ /* Record that s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+ } while (make_req(qp));
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+ unsigned i;
+
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct hfi1_sge *sge = &wqe->sg_list[i];
+
+ hfi1_put_mr(sge->mr);
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+
+ /* See ch. 11.2.4.1 and 10.7.3.1 */
+ if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = status;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.qp = &qp->ibqp;
+ if (status == IB_WC_SUCCESS)
+ wc.byte_len = wqe->length;
+ hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+ status != IB_WC_SUCCESS);
+ }
+
+ last = qp->s_last;
+ old_last = last;
+ if (++last >= qp->s_size)
+ last = 0;
+ qp->s_last = last;
+ if (qp->s_acked == old_last)
+ qp->s_acked = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
new file mode 100644
index 000000000000..aecd1a74741c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.c
@@ -0,0 +1,2962 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* must be a power of 2 >= 64 <= 32768 */
+#define SDMA_DESCQ_CNT 1024
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for a SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+ (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE (1U << 0)
+#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
+#define SDMA_SENDCTRL_OP_HALT (1U << 2)
+#define SDMA_SENDCTRL_OP_CLEANUP (1U << 3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = {
+ [sdma_state_s00_hw_down] = "s00_HwDown",
+ [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait",
+ [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+ [sdma_state_s20_idle] = "s20_Idle",
+ [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
+ [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
+ [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait",
+ [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait",
+ [sdma_state_s80_hw_freeze] = "s80_HwFreeze",
+ [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean",
+ [sdma_state_s99_running] = "s99_Running",
+};
+
+static const char * const sdma_event_names[] = {
+ [sdma_event_e00_go_hw_down] = "e00_GoHwDown",
+ [sdma_event_e10_go_hw_start] = "e10_GoHwStart",
+ [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+ [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+ [sdma_event_e30_go_running] = "e30_GoRunning",
+ [sdma_event_e40_sw_cleaned] = "e40_SwCleaned",
+ [sdma_event_e50_hw_cleaned] = "e50_HwCleaned",
+ [sdma_event_e60_hw_halted] = "e60_HwHalted",
+ [sdma_event_e70_go_idle] = "e70_GoIdle",
+ [sdma_event_e80_hw_freeze] = "e80_HwFreeze",
+ [sdma_event_e81_hw_frozen] = "e81_HwFrozen",
+ [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze",
+ [sdma_event_e85_link_down] = "e85_LinkDown",
+ [sdma_event_e90_sw_halted] = "e90_SwHalted",
+};
+
+static const struct sdma_set_state_action sdma_action_table[] = {
+ [sdma_state_s00_hw_down] = {
+ .go_s99_running_tofalse = 1,
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s10_hw_start_up_halt_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 1,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s15_hw_start_up_clean_wait] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 1,
+ },
+ [sdma_state_s20_idle] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s30_sw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s40_hw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 1,
+ },
+ [sdma_state_s50_hw_halt_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s60_idle_halt_wait] = {
+ .go_s99_running_tofalse = 1,
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 1,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s80_hw_freeze] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s82_freeze_sw_clean] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s99_running] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ .go_s99_running_totrue = 1,
+ },
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_start_sw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+ struct sdma_engine *sde,
+ enum sdma_events event);
+static void __sdma_process_event(
+ struct sdma_engine *sde,
+ enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - return state string from enum
+ * @state: state
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+ return sdma_state_names[state];
+}
+
+static void sdma_get(struct sdma_state *ss)
+{
+ kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+ struct sdma_state *ss =
+ container_of(kref, struct sdma_state, kref);
+
+ complete(&ss->comp);
+}
+
+static void sdma_put(struct sdma_state *ss)
+{
+ kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct sdma_state *ss)
+{
+ sdma_put(ss);
+ wait_for_completion(&ss->comp);
+}
+
+static inline void write_sde_csr(
+ struct sdma_engine *sde,
+ u32 offset0,
+ u64 value)
+{
+ write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
+}
+
+static inline u64 read_sde_csr(
+ struct sdma_engine *sde,
+ u32 offset0)
+{
+ return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
+ * sdma engine 'sde' to drop to 0.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+ int pause)
+{
+ u64 off = 8 * sde->this_idx;
+ struct hfi1_devdata *dd = sde->dd;
+ int lcnt = 0;
+
+ while (1) {
+ u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
+
+ reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+ reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+ if (reg == 0)
+ break;
+ if (lcnt++ > 100) {
+ dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n",
+ __func__, sde->this_idx, (u32)reg);
+ break;
+ }
+ udelay(1);
+ }
+}
+
+/*
+ * sdma_wait() - wait for packet egress to complete for all SDMA engines,
+ * and pause for credit return.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->num_sdma; i++) {
+ struct sdma_engine *sde = &dd->per_sdma[i];
+
+ sdma_wait_for_packet_egress(sde, 0);
+ }
+}
+
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+ u64 reg;
+
+ if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
+ return;
+ reg = cnt;
+ reg &= SD(DESC_CNT_CNT_MASK);
+ reg <<= SD(DESC_CNT_CNT_SHIFT);
+ write_sde_csr(sde, SD(DESC_CNT), reg);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ * state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+ struct sdma_txreq *txp, *txp_next;
+ LIST_HEAD(flushlist);
+
+ /* flush from head to tail */
+ sdma_flush_descq(sde);
+ spin_lock(&sde->flushlist_lock);
+ /* copy flush list */
+ list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
+ list_del_init(&txp->list);
+ list_add_tail(&txp->list, &flushlist);
+ }
+ spin_unlock(&sde->flushlist_lock);
+ /* flush from flush list */
+ list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
+ int drained = 0;
+ /* protect against complete modifying */
+ struct iowait *wait = txp->wait;
+
+ list_del_init(&txp->list);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ trace_hfi1_sdma_out_sn(sde, txp->sn);
+ if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+ dd_dev_err(sde->dd, "expected %llu got %llu\n",
+ sde->head_sn, txp->sn);
+ sde->head_sn++;
+#endif
+ sdma_txclean(sde->dd, txp);
+ if (wait)
+ drained = atomic_dec_and_test(&wait->sdma_busy);
+ if (txp->complete)
+ (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
+ if (wait && drained)
+ iowait_drain_wakeup(wait);
+ }
+}
+
+/*
+ * Fields a work request for flushing the descq ring
+ * and the flush list
+ *
+ * If the engine has been brought to running during
+ * the scheduling delay, the flush is ignored, assuming
+ * that the process of bringing the engine to running
+ * would have done this flush prior to going to running.
+ *
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+ unsigned long flags;
+ struct sdma_engine *sde =
+ container_of(work, struct sdma_engine, flush_worker);
+
+ write_seqlock_irqsave(&sde->head_lock, flags);
+ if (!__sdma_running(sde))
+ sdma_flush(sde);
+ write_sequnlock_irqrestore(&sde->head_lock, flags);
+}
+
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+ struct sdma_engine *sde = container_of(work, struct sdma_engine,
+ err_halt_worker);
+ u64 statuscsr;
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+ while (1) {
+ statuscsr = read_sde_csr(sde, SD(STATUS));
+ statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
+ if (statuscsr)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(sde->dd,
+ "SDMA engine %d - timeout waiting for engine to halt\n",
+ sde->this_idx);
+ /*
+ * Continue anyway. This could happen if there was
+ * an uncorrectable error in the wrong spot.
+ */
+ break;
+ }
+ usleep_range(80, 120);
+ }
+
+ sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
+static void sdma_start_err_halt_wait(struct sdma_engine *sde)
+{
+ schedule_work(&sde->err_halt_worker);
+}
+
+
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+ if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
+
+ unsigned index;
+ struct hfi1_devdata *dd = sde->dd;
+
+ for (index = 0; index < dd->num_sdma; index++) {
+ struct sdma_engine *curr_sdma = &dd->per_sdma[index];
+
+ if (curr_sdma != sde)
+ curr_sdma->progress_check_head =
+ curr_sdma->descq_head;
+ }
+ dd_dev_err(sde->dd,
+ "SDMA engine %d - check scheduled\n",
+ sde->this_idx);
+ mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+ }
+}
+
+static void sdma_err_progress_check(unsigned long data)
+{
+ unsigned index;
+ struct sdma_engine *sde = (struct sdma_engine *)data;
+
+ dd_dev_err(sde->dd, "SDE progress check event\n");
+ for (index = 0; index < sde->dd->num_sdma; index++) {
+ struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
+ unsigned long flags;
+
+ /* check progress on each engine except the current one */
+ if (curr_sde == sde)
+ continue;
+ /*
+ * We must lock interrupts when acquiring sde->lock,
+ * to avoid a deadlock if interrupt triggers and spins on
+ * the same lock on same CPU
+ */
+ spin_lock_irqsave(&curr_sde->tail_lock, flags);
+ write_seqlock(&curr_sde->head_lock);
+
+ /* skip non-running queues */
+ if (curr_sde->state.current_state != sdma_state_s99_running) {
+ write_sequnlock(&curr_sde->head_lock);
+ spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+ continue;
+ }
+
+ if ((curr_sde->descq_head != curr_sde->descq_tail) &&
+ (curr_sde->descq_head ==
+ curr_sde->progress_check_head))
+ __sdma_process_event(curr_sde,
+ sdma_event_e90_sw_halted);
+ write_sequnlock(&curr_sde->head_lock);
+ spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+ }
+ schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+ struct sdma_engine *sde = (struct sdma_engine *) opaque;
+ u64 statuscsr;
+
+ while (1) {
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__,
+ __func__);
+#endif
+ statuscsr = read_sde_csr(sde, SD(STATUS));
+ statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
+ if (statuscsr)
+ break;
+ udelay(10);
+ }
+
+ sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+ smp_read_barrier_depends(); /* see sdma_update_tail() */
+ return sde->tx_ring[sde->tx_head & sde->sdma_mask];
+}
+
+/*
+ * flush ring for recovery
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+ u16 head, tail;
+ int progress = 0;
+ struct sdma_txreq *txp = get_txhead(sde);
+
+ /* The reason for some of the complexity of this code is that
+ * not all descriptors have corresponding txps. So, we have to
+ * be able to skip over descs until we wander into the range of
+ * the next txp on the list.
+ */
+ head = sde->descq_head & sde->sdma_mask;
+ tail = sde->descq_tail & sde->sdma_mask;
+ while (head != tail) {
+ /* advance head, wrap if needed */
+ head = ++sde->descq_head & sde->sdma_mask;
+ /* if now past this txp's descs, do the callback */
+ if (txp && txp->next_descq_idx == head) {
+ int drained = 0;
+ /* protect against complete modifying */
+ struct iowait *wait = txp->wait;
+
+ /* remove from list */
+ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+ if (wait)
+ drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ trace_hfi1_sdma_out_sn(sde, txp->sn);
+ if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+ dd_dev_err(sde->dd, "expected %llu got %llu\n",
+ sde->head_sn, txp->sn);
+ sde->head_sn++;
+#endif
+ sdma_txclean(sde->dd, txp);
+ trace_hfi1_sdma_progress(sde, head, tail, txp);
+ if (txp->complete)
+ (*txp->complete)(
+ txp,
+ SDMA_TXREQ_S_ABORTED,
+ drained);
+ if (wait && drained)
+ iowait_drain_wakeup(wait);
+ /* see if there is another txp */
+ txp = get_txhead(sde);
+ }
+ progress++;
+ }
+ if (progress)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+ struct sdma_engine *sde = (struct sdma_engine *) opaque;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+
+ /*
+ * At this point, the following should always be true:
+ * - We are halted, so no more descriptors are getting retired.
+ * - We are not running, so no one is submitting new work.
+ * - Only we can send the e40_sw_cleaned, so we can't start
+ * running again until we say so. So, the active list and
+ * descq are ours to play with.
+ */
+
+
+ /*
+ * In the error clean up sequence, software clean must be called
+ * before the hardware clean so we can use the hardware head in
+ * the progress routine. A hardware clean or SPC unfreeze will
+ * reset the hardware head.
+ *
+ * Process all retired requests. The progress routine will use the
+ * latest physical hardware head - we are not running so speed does
+ * not matter.
+ */
+ sdma_make_progress(sde, 0);
+
+ sdma_flush(sde);
+
+ /*
+ * Reset our notion of head and tail.
+ * Note that the HW registers have been reset via an earlier
+ * clean up.
+ */
+ sde->descq_tail = 0;
+ sde->descq_head = 0;
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ *sde->head_dma = 0;
+
+ __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+ struct sdma_state *ss = &sde->state;
+
+ /* Releasing this reference means the state machine has stopped. */
+ sdma_put(ss);
+
+ /* stop waiting for all unfreeze events to complete */
+ atomic_set(&sde->dd->sdma_unfreeze_count, -1);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+}
+
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+ tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
+}
+
+static void sdma_start_sw_clean_up(struct sdma_engine *sde)
+{
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+}
+
+static void sdma_set_state(struct sdma_engine *sde,
+ enum sdma_states next_state)
+{
+ struct sdma_state *ss = &sde->state;
+ const struct sdma_set_state_action *action = sdma_action_table;
+ unsigned op = 0;
+
+ trace_hfi1_sdma_state(
+ sde,
+ sdma_state_names[ss->current_state],
+ sdma_state_names[next_state]);
+
+ /* debugging bookkeeping */
+ ss->previous_state = ss->current_state;
+ ss->previous_op = ss->current_op;
+ ss->current_state = next_state;
+
+ if (ss->previous_state != sdma_state_s99_running
+ && next_state == sdma_state_s99_running)
+ sdma_flush(sde);
+
+ if (action[next_state].op_enable)
+ op |= SDMA_SENDCTRL_OP_ENABLE;
+
+ if (action[next_state].op_intenable)
+ op |= SDMA_SENDCTRL_OP_INTENABLE;
+
+ if (action[next_state].op_halt)
+ op |= SDMA_SENDCTRL_OP_HALT;
+
+ if (action[next_state].op_cleanup)
+ op |= SDMA_SENDCTRL_OP_CLEANUP;
+
+ if (action[next_state].go_s99_running_tofalse)
+ ss->go_s99_running = 0;
+
+ if (action[next_state].go_s99_running_totrue)
+ ss->go_s99_running = 1;
+
+ ss->current_op = op;
+ sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+ u16 count = sdma_descq_cnt;
+
+ if (!count)
+ return SDMA_DESCQ_CNT;
+ /* count must be a power of 2 greater than 64 and less than
+ * 32768. Otherwise return default.
+ */
+ if (!is_power_of_2(count))
+ return SDMA_DESCQ_CNT;
+ if (count < 64 || count > 32768)
+ return SDMA_DESCQ_CNT;
+ return count;
+}
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ *
+ * This function returns an engine based on the selector and a vl. The
+ * mapping fields are protected by RCU.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 vl)
+{
+ struct sdma_vl_map *m;
+ struct sdma_map_elem *e;
+ struct sdma_engine *rval;
+
+ if (WARN_ON(vl > 8))
+ return NULL;
+
+ rcu_read_lock();
+ m = rcu_dereference(dd->sdma_map);
+ if (unlikely(!m)) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ e = m->map[vl & m->mask];
+ rval = e->sde[selector & e->mask];
+ rcu_read_unlock();
+
+ trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+ return rval;
+}
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ *
+ * This function returns an engine based on the selector and an sc.
+ */
+struct sdma_engine *sdma_select_engine_sc(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 sc5)
+{
+ u8 vl = sc_to_vlt(dd, sc5);
+
+ return sdma_select_engine_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+ int i;
+
+ for (i = 0; m && i < m->actual_vls; i++)
+ kfree(m->map[i]);
+ kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+ struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
+
+ sdma_map_free(m);
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VLs a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines. Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_sdma are non-power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is
+ * not changed.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+ int i, j;
+ int extra, sde_per_vl;
+ int engine = 0;
+ u8 lvl_engines[OPA_MAX_VLS];
+ struct sdma_vl_map *oldmap, *newmap;
+
+ if (!(dd->flags & HFI1_HAS_SEND_DMA))
+ return 0;
+
+ if (!vl_engines) {
+ /* truncate divide */
+ sde_per_vl = dd->num_sdma / num_vls;
+ /* extras */
+ extra = dd->num_sdma % num_vls;
+ vl_engines = lvl_engines;
+ /* add extras from last vl down */
+ for (i = num_vls - 1; i >= 0; i--, extra--)
+ vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+ }
+ /* build new map */
+ newmap = kzalloc(
+ sizeof(struct sdma_vl_map) +
+ roundup_pow_of_two(num_vls) *
+ sizeof(struct sdma_map_elem *),
+ GFP_KERNEL);
+ if (!newmap)
+ goto bail;
+ newmap->actual_vls = num_vls;
+ newmap->vls = roundup_pow_of_two(num_vls);
+ newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+ for (i = 0; i < newmap->vls; i++) {
+ /* save for wrap around */
+ int first_engine = engine;
+
+ if (i < newmap->actual_vls) {
+ int sz = roundup_pow_of_two(vl_engines[i]);
+
+ /* only allocate once */
+ newmap->map[i] = kzalloc(
+ sizeof(struct sdma_map_elem) +
+ sz * sizeof(struct sdma_engine *),
+ GFP_KERNEL);
+ if (!newmap->map[i])
+ goto bail;
+ newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+ /* assign engines */
+ for (j = 0; j < sz; j++) {
+ newmap->map[i]->sde[j] =
+ &dd->per_sdma[engine];
+ if (++engine >= first_engine + vl_engines[i])
+ /* wrap back to first engine */
+ engine = first_engine;
+ }
+ } else {
+ /* just re-use entry without allocating */
+ newmap->map[i] = newmap->map[i % num_vls];
+ }
+ engine = first_engine + vl_engines[i];
+ }
+ /* newmap in hand, save old map */
+ spin_lock_irq(&dd->sde_map_lock);
+ oldmap = rcu_dereference_protected(dd->sdma_map,
+ lockdep_is_held(&dd->sde_map_lock));
+
+ /* publish newmap */
+ rcu_assign_pointer(dd->sdma_map, newmap);
+
+ spin_unlock_irq(&dd->sde_map_lock);
+ /* success, free any old map after grace period */
+ if (oldmap)
+ call_rcu(&oldmap->list, sdma_map_rcu_callback);
+ return 0;
+bail:
+ /* free any partial allocation */
+ sdma_map_free(newmap);
+ return -ENOMEM;
+}
+
+/*
+ * Clean up allocated memory.
+ *
+ * This routine is can be called regardless of the success of sdma_init()
+ *
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+ size_t i;
+ struct sdma_engine *sde;
+
+ if (dd->sdma_pad_dma) {
+ dma_free_coherent(&dd->pcidev->dev, 4,
+ (void *)dd->sdma_pad_dma,
+ dd->sdma_pad_phys);
+ dd->sdma_pad_dma = NULL;
+ dd->sdma_pad_phys = 0;
+ }
+ if (dd->sdma_heads_dma) {
+ dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+ (void *)dd->sdma_heads_dma,
+ dd->sdma_heads_phys);
+ dd->sdma_heads_dma = NULL;
+ dd->sdma_heads_phys = 0;
+ }
+ for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+ sde = &dd->per_sdma[i];
+
+ sde->head_dma = NULL;
+ sde->head_phys = 0;
+
+ if (sde->descq) {
+ dma_free_coherent(
+ &dd->pcidev->dev,
+ sde->descq_cnt * sizeof(u64[2]),
+ sde->descq,
+ sde->descq_phys
+ );
+ sde->descq = NULL;
+ sde->descq_phys = 0;
+ }
+ if (is_vmalloc_addr(sde->tx_ring))
+ vfree(sde->tx_ring);
+ else
+ kfree(sde->tx_ring);
+ sde->tx_ring = NULL;
+ }
+ spin_lock_irq(&dd->sde_map_lock);
+ kfree(rcu_access_pointer(dd->sdma_map));
+ RCU_INIT_POINTER(dd->sdma_map, NULL);
+ spin_unlock_irq(&dd->sde_map_lock);
+ synchronize_rcu();
+ kfree(dd->per_sdma);
+ dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The code initializes each sde, its csrs. Interrupts
+ * are not required to be enabled.
+ *
+ * Returns:
+ * 0 - success, -errno on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+ unsigned this_idx;
+ struct sdma_engine *sde;
+ u16 descq_cnt;
+ void *curr_head;
+ struct hfi1_pportdata *ppd = dd->pport + port;
+ u32 per_sdma_credits;
+ uint idle_cnt = sdma_idle_cnt;
+ size_t num_engines = dd->chip_sdma_engines;
+
+ if (!HFI1_CAP_IS_KSET(SDMA)) {
+ HFI1_CAP_CLEAR(SDMA_AHG);
+ return 0;
+ }
+ if (mod_num_sdma &&
+ /* can't exceed chip support */
+ mod_num_sdma <= dd->chip_sdma_engines &&
+ /* count must be >= vls */
+ mod_num_sdma >= num_vls)
+ num_engines = mod_num_sdma;
+
+ dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+ dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+ dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+ dd->chip_sdma_mem_size);
+
+ per_sdma_credits =
+ dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE);
+
+ /* set up freeze waitqueue */
+ init_waitqueue_head(&dd->sdma_unfreeze_wq);
+ atomic_set(&dd->sdma_unfreeze_count, 0);
+
+ descq_cnt = sdma_get_descq_cnt();
+ dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+ num_engines, descq_cnt);
+
+ /* alloc memory for array of send engines */
+ dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+ if (!dd->per_sdma)
+ return -ENOMEM;
+
+ idle_cnt = ns_to_cclock(dd, idle_cnt);
+ /* Allocate memory for SendDMA descriptor FIFOs */
+ for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+ sde = &dd->per_sdma[this_idx];
+ sde->dd = dd;
+ sde->ppd = ppd;
+ sde->this_idx = this_idx;
+ sde->descq_cnt = descq_cnt;
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ sde->sdma_shift = ilog2(descq_cnt);
+ sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+ sde->descq_full_count = 0;
+
+ /* Create a mask for all 3 chip interrupt sources */
+ sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx)
+ | (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx)
+ | (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+ /* Create a mask specifically for sdma_idle */
+ sde->idle_mask =
+ (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+ /* Create a mask specifically for sdma_progress */
+ sde->progress_mask =
+ (u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx);
+ spin_lock_init(&sde->tail_lock);
+ seqlock_init(&sde->head_lock);
+ spin_lock_init(&sde->senddmactrl_lock);
+ spin_lock_init(&sde->flushlist_lock);
+ /* insure there is always a zero bit */
+ sde->ahg_bits = 0xfffffffe00000000ULL;
+
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+
+ /* set up reference counting */
+ kref_init(&sde->state.kref);
+ init_completion(&sde->state.comp);
+
+ INIT_LIST_HEAD(&sde->flushlist);
+ INIT_LIST_HEAD(&sde->dmawait);
+
+ sde->tail_csr =
+ get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
+ if (idle_cnt)
+ dd->default_desc1 =
+ SDMA_DESC1_HEAD_TO_HOST_FLAG;
+ else
+ dd->default_desc1 =
+ SDMA_DESC1_INT_REQ_FLAG;
+
+ tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+ (unsigned long)sde);
+
+ tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+ (unsigned long)sde);
+ INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+ INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+ sde->progress_check_head = 0;
+
+ init_timer(&sde->err_progress_check_timer);
+ sde->err_progress_check_timer.function =
+ sdma_err_progress_check;
+ sde->err_progress_check_timer.data = (unsigned long)sde;
+
+ sde->descq = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ descq_cnt * sizeof(u64[2]),
+ &sde->descq_phys,
+ GFP_KERNEL
+ );
+ if (!sde->descq)
+ goto bail;
+ sde->tx_ring =
+ kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+ GFP_KERNEL);
+ if (!sde->tx_ring)
+ sde->tx_ring =
+ vzalloc(
+ sizeof(struct sdma_txreq *) *
+ descq_cnt);
+ if (!sde->tx_ring)
+ goto bail;
+ }
+
+ dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+ /* Allocate memory for DMA of head registers to memory */
+ dd->sdma_heads_dma = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ dd->sdma_heads_size,
+ &dd->sdma_heads_phys,
+ GFP_KERNEL
+ );
+ if (!dd->sdma_heads_dma) {
+ dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+ goto bail;
+ }
+
+ /* Allocate memory for pad */
+ dd->sdma_pad_dma = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ sizeof(u32),
+ &dd->sdma_pad_phys,
+ GFP_KERNEL
+ );
+ if (!dd->sdma_pad_dma) {
+ dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+ goto bail;
+ }
+
+ /* assign each engine to different cacheline and init registers */
+ curr_head = (void *)dd->sdma_heads_dma;
+ for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+ unsigned long phys_offset;
+
+ sde = &dd->per_sdma[this_idx];
+
+ sde->head_dma = curr_head;
+ curr_head += L1_CACHE_BYTES;
+ phys_offset = (unsigned long)sde->head_dma -
+ (unsigned long)dd->sdma_heads_dma;
+ sde->head_phys = dd->sdma_heads_phys + phys_offset;
+ init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+ }
+ dd->flags |= HFI1_HAS_SEND_DMA;
+ dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+ dd->num_sdma = num_engines;
+ if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+ goto bail;
+ dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+ return 0;
+
+bail:
+ sdma_clean(dd, num_engines);
+ return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the running state.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+ struct sdma_engine *sde;
+ unsigned int i;
+
+ /* move all engines to running */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e30_go_running);
+ }
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the idle state.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+ struct sdma_engine *sde;
+ unsigned int i;
+
+ /* idle all engines */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e70_go_idle);
+ }
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * This routine is for kicking off the state processing for all required
+ * sdma engines. Interrupts need to be working at this point.
+ *
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+ unsigned i;
+ struct sdma_engine *sde;
+
+ /* kick off the engines state processing */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e10_go_hw_start);
+ }
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+ unsigned this_idx;
+ struct sdma_engine *sde;
+
+ for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
+ ++this_idx) {
+
+ sde = &dd->per_sdma[this_idx];
+ if (!list_empty(&sde->dmawait))
+ dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+ sde->this_idx);
+ sdma_process_event(sde, sdma_event_e00_go_hw_down);
+
+ del_timer_sync(&sde->err_progress_check_timer);
+
+ /*
+ * This waits for the state machine to exit so it is not
+ * necessary to kill the sdma_sw_clean_up_task to make sure
+ * it is not running.
+ */
+ sdma_finalput(&sde->state);
+ }
+ sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * unmap the indicated descriptor
+ */
+static inline void sdma_unmap_desc(
+ struct hfi1_devdata *dd,
+ struct sdma_desc *descp)
+{
+ switch (sdma_mapping_type(descp)) {
+ case SDMA_MAP_SINGLE:
+ dma_unmap_single(
+ &dd->pcidev->dev,
+ sdma_mapping_addr(descp),
+ sdma_mapping_len(descp),
+ DMA_TO_DEVICE);
+ break;
+ case SDMA_MAP_PAGE:
+ dma_unmap_page(
+ &dd->pcidev->dev,
+ sdma_mapping_addr(descp),
+ sdma_mapping_len(descp),
+ DMA_TO_DEVICE);
+ break;
+ }
+}
+
+/*
+ * return the mode as indicated by the first
+ * descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+ return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+ >> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * The code can be called multiple times without issue.
+ *
+ */
+void sdma_txclean(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx)
+{
+ u16 i;
+
+ if (tx->num_desc) {
+ u8 skip = 0, mode = ahg_mode(tx);
+
+ /* unmap first */
+ sdma_unmap_desc(dd, &tx->descp[0]);
+ /* determine number of AHG descriptors to skip */
+ if (mode > SDMA_AHG_APPLY_UPDATE1)
+ skip = mode >> 1;
+ for (i = 1 + skip; i < tx->num_desc; i++)
+ sdma_unmap_desc(dd, &tx->descp[i]);
+ tx->num_desc = 0;
+ }
+ kfree(tx->coalesce_buf);
+ tx->coalesce_buf = NULL;
+ /* kmalloc'ed descp */
+ if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+ tx->desc_limit = ARRAY_SIZE(tx->descs);
+ kfree(tx->descp);
+ }
+}
+
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+ struct hfi1_devdata *dd = sde->dd;
+ int use_dmahead;
+ u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+retry:
+ use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+ (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+ hwhead = use_dmahead ?
+ (u16) le64_to_cpu(*sde->head_dma) :
+ (u16) read_sde_csr(sde, SD(HEAD));
+
+ if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+ u16 cnt;
+ u16 swtail;
+ u16 swhead;
+ int sane;
+
+ swhead = sde->descq_head & sde->sdma_mask;
+ /* this code is really bad for cache line trading */
+ swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ cnt = sde->descq_cnt;
+
+ if (swhead < swtail)
+ /* not wrapped */
+ sane = (hwhead >= swhead) & (hwhead <= swtail);
+ else if (swhead > swtail)
+ /* wrapped around */
+ sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+ (hwhead <= swtail);
+ else
+ /* empty */
+ sane = (hwhead == swhead);
+
+ if (unlikely(!sane)) {
+ dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+ sde->this_idx,
+ use_dmahead ? "dma" : "kreg",
+ hwhead, swhead, swtail, cnt);
+ if (use_dmahead) {
+ /* try one more time, using csr */
+ use_dmahead = 0;
+ goto retry;
+ }
+ /* proceed as if no progress */
+ hwhead = swhead;
+ }
+ }
+ return hwhead;
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+ struct iowait *wait, *nw;
+ struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+ unsigned i, n = 0, seq;
+ struct sdma_txreq *stx;
+ struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
+ do {
+ seq = read_seqbegin(&dev->iowait_lock);
+ if (!list_empty(&sde->dmawait)) {
+ /* at least one item */
+ write_seqlock(&dev->iowait_lock);
+ /* Harvest waiters wanting DMA descriptors */
+ list_for_each_entry_safe(
+ wait,
+ nw,
+ &sde->dmawait,
+ list) {
+ u16 num_desc = 0;
+
+ if (!wait->wakeup)
+ continue;
+ if (n == ARRAY_SIZE(waits))
+ break;
+ if (!list_empty(&wait->tx_head)) {
+ stx = list_first_entry(
+ &wait->tx_head,
+ struct sdma_txreq,
+ list);
+ num_desc = stx->num_desc;
+ }
+ if (num_desc > avail)
+ break;
+ avail -= num_desc;
+ list_del_init(&wait->list);
+ waits[n++] = wait;
+ }
+ write_sequnlock(&dev->iowait_lock);
+ break;
+ }
+ } while (read_seqretry(&dev->iowait_lock, seq));
+
+ for (i = 0; i < n; i++)
+ waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
+
+/* head_lock must be held */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+ struct sdma_txreq *txp = NULL;
+ int progress = 0;
+ u16 hwhead, swhead, swtail;
+ int idle_check_done = 0;
+
+ hwhead = sdma_gethead(sde);
+
+ /* The reason for some of the complexity of this code is that
+ * not all descriptors have corresponding txps. So, we have to
+ * be able to skip over descs until we wander into the range of
+ * the next txp on the list.
+ */
+
+retry:
+ txp = get_txhead(sde);
+ swhead = sde->descq_head & sde->sdma_mask;
+ trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+ while (swhead != hwhead) {
+ /* advance head, wrap if needed */
+ swhead = ++sde->descq_head & sde->sdma_mask;
+
+ /* if now past this txp's descs, do the callback */
+ if (txp && txp->next_descq_idx == swhead) {
+ int drained = 0;
+ /* protect against complete modifying */
+ struct iowait *wait = txp->wait;
+
+ /* remove from list */
+ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+ if (wait)
+ drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ trace_hfi1_sdma_out_sn(sde, txp->sn);
+ if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+ dd_dev_err(sde->dd, "expected %llu got %llu\n",
+ sde->head_sn, txp->sn);
+ sde->head_sn++;
+#endif
+ sdma_txclean(sde->dd, txp);
+ if (txp->complete)
+ (*txp->complete)(
+ txp,
+ SDMA_TXREQ_S_OK,
+ drained);
+ if (wait && drained)
+ iowait_drain_wakeup(wait);
+ /* see if there is another txp */
+ txp = get_txhead(sde);
+ }
+ trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+ progress++;
+ }
+
+ /*
+ * The SDMA idle interrupt is not guaranteed to be ordered with respect
+ * to updates to the the dma_head location in host memory. The head
+ * value read might not be fully up to date. If there are pending
+ * descriptors and the SDMA idle interrupt fired then read from the
+ * CSR SDMA head instead to get the latest value from the hardware.
+ * The hardware SDMA head should be read at most once in this invocation
+ * of sdma_make_progress(..) which is ensured by idle_check_done flag
+ */
+ if ((status & sde->idle_mask) && !idle_check_done) {
+ swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ if (swtail != hwhead) {
+ hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+ idle_check_done = 1;
+ goto retry;
+ }
+ }
+
+ sde->last_status = status;
+ if (progress)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine. It will
+ * contain bits _only_ for this SDMA engine. It will contain at least one
+ * bit, it may contain more.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+ trace_hfi1_sdma_engine_interrupt(sde, status);
+ write_seqlock(&sde->head_lock);
+ sdma_set_desc_cnt(sde, sde->descq_cnt / 2);
+ sdma_make_progress(sde, status);
+ write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+ unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+ sde->this_idx,
+ (unsigned long long)status,
+ sdma_state_names[sde->state.current_state]);
+#endif
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+ if (status & ALL_SDMA_ENG_HALT_ERRS)
+ __sdma_process_event(sde, sdma_event_e60_hw_halted);
+ if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+ dd_dev_err(sde->dd,
+ "SDMA (%u) engine error: 0x%llx state %s\n",
+ sde->this_idx,
+ (unsigned long long)status,
+ sdma_state_names[sde->state.current_state]);
+ dump_sdma_state(sde);
+ }
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+ u64 set_senddmactrl = 0;
+ u64 clr_senddmactrl = 0;
+ unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+ sde->this_idx,
+ (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+ if (op & SDMA_SENDCTRL_OP_ENABLE)
+ set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+
+ if (op & SDMA_SENDCTRL_OP_INTENABLE)
+ set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+
+ if (op & SDMA_SENDCTRL_OP_HALT)
+ set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+
+ spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+ sde->p_senddmactrl |= set_senddmactrl;
+ sde->p_senddmactrl &= ~clr_senddmactrl;
+
+ if (op & SDMA_SENDCTRL_OP_CLEANUP)
+ write_sde_csr(sde, SD(CTRL),
+ sde->p_senddmactrl |
+ SD(CTRL_SDMA_CLEANUP_SMASK));
+ else
+ write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
+
+ spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ sdma_dumpstate(sde);
+#endif
+}
+
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ /*
+ * Set SendDmaLenGen and clear-then-set the MSB of the generation
+ * count to enable generation checking and load the internal
+ * generation counter.
+ */
+ write_sde_csr(sde, SD(LEN_GEN),
+ (sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)
+ );
+ write_sde_csr(sde, SD(LEN_GEN),
+ ((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT))
+ | (4ULL << SD(LEN_GEN_GENERATION_SHIFT))
+ );
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+ /* Commit writes to memory and advance the tail on the chip */
+ smp_wmb(); /* see get_txhead() */
+ writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+ u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ sdma_setlengen(sde);
+ sdma_update_tail(sde, 0); /* Set SendDmaTail */
+ *sde->head_dma = 0;
+
+ reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+ SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+ write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+ struct hfi1_devdata *dd = sde->dd;
+ u64 reg;
+
+ if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+ return;
+
+ reg = hfi1_pkt_base_sdma_integrity(dd);
+
+ if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+ CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+ else
+ SET_STATIC_RATE_CONTROL_SMASK(reg);
+
+ write_sde_csr(sde, SD(CHECK_ENABLE), reg);
+}
+
+
+static void init_sdma_regs(
+ struct sdma_engine *sde,
+ u32 credits,
+ uint idle_cnt)
+{
+ u8 opval, opmask;
+#ifdef CONFIG_SDMA_VERBOSITY
+ struct hfi1_devdata *dd = sde->dd;
+
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+ sdma_setlengen(sde);
+ sdma_update_tail(sde, 0); /* Set SendDmaTail */
+ write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+ write_sde_csr(sde, SD(DESC_CNT), 0);
+ write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+ write_sde_csr(sde, SD(MEMORY),
+ ((u64)credits <<
+ SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+ ((u64)(credits * sde->this_idx) <<
+ SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
+ write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+ set_sdma_integrity(sde);
+ opmask = OPCODE_CHECK_MASK_DISABLED;
+ opval = OPCODE_CHECK_VAL_DISABLED;
+ write_sde_csr(sde, SD(CHECK_OPCODE),
+ (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+ (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
+
+#ifdef CONFIG_SDMA_VERBOSITY
+
+#define sdma_dumpstate_helper0(reg) do { \
+ csr = read_csr(sde->dd, reg); \
+ dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \
+ } while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+ csr = read_sde_csr(sde, reg); \
+ dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+ #reg, sde->this_idx, csr); \
+ } while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+ csr = read_csr(sde->dd, reg + (8 * i)); \
+ dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \
+ #reg, i, csr); \
+ } while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+ u64 csr;
+ unsigned i;
+
+ sdma_dumpstate_helper(SD(CTRL));
+ sdma_dumpstate_helper(SD(STATUS));
+ sdma_dumpstate_helper0(SD(ERR_STATUS));
+ sdma_dumpstate_helper0(SD(ERR_MASK));
+ sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+ sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+ for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
+ sdma_dumpstate_helper2(CCE_INT_STATUS);
+ sdma_dumpstate_helper2(CCE_INT_MASK);
+ sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+ }
+
+ sdma_dumpstate_helper(SD(TAIL));
+ sdma_dumpstate_helper(SD(HEAD));
+ sdma_dumpstate_helper(SD(PRIORITY_THLD));
+ sdma_dumpstate_helper(SD(IDLE_CNT));
+ sdma_dumpstate_helper(SD(RELOAD_CNT));
+ sdma_dumpstate_helper(SD(DESC_CNT));
+ sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+ sdma_dumpstate_helper(SD(MEMORY));
+ sdma_dumpstate_helper0(SD(ENGINES));
+ sdma_dumpstate_helper0(SD(MEM_SIZE));
+ /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */
+ sdma_dumpstate_helper(SD(BASE_ADDR));
+ sdma_dumpstate_helper(SD(LEN_GEN));
+ sdma_dumpstate_helper(SD(HEAD_ADDR));
+ sdma_dumpstate_helper(SD(CHECK_ENABLE));
+ sdma_dumpstate_helper(SD(CHECK_VL));
+ sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+ sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+ sdma_dumpstate_helper(SD(CHECK_SLID));
+ sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif
+
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+ struct hw_sdma_desc *descq;
+ struct hw_sdma_desc *descqp;
+ u64 desc[2];
+ u64 addr;
+ u8 gen;
+ u16 len;
+ u16 head, tail, cnt;
+
+ head = sde->descq_head & sde->sdma_mask;
+ tail = sde->descq_tail & sde->sdma_mask;
+ cnt = sdma_descq_freecnt(sde);
+ descq = sde->descq;
+
+ dd_dev_err(sde->dd,
+ "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+ sde->this_idx,
+ head,
+ tail,
+ cnt,
+ !list_empty(&sde->flushlist));
+
+ /* print info for each entry in the descriptor queue */
+ while (head != tail) {
+ char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+ descqp = &sde->descq[head];
+ desc[0] = le64_to_cpu(descqp->qw[0]);
+ desc[1] = le64_to_cpu(descqp->qw[1]);
+ flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+ 'H' : '-';
+ flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+ & SDMA_DESC0_PHY_ADDR_MASK;
+ gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+ & SDMA_DESC1_GENERATION_MASK;
+ len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+ & SDMA_DESC0_BYTE_COUNT_MASK;
+ dd_dev_err(sde->dd,
+ "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+ head, flags, addr, gen, len);
+ dd_dev_err(sde->dd,
+ "\tdesc0:0x%016llx desc1 0x%016llx\n",
+ desc[0], desc[1]);
+ if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+ dd_dev_err(sde->dd,
+ "\taidx: %u amode: %u alen: %u\n",
+ (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+ >> SDMA_DESC1_HEADER_INDEX_SHIFT),
+ (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+ >> SDMA_DESC1_HEADER_MODE_SHIFT),
+ (u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK)
+ >> SDMA_DESC1_HEADER_DWS_SHIFT));
+ head++;
+ head &= sde->sdma_mask;
+ }
+}
+
+#define SDE_FMT \
+ "SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * This routine dumps the sde to the indicated seq file.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+ u16 head, tail;
+ struct hw_sdma_desc *descqp;
+ u64 desc[2];
+ u64 addr;
+ u8 gen;
+ u16 len;
+
+ head = sde->descq_head & sde->sdma_mask;
+ tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ seq_printf(s, SDE_FMT, sde->this_idx,
+ sdma_state_name(sde->state.current_state),
+ (unsigned long long)read_sde_csr(sde, SD(CTRL)),
+ (unsigned long long)read_sde_csr(sde, SD(STATUS)),
+ (unsigned long long)read_sde_csr(sde,
+ SD(ENG_ERR_STATUS)),
+ (unsigned long long)read_sde_csr(sde, SD(TAIL)),
+ tail,
+ (unsigned long long)read_sde_csr(sde, SD(HEAD)),
+ head,
+ (unsigned long long)le64_to_cpu(*sde->head_dma),
+ (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+ (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+ (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+ (unsigned long long)sde->last_status,
+ (unsigned long long)sde->ahg_bits,
+ sde->tx_tail,
+ sde->tx_head,
+ sde->descq_tail,
+ sde->descq_head,
+ !list_empty(&sde->flushlist),
+ sde->descq_full_count,
+ (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+ /* print info for each entry in the descriptor queue */
+ while (head != tail) {
+ char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+ descqp = &sde->descq[head];
+ desc[0] = le64_to_cpu(descqp->qw[0]);
+ desc[1] = le64_to_cpu(descqp->qw[1]);
+ flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+ 'H' : '-';
+ flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+ & SDMA_DESC0_PHY_ADDR_MASK;
+ gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+ & SDMA_DESC1_GENERATION_MASK;
+ len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+ & SDMA_DESC0_BYTE_COUNT_MASK;
+ seq_printf(s,
+ "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+ head, flags, addr, gen, len);
+ if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+ seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+ (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+ >> SDMA_DESC1_HEADER_INDEX_SHIFT),
+ (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+ >> SDMA_DESC1_HEADER_MODE_SHIFT));
+ head = (head + 1) & sde->sdma_mask;
+ }
+}
+
+/*
+ * add the generation number into
+ * the qw1 and return
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+ u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+ qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
+ qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ return qw1;
+}
+
+/*
+ * This routine submits the indicated tx
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+ int i;
+ u16 tail;
+ struct sdma_desc *descp = tx->descp;
+ u8 skip = 0, mode = ahg_mode(tx);
+
+ tail = sde->descq_tail & sde->sdma_mask;
+ sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+ sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+ trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+ tail, &sde->descq[tail]);
+ tail = ++sde->descq_tail & sde->sdma_mask;
+ descp++;
+ if (mode > SDMA_AHG_APPLY_UPDATE1)
+ skip = mode >> 1;
+ for (i = 1; i < tx->num_desc; i++, descp++) {
+ u64 qw1;
+
+ sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+ if (skip) {
+ /* edits don't have generation */
+ qw1 = descp->qw[1];
+ skip--;
+ } else {
+ /* replace generation with real one for non-edits */
+ qw1 = add_gen(sde, descp->qw[1]);
+ }
+ sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+ trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+ tail, &sde->descq[tail]);
+ tail = ++sde->descq_tail & sde->sdma_mask;
+ }
+ tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+ WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+ sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+ sde->desc_avail -= tx->num_desc;
+ return tail;
+}
+
+/*
+ * Check for progress
+ */
+static int sdma_check_progress(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx)
+{
+ int ret;
+
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ if (tx->num_desc <= sde->desc_avail)
+ return -EAGAIN;
+ /* pulse the head_lock */
+ if (wait && wait->sleep) {
+ unsigned seq;
+
+ seq = raw_seqcount_begin(
+ (const seqcount_t *)&sde->head_lock.seqcount);
+ ret = wait->sleep(sde, wait, tx, seq);
+ if (ret == -EAGAIN)
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ } else
+ ret = -EBUSY;
+ return ret;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring. If a iowait structure is non-NULL
+ * the packet will be queued to the list in wait.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
+ * ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx)
+{
+ int ret = 0;
+ u16 tail;
+ unsigned long flags;
+
+ /* user should have supplied entire packet */
+ if (unlikely(tx->tlen))
+ return -EINVAL;
+ tx->wait = wait;
+ spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+ if (unlikely(!__sdma_running(sde)))
+ goto unlock_noconn;
+ if (unlikely(tx->num_desc > sde->desc_avail))
+ goto nodesc;
+ tail = submit_tx(sde, tx);
+ if (wait)
+ atomic_inc(&wait->sdma_busy);
+ sdma_update_tail(sde, tail);
+unlock:
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+ return ret;
+unlock_noconn:
+ if (wait)
+ atomic_inc(&wait->sdma_busy);
+ tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+ spin_lock(&sde->flushlist_lock);
+ list_add_tail(&tx->list, &sde->flushlist);
+ spin_unlock(&sde->flushlist_lock);
+ if (wait) {
+ wait->tx_count++;
+ wait->count += tx->num_desc;
+ }
+ schedule_work(&sde->flush_worker);
+ ret = -ECOMM;
+ goto unlock;
+nodesc:
+ ret = sdma_check_progress(sde, wait, tx);
+ if (ret == -EAGAIN) {
+ ret = 0;
+ goto retry;
+ }
+ sde->descq_full_count++;
+ goto unlock;
+}
+
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list
+ * the unprocessed part of the list will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side locking.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring
+ * (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txlist(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct list_head *tx_list)
+{
+ struct sdma_txreq *tx, *tx_next;
+ int ret = 0;
+ unsigned long flags;
+ u16 tail = INVALID_TAIL;
+ int count = 0;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+ list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+ tx->wait = wait;
+ if (unlikely(!__sdma_running(sde)))
+ goto unlock_noconn;
+ if (unlikely(tx->num_desc > sde->desc_avail))
+ goto nodesc;
+ if (unlikely(tx->tlen)) {
+ ret = -EINVAL;
+ goto update_tail;
+ }
+ list_del_init(&tx->list);
+ tail = submit_tx(sde, tx);
+ count++;
+ if (tail != INVALID_TAIL &&
+ (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+ sdma_update_tail(sde, tail);
+ tail = INVALID_TAIL;
+ }
+ }
+update_tail:
+ if (wait)
+ atomic_add(count, &wait->sdma_busy);
+ if (tail != INVALID_TAIL)
+ sdma_update_tail(sde, tail);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+ return ret;
+unlock_noconn:
+ spin_lock(&sde->flushlist_lock);
+ list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+ tx->wait = wait;
+ list_del_init(&tx->list);
+ if (wait)
+ atomic_inc(&wait->sdma_busy);
+ tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+ list_add_tail(&tx->list, &sde->flushlist);
+ if (wait) {
+ wait->tx_count++;
+ wait->count += tx->num_desc;
+ }
+ }
+ spin_unlock(&sde->flushlist_lock);
+ schedule_work(&sde->flush_worker);
+ ret = -ECOMM;
+ goto update_tail;
+nodesc:
+ ret = sdma_check_progress(sde, wait, tx);
+ if (ret == -EAGAIN) {
+ ret = 0;
+ goto retry;
+ }
+ sde->descq_full_count++;
+ goto update_tail;
+}
+
+static void sdma_process_event(struct sdma_engine *sde,
+ enum sdma_events event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+
+ __sdma_process_event(sde, event);
+
+ if (sde->state.current_state == sdma_state_s99_running)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+ enum sdma_events event)
+{
+ struct sdma_state *ss = &sde->state;
+ int need_progress = 0;
+
+ /* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+ sdma_state_names[ss->current_state],
+ sdma_event_names[event]);
+#endif
+
+ switch (ss->current_state) {
+ case sdma_state_s00_hw_down:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ break;
+ case sdma_event_e30_go_running:
+ /*
+ * If down, but running requested (usually result
+ * of link up, then we need to start up.
+ * This can happen when hw down is requested while
+ * bringing the link up with traffic active on
+ * 7220, e.g. */
+ ss->go_s99_running = 1;
+ /* fall through and start dma engine */
+ case sdma_event_e10_go_hw_start:
+ /* This reference means the state machine is started */
+ sdma_get(&sde->state);
+ sdma_set_state(sde,
+ sdma_state_s10_hw_start_up_halt_wait);
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e40_sw_cleaned:
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s10_hw_start_up_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde,
+ sdma_state_s15_hw_start_up_clean_wait);
+ sdma_start_hw_clean_up(sde);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ sdma_start_err_halt_wait(sde);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s15_hw_start_up_clean_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s20_idle:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ sdma_set_state(sde, sdma_state_s99_running);
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+ sdma_start_err_halt_wait(sde);
+ break;
+ case sdma_event_e70_go_idle:
+ break;
+ case sdma_event_e85_link_down:
+ /* fall through */
+ case sdma_event_e80_hw_freeze:
+ sdma_set_state(sde, sdma_state_s80_hw_freeze);
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s30_sw_clean_up_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+ sdma_start_hw_clean_up(sde);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s40_hw_clean_up_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s50_hw_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ sdma_start_err_halt_wait(sde);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s60_idle_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ sdma_start_err_halt_wait(sde);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s80_hw_freeze:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s82_freeze_sw_clean:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ /* notify caller this engine is done cleaning */
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s99_running:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ need_progress = 1;
+ sdma_err_progress_check_schedule(sde);
+ case sdma_event_e90_sw_halted:
+ /*
+ * SW initiated halt does not perform engines
+ * progress check
+ */
+ sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+ sdma_start_err_halt_wait(sde);
+ break;
+ case sdma_event_e70_go_idle:
+ sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ /* fall through */
+ case sdma_event_e80_hw_freeze:
+ sdma_set_state(sde, sdma_state_s80_hw_freeze);
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ }
+ break;
+ }
+
+ ss->last_event = event;
+ if (need_progress)
+ sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors. There doesn't seem
+ * much point in an interim step.
+ *
+ */
+int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+ int i;
+
+ tx->descp = kmalloc_array(
+ MAX_DESC,
+ sizeof(struct sdma_desc),
+ GFP_ATOMIC);
+ if (!tx->descp)
+ return -ENOMEM;
+ tx->desc_limit = MAX_DESC;
+ /* copy ones already built */
+ for (i = 0; i < tx->num_desc; i++)
+ tx->descp[i] = tx->descs[i];
+ return 0;
+}
+
+/* Update sdes when the lmc changes */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+ struct sdma_engine *sde;
+ int i;
+ u64 sreg;
+
+ sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
+ SD(CHECK_SLID_MASK_SHIFT)) |
+ (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
+ SD(CHECK_SLID_VALUE_SHIFT));
+
+ for (i = 0; i < dd->num_sdma; i++) {
+ hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+ i, (u32)sreg);
+ sde = &dd->per_sdma[i];
+ write_sde_csr(sde, SD(CHECK_SLID), sreg);
+ }
+}
+
+/* tx not dword sized - pad */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+ int rval = 0;
+
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = _extend_sdma_tx_descs(dd, tx);
+ if (rval)
+ return rval;
+ }
+ /* finish the one just added */
+ tx->num_desc++;
+ make_tx_sdma_desc(
+ tx,
+ SDMA_MAP_NONE,
+ dd->sdma_pad_phys,
+ sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+ _sdma_close_tx(dd, tx);
+ return rval;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ */
+void _sdma_txreq_ahgadd(
+ struct sdma_txreq *tx,
+ u8 num_ahg,
+ u8 ahg_entry,
+ u32 *ahg,
+ u8 ahg_hlen)
+{
+ u32 i, shift = 0, desc = 0;
+ u8 mode;
+
+ WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
+ /* compute mode */
+ if (num_ahg == 1)
+ mode = SDMA_AHG_APPLY_UPDATE1;
+ else if (num_ahg <= 5)
+ mode = SDMA_AHG_APPLY_UPDATE2;
+ else
+ mode = SDMA_AHG_APPLY_UPDATE3;
+ tx->num_desc++;
+ /* initialize to consumed descriptors to zero */
+ switch (mode) {
+ case SDMA_AHG_APPLY_UPDATE3:
+ tx->num_desc++;
+ tx->descs[2].qw[0] = 0;
+ tx->descs[2].qw[1] = 0;
+ /* FALLTHROUGH */
+ case SDMA_AHG_APPLY_UPDATE2:
+ tx->num_desc++;
+ tx->descs[1].qw[0] = 0;
+ tx->descs[1].qw[1] = 0;
+ break;
+ }
+ ahg_hlen >>= 2;
+ tx->descs[0].qw[1] |=
+ (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+ << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+ (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+ << SDMA_DESC1_HEADER_DWS_SHIFT) |
+ (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+ << SDMA_DESC1_HEADER_MODE_SHIFT) |
+ (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+ << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+ for (i = 0; i < (num_ahg - 1); i++) {
+ if (!shift && !(i & 2))
+ desc++;
+ tx->descs[desc].qw[!!(i & 2)] |=
+ (((u64)ahg[i + 1])
+ << shift);
+ shift = (shift + 32) & 63;
+ }
+}
+
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Return:
+ * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+ int nr;
+ int oldbit;
+
+ if (!sde) {
+ trace_hfi1_ahg_allocate(sde, -EINVAL);
+ return -EINVAL;
+ }
+ while (1) {
+ nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+ if (nr > 31) {
+ trace_hfi1_ahg_allocate(sde, -ENOSPC);
+ return -ENOSPC;
+ }
+ oldbit = test_and_set_bit(nr, &sde->ahg_bits);
+ if (!oldbit)
+ break;
+ cpu_relax();
+ }
+ trace_hfi1_ahg_allocate(sde, nr);
+ return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicate AHG entry.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+ if (!sde)
+ return;
+ trace_hfi1_ahg_deallocate(sde, ahg_index);
+ if (ahg_index < 0 || ahg_index > 31)
+ return;
+ clear_bit(ahg_index, &sde->ahg_bits);
+}
+
+/*
+ * SPC freeze handling for SDMA engines. Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled. Generally an error interrupt.
+ *
+ * This event will pull the engine out of running so no more entries can be
+ * added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+ int i;
+ enum sdma_events event = link_down ? sdma_event_e85_link_down :
+ sdma_event_e80_hw_freeze;
+
+ /* set up the wait but do not wait here */
+ atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+ /* tell all engines to stop running and wait */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i], event);
+
+ /* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines. Called when the driver knows
+ * the SPC is fully frozen.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+ int i;
+ int ret;
+
+ /*
+ * Make sure all engines have moved out of the running state before
+ * continuing.
+ */
+ ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
+ atomic_read(&dd->sdma_unfreeze_count) <= 0);
+ /* interrupted or count is negative, then unloading - just exit */
+ if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
+ return;
+
+ /* set up the count for the next wait */
+ atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+ /* tell all engines that the SPC is frozen, they can start cleaning */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
+
+ /*
+ * Wait for everyone to finish software clean before exiting. The
+ * software clean will read engine CSRs, so must be completed before
+ * the next step, which will clear the engine CSRs.
+ */
+ (void) wait_event_interruptible(dd->sdma_unfreeze_wq,
+ atomic_read(&dd->sdma_unfreeze_count) <= 0);
+ /* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined. All
+ * that is left is a software clean. We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* tell all engines start freeze clean up */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i],
+ sdma_event_e82_hw_unfreeze);
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ */
+void _sdma_engine_progress_schedule(
+ struct sdma_engine *sde)
+{
+ trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+ /* assume we have selected a good cpu */
+ write_csr(sde->dd,
+ CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask);
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
new file mode 100644
index 000000000000..496086903891
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sdma.h
@@ -0,0 +1,1123 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+
+/* increased for AHG */
+#define NUM_DESC 6
+/* Hardware limit */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+
+#define SDMA_TXREQ_S_OK 0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED 2
+#define SDMA_TXREQ_S_SHUTDOWN 3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT 0x0001
+#define SDMA_TXREQ_F_AHG_COPY 0x0002
+#define SDMA_TXREQ_F_USE_AHG 0x0004
+
+#define SDMA_MAP_NONE 0
+#define SDMA_MAP_SINGLE 1
+#define SDMA_MAP_PAGE 2
+
+#define SDMA_AHG_VALUE_MASK 0xffff
+#define SDMA_AHG_VALUE_SHIFT 0
+#define SDMA_AHG_INDEX_MASK 0xf
+#define SDMA_AHG_INDEX_SHIFT 16
+#define SDMA_AHG_FIELD_LEN_MASK 0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT 20
+#define SDMA_AHG_FIELD_START_MASK 0x1f
+#define SDMA_AHG_FIELD_START_SHIFT 24
+#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ */
+#define SDMA_AHG_NO_AHG 0
+#define SDMA_AHG_COPY 1
+#define SDMA_AHG_APPLY_UPDATE1 2
+#define SDMA_AHG_APPLY_UPDATE2 3
+#define SDMA_AHG_APPLY_UPDATE3 4
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG (1ULL << 63)
+#define SDMA_DESC0_LAST_DESC_FLAG (1ULL << 62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT 48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH 14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+ ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+ (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT 0
+#define SDMA_DESC0_PHY_ADDR_WIDTH 48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+ ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+ (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+ (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT 13
+#define SDMA_DESC1_HEADER_MODE_WIDTH 3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+ (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT 8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH 5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+ (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT 4
+#define SDMA_DESC1_HEADER_DWS_WIDTH 4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+ (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT 2
+#define SDMA_DESC1_GENERATION_WIDTH 2
+#define SDMA_DESC1_GENERATION_MASK \
+ ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
+#define SDMA_DESC1_GENERATION_SMASK \
+ (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG (1ULL << 1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG (1ULL << 0)
+
+enum sdma_states {
+ sdma_state_s00_hw_down,
+ sdma_state_s10_hw_start_up_halt_wait,
+ sdma_state_s15_hw_start_up_clean_wait,
+ sdma_state_s20_idle,
+ sdma_state_s30_sw_clean_up_wait,
+ sdma_state_s40_hw_clean_up_wait,
+ sdma_state_s50_hw_halt_wait,
+ sdma_state_s60_idle_halt_wait,
+ sdma_state_s80_hw_freeze,
+ sdma_state_s82_freeze_sw_clean,
+ sdma_state_s99_running,
+};
+
+enum sdma_events {
+ sdma_event_e00_go_hw_down,
+ sdma_event_e10_go_hw_start,
+ sdma_event_e15_hw_halt_done,
+ sdma_event_e25_hw_clean_up_done,
+ sdma_event_e30_go_running,
+ sdma_event_e40_sw_cleaned,
+ sdma_event_e50_hw_cleaned,
+ sdma_event_e60_hw_halted,
+ sdma_event_e70_go_idle,
+ sdma_event_e80_hw_freeze,
+ sdma_event_e81_hw_frozen,
+ sdma_event_e82_hw_unfreeze,
+ sdma_event_e85_link_down,
+ sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action {
+ unsigned op_enable:1;
+ unsigned op_intenable:1;
+ unsigned op_halt:1;
+ unsigned op_cleanup:1;
+ unsigned go_s99_running_tofalse:1;
+ unsigned go_s99_running_totrue:1;
+};
+
+struct sdma_state {
+ struct kref kref;
+ struct completion comp;
+ enum sdma_states current_state;
+ unsigned current_op;
+ unsigned go_s99_running;
+ /* debugging/development */
+ enum sdma_states previous_state;
+ unsigned previous_op;
+ enum sdma_events last_event;
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ * to the ring
+ *
+ * - Initialization and tear down routines to buildup
+ * and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ * and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ. The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq. slabs, pre-allocated lists,
+ * and dma pools can be used. Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The txreq must be declared with the sdma_txreq first.
+ *
+ * The tx request, once initialized, is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location. It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls.. The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and nmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add an dma_addr_t memory to the
+ * tx. An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent(). For these memory locations, it
+ * is the responsibility of the user to handle that unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added. An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
+
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init(). Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
+
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring
+ */
+struct hw_sdma_desc {
+ /* private: don't use directly */
+ __le64 qw[2];
+};
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+ /* private: don't use directly */
+ u64 qw[2];
+};
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int, int);
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments. Storage is provided to within the structure.
+ * for all fragments.
+ *
+ * The storage for the descriptors are automatically extended as needed
+ * when the currently allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
+struct sdma_txreq {
+ struct list_head list;
+ /* private: */
+ struct sdma_desc *descp;
+ /* private: */
+ void *coalesce_buf;
+ /* private: */
+ struct iowait *wait;
+ /* private: */
+ callback_t complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ u64 sn;
+#endif
+ /* private: - used in coalesce/pad processing */
+ u16 packet_len;
+ /* private: - down-counted to trigger last */
+ u16 tlen;
+ /* private: flags */
+ u16 flags;
+ /* private: */
+ u16 num_desc;
+ /* private: */
+ u16 desc_limit;
+ /* private: */
+ u16 next_descq_idx;
+ /* private: */
+ struct sdma_desc descs[NUM_DESC];
+};
+
+struct verbs_txreq {
+ struct hfi1_pio_header phdr;
+ struct sdma_txreq txreq;
+ struct hfi1_qp *qp;
+ struct hfi1_swqe *wqe;
+ struct hfi1_mregion *mr;
+ struct hfi1_sge_state *ss;
+ struct sdma_engine *sde;
+ u16 hdr_dwords;
+ u16 hdr_inx;
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Accessing to non public fields are not supported
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+ /* read mostly */
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ /* private: */
+ void __iomem *tail_csr;
+ u64 imask; /* clear interrupt mask */
+ u64 idle_mask;
+ u64 progress_mask;
+ /* private: */
+ struct workqueue_struct *wq;
+ /* private: */
+ volatile __le64 *head_dma; /* DMA'ed by chip */
+ /* private: */
+ dma_addr_t head_phys;
+ /* private: */
+ struct hw_sdma_desc *descq;
+ /* private: */
+ unsigned descq_full_count;
+ struct sdma_txreq **tx_ring;
+ /* private: */
+ dma_addr_t descq_phys;
+ /* private */
+ u32 sdma_mask;
+ /* private */
+ struct sdma_state state;
+ /* private: */
+ u8 sdma_shift;
+ /* private: */
+ u8 this_idx; /* zero relative engine */
+ /* protect changes to senddmactrl shadow */
+ spinlock_t senddmactrl_lock;
+ /* private: */
+ u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */
+
+ /* read/write using tail_lock */
+ spinlock_t tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ /* private: */
+ u64 tail_sn;
+#endif
+ /* private: */
+ u32 descq_tail;
+ /* private: */
+ unsigned long ahg_bits;
+ /* private: */
+ u16 desc_avail;
+ /* private: */
+ u16 tx_tail;
+ /* private: */
+ u16 descq_cnt;
+
+ /* read/write using head_lock */
+ /* private: */
+ seqlock_t head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ /* private: */
+ u64 head_sn;
+#endif
+ /* private: */
+ u32 descq_head;
+ /* private: */
+ u16 tx_head;
+ /* private: */
+ u64 last_status;
+
+ /* private: */
+ struct list_head dmawait;
+
+ /* CONFIG SDMA for now, just blindly duplicate */
+ /* private: */
+ struct tasklet_struct sdma_hw_clean_up_task
+ ____cacheline_aligned_in_smp;
+
+ /* private: */
+ struct tasklet_struct sdma_sw_clean_up_task
+ ____cacheline_aligned_in_smp;
+ /* private: */
+ struct work_struct err_halt_worker;
+ /* private */
+ struct timer_list err_progress_check_timer;
+ u32 progress_check_head;
+ /* private: */
+ struct work_struct flush_worker;
+ spinlock_t flushlist_lock;
+ /* private: */
+ struct list_head flushlist;
+};
+
+
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @engine: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+ return sde->descq_tail == sde->descq_head;
+}
+
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+ return sde->descq_cnt -
+ (sde->descq_tail -
+ ACCESS_ONCE(sde->descq_head)) - 1;
+}
+
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+ return sde->descq_cnt - sdma_descq_freecnt(sde);
+}
+
+/*
+ * Either head_lock or tail lock required to see
+ * a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+ return engine->state.current_state == sdma_state_s99_running;
+}
+
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * sdma_running probes the internal state to determine if it is suitable
+ * for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&engine->tail_lock, flags);
+ ret = __sdma_running(engine);
+ spin_unlock_irqrestore(&engine->tail_lock, flags);
+ return ret;
+}
+
+void _sdma_txreq_ahgadd(
+ struct sdma_txreq *tx,
+ u8 num_ahg,
+ u8 ahg_entry,
+ u32 *ahg,
+ u8 ahg_hlen);
+
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use (0 - 31)
+ * @num_ahg: ahg descriptor for first descriptor (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent. This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg(). The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used. Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx, a status, and a flag.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * The flag, if the is the iowait had been used, indicates the iowait
+ * sdma_busy count has reached zero.
+ *
+ * user data portion of tlen should be precise. The sdma_txadd_* entrances
+ * will pad with a descriptor references 1 - 3 bytes when the number of bytes
+ * specified in tlen have been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header. This is for cases where the stored header is
+ * larger than the header to be used in a packet. This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * and RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+ struct sdma_txreq *tx,
+ u16 flags,
+ u16 tlen,
+ u8 ahg_entry,
+ u8 num_ahg,
+ u32 *ahg,
+ u8 ahg_hlen,
+ void (*cb)(struct sdma_txreq *, int, int))
+{
+ if (tlen == 0)
+ return -ENODATA;
+ if (tlen > MAX_SDMA_PKT_SIZE)
+ return -EMSGSIZE;
+ tx->desc_limit = ARRAY_SIZE(tx->descs);
+ tx->descp = &tx->descs[0];
+ INIT_LIST_HEAD(&tx->list);
+ tx->num_desc = 0;
+ tx->flags = flags;
+ tx->complete = cb;
+ tx->coalesce_buf = NULL;
+ tx->wait = NULL;
+ tx->tlen = tx->packet_len = tlen;
+ tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+ tx->descs[0].qw[1] = 0;
+ if (flags & SDMA_TXREQ_F_AHG_COPY)
+ tx->descs[0].qw[1] |=
+ (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+ << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+ (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+ << SDMA_DESC1_HEADER_MODE_SHIFT);
+ else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+ _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+ return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent. This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The currently supported flags is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg(). The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used. The head size of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL, will be provided this tx and a status. The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+ struct sdma_txreq *tx,
+ u16 flags,
+ u16 tlen,
+ void (*cb)(struct sdma_txreq *, int, int))
+{
+ return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+ return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+ >> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+ return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+ >> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+ return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+ >> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+ struct sdma_txreq *tx,
+ int type,
+ dma_addr_t addr,
+ size_t len)
+{
+ struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+ if (!tx->num_desc) {
+ /* qw[0] zero; qw[1] first, ahg mode already in from init */
+ desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ } else {
+ desc->qw[0] = 0;
+ desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ }
+ desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+ << SDMA_DESC0_PHY_ADDR_SHIFT) |
+ (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+ << SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int _extend_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+ struct sdma_txreq *tx)
+{
+ tx->descp[tx->num_desc].qw[0] |=
+ SDMA_DESC0_LAST_DESC_FLAG;
+ tx->descp[tx->num_desc].qw[1] |=
+ dd->default_desc1;
+ if (tx->flags & SDMA_TXREQ_F_URGENT)
+ tx->descp[tx->num_desc].qw[1] |=
+ (SDMA_DESC1_HEAD_TO_HOST_FLAG|
+ SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+ struct hfi1_devdata *dd,
+ int type,
+ struct sdma_txreq *tx,
+ dma_addr_t addr,
+ u16 len)
+{
+ int rval = 0;
+
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = _extend_sdma_tx_descs(dd, tx);
+ if (rval)
+ return rval;
+ }
+ make_tx_sdma_desc(
+ tx,
+ type,
+ addr, len);
+ WARN_ON(len > tx->tlen);
+ tx->tlen -= len;
+ /* special cases for last */
+ if (!tx->tlen) {
+ if (tx->packet_len & (sizeof(u32) - 1))
+ rval = _pad_sdma_tx_descs(dd, tx);
+ else
+ _sdma_close_tx(dd, tx);
+ }
+ tx->num_desc++;
+ return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend descriptor array or couldn't allocate coalesce
+ * buffer.
+ *
+ */
+static inline int sdma_txadd_page(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ struct page *page,
+ unsigned long offset,
+ u16 len)
+{
+ dma_addr_t addr =
+ dma_map_page(
+ &dd->pcidev->dev,
+ page,
+ offset,
+ len,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+ sdma_txclean(dd, tx);
+ return -ENOSPC;
+ }
+ return _sdma_txadd_daddr(
+ dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ dma_addr_t addr,
+ u16 len)
+{
+ return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ void *kvaddr,
+ u16 len)
+{
+ dma_addr_t addr =
+ dma_map_single(
+ &dd->pcidev->dev,
+ kvaddr,
+ len,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+ sdma_txclean(dd, tx);
+ return -ENOSPC;
+ }
+ return _sdma_txadd_daddr(
+ dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct list_head *tx_list);
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg - build ahg descriptor
+ * @data
+ * @dwindex
+ * @startbit
+ * @bits
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+ u16 data,
+ u8 dwindex,
+ u8 startbit,
+ u8 bits)
+{
+ return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+ ((startbit & SDMA_AHG_FIELD_START_MASK) <<
+ SDMA_AHG_FIELD_START_SHIFT) |
+ ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+ SDMA_AHG_FIELD_LEN_SHIFT) |
+ ((dwindex & SDMA_AHG_INDEX_MASK) <<
+ SDMA_AHG_INDEX_SHIFT) |
+ ((data & SDMA_AHG_VALUE_MASK) <<
+ SDMA_AHG_VALUE_SHIFT));
+}
+
+/**
+ * sdma_progress - use seq number of detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress. This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptor for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+ struct sdma_txreq *tx)
+{
+ if (read_seqretry(&sde->head_lock, seq)) {
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ if (tx->num_desc > sde->desc_avail)
+ return 0;
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * sdma_iowait_schedule() - initialize wait structure
+ * @sde: sdma_engine to schedule
+ * @wait: wait struct to schedule
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+static inline void sdma_iowait_schedule(
+ struct sdma_engine *sde,
+ struct iowait *wait)
+{
+ iowait_schedule(wait, sde->wq);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where there are num_sdma/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ * dd->sdma_map
+ * | sdma_map_elem[0]
+ * | +--------------------+
+ * v | mask |
+ * sdma_vl_map |--------------------|
+ * +--------------------------+ | sde[0] -> eng 1 |
+ * | list (RCU) | |--------------------|
+ * |--------------------------| ->| sde[1] -> eng 2 |
+ * | mask | --/ |--------------------|
+ * |--------------------------| -/ | * |
+ * | actual_vls (max 8) | -/ |--------------------|
+ * |--------------------------| --/ | sde[n] -> eng n |
+ * | vls (max 8) | -/ +--------------------+
+ * |--------------------------| --/
+ * | map[0] |-/
+ * |--------------------------| +--------------------+
+ * | map[1] |--- | mask |
+ * |--------------------------| \---- |--------------------|
+ * | * | \-- | sde[0] -> eng 1+n |
+ * | * | \---- |--------------------|
+ * | * | \->| sde[1] -> eng 2+n |
+ * |--------------------------| |--------------------|
+ * | map[vls - 1] |- | * |
+ * +--------------------------+ \- |--------------------|
+ * \- | sde[m] -> eng m+n |
+ * \ +--------------------+
+ * \-
+ * \
+ * \- +--------------------+
+ * \- | mask |
+ * \ |--------------------|
+ * \- | sde[0] -> eng 1+m+n|
+ * \- |--------------------|
+ * >| sde[1] -> eng 2+m+n|
+ * |--------------------|
+ * | * |
+ * |--------------------|
+ * | sde[o] -> eng o+m+n|
+ * +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @sde - array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+ u32 mask;
+ struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_map_el - mapping for a vl
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls - number of vls
+ * @vls - number of vls rounded to next power of 2
+ * @map - array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure. The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+ struct rcu_head list;
+ u32 mask;
+ u8 actual_vls;
+ u8 vls;
+ struct sdma_map_elem *map[0];
+};
+
+int sdma_map_init(
+ struct hfi1_devdata *dd,
+ u8 port,
+ u8 num_vls,
+ u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+ struct sdma_engine *sde)
+{
+ if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+ return;
+ _sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+ char *r = s;
+
+ while (*s)
+ if (*s++ == '/')
+ r = s;
+ return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/staging/rdma/hfi1/srq.c
new file mode 100644
index 000000000000..67786d417493
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/srq.c
@@ -0,0 +1,397 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct hfi1_srq *srq = to_isrq(ibsrq);
+ struct hfi1_rwq *wq;
+ unsigned long flags;
+ int ret;
+
+ for (; wr; wr = wr->next) {
+ struct hfi1_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ wq = srq->rq.wq;
+ next = wq->head + 1;
+ if (next >= srq->rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&srq->rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ */
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct hfi1_ibdev *dev = to_idev(ibpd->device);
+ struct hfi1_srq *srq;
+ u32 sz;
+ struct ib_srq *ret;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+ ret = ERR_PTR(-ENOSYS);
+ goto done;
+ }
+
+ if (srq_init_attr->attr.max_sge == 0 ||
+ srq_init_attr->attr.max_sge > hfi1_max_srq_sges ||
+ srq_init_attr->attr.max_wr == 0 ||
+ srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+ sizeof(struct hfi1_rwqe);
+ srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz);
+ if (!srq->rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_srq;
+ }
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See hfi1_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+ u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz;
+
+ srq->ip =
+ hfi1_create_mmap_info(dev, s, ibpd->uobject->context,
+ srq->rq.wq);
+ if (!srq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wq;
+ }
+
+ err = ib_copy_to_udata(udata, &srq->ip->offset,
+ sizeof(srq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ srq->ip = NULL;
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->rq.wq->head = 0;
+ srq->rq.wq->tail = 0;
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ spin_lock(&dev->n_srqs_lock);
+ if (dev->n_srqs_allocated == hfi1_max_srqs) {
+ spin_unlock(&dev->n_srqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_srqs_allocated++;
+ spin_unlock(&dev->n_srqs_lock);
+
+ if (srq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &srq->ibsrq;
+ goto done;
+
+bail_ip:
+ kfree(srq->ip);
+bail_wq:
+ vfree(srq->rq.wq);
+bail_srq:
+ kfree(srq);
+done:
+ return ret;
+}
+
+/**
+ * hfi1_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ */
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata)
+{
+ struct hfi1_srq *srq = to_isrq(ibsrq);
+ struct hfi1_rwq *wq;
+ int ret = 0;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ struct hfi1_rwq *owq;
+ struct hfi1_rwqe *p;
+ u32 sz, size, n, head, tail;
+
+ /* Check that the requested sizes are below the limits. */
+ if ((attr->max_wr > hfi1_max_srq_wrs) ||
+ ((attr_mask & IB_SRQ_LIMIT) ?
+ attr->srq_limit : srq->limit) > attr->max_wr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ sz = sizeof(struct hfi1_rwqe) +
+ srq->rq.max_sge * sizeof(struct ib_sge);
+ size = attr->max_wr + 1;
+ wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz);
+ if (!wq) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 offset_addr;
+ __u64 offset = 0;
+
+ ret = ib_copy_from_udata(&offset_addr, udata,
+ sizeof(offset_addr));
+ if (ret)
+ goto bail_free;
+ udata->outbuf =
+ (void __user *) (unsigned long) offset_addr;
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&srq->rq.lock);
+ /*
+ * validate head and tail pointer values and compute
+ * the number of remaining WQEs.
+ */
+ owq = srq->rq.wq;
+ head = owq->head;
+ tail = owq->tail;
+ if (head >= srq->rq.size || tail >= srq->rq.size) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = head;
+ if (n < tail)
+ n += srq->rq.size - tail;
+ else
+ n -= tail;
+ if (size <= n) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = 0;
+ p = wq->wq;
+ while (tail != head) {
+ struct hfi1_rwqe *wqe;
+ int i;
+
+ wqe = get_rwqe_ptr(&srq->rq, tail);
+ p->wr_id = wqe->wr_id;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct hfi1_rwqe *)((char *)p + sz);
+ if (++tail >= srq->rq.size)
+ tail = 0;
+ }
+ srq->rq.wq = wq;
+ srq->rq.size = size;
+ wq->head = n;
+ wq->tail = 0;
+ if (attr_mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+
+ vfree(owq);
+
+ if (srq->ip) {
+ struct hfi1_mmap_info *ip = srq->ip;
+ struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device);
+ u32 s = sizeof(struct hfi1_rwq) + size * sz;
+
+ hfi1_update_mmap_info(dev, ip, s, wq);
+
+ /*
+ * Return the offset to mmap.
+ * See hfi1_mmap() for details.
+ */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ /*
+ * Put user mapping info onto the pending list
+ * unless it already is on the list.
+ */
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps,
+ &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+ } else if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irq(&srq->rq.lock);
+ if (attr->srq_limit >= srq->rq.size)
+ ret = -EINVAL;
+ else
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+ }
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&srq->rq.lock);
+bail_free:
+ vfree(wq);
+bail:
+ return ret;
+}
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct hfi1_srq *srq = to_isrq(ibsrq);
+
+ attr->max_wr = srq->rq.size - 1;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+/**
+ * hfi1_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int hfi1_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct hfi1_srq *srq = to_isrq(ibsrq);
+ struct hfi1_ibdev *dev = to_idev(ibsrq->device);
+
+ spin_lock(&dev->n_srqs_lock);
+ dev->n_srqs_allocated--;
+ spin_unlock(&dev->n_srqs_lock);
+ if (srq->ip)
+ kref_put(&srq->ip->ref, hfi1_release_mmap_info);
+ else
+ vfree(srq->rq.wq);
+ kfree(srq);
+
+ return 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
new file mode 100644
index 000000000000..b78c72861ef9
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -0,0 +1,739 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+ struct cc_state *cc_state;
+
+ ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+ + sizeof(__be16);
+
+ if (pos > ret)
+ return -EINVAL;
+
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ rcu_read_lock();
+ cc_state = get_cc_state(ppd);
+ if (cc_state == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ memcpy(buf, &cc_state->cct, count);
+ rcu_read_unlock();
+
+ return count;
+}
+
+static void port_release(struct kobject *kobj)
+{
+ /* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+static struct kobj_type port_cc_ktype = {
+ .release = port_release,
+};
+
+static struct bin_attribute cc_table_bin_attr = {
+ .attr = {.name = "cc_table_bin", .mode = 0444},
+ .read = read_cc_table_bin,
+ .size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+ struct cc_state *cc_state;
+
+ ret = sizeof(struct opa_congestion_setting_attr_shadow);
+
+ if (pos > ret)
+ return -EINVAL;
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ rcu_read_lock();
+ cc_state = get_cc_state(ppd);
+ if (cc_state == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ memcpy(buf, &cc_state->cong_setting, count);
+ rcu_read_unlock();
+
+ return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+ .attr = {.name = "cc_settings_bin", .mode = 0444},
+ .read = read_cc_setting_bin,
+ .size = PAGE_SIZE,
+};
+
+/* Start sc2vl */
+#define HFI1_SC2VL_ATTR(N) \
+ static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .sc = N \
+ }
+
+struct hfi1_sc2vl_attr {
+ struct attribute attr;
+ int sc;
+};
+
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+
+static struct attribute *sc2vl_default_attributes[] = {
+ &hfi1_sc2vl_attr_0.attr,
+ &hfi1_sc2vl_attr_1.attr,
+ &hfi1_sc2vl_attr_2.attr,
+ &hfi1_sc2vl_attr_3.attr,
+ &hfi1_sc2vl_attr_4.attr,
+ &hfi1_sc2vl_attr_5.attr,
+ &hfi1_sc2vl_attr_6.attr,
+ &hfi1_sc2vl_attr_7.attr,
+ &hfi1_sc2vl_attr_8.attr,
+ &hfi1_sc2vl_attr_9.attr,
+ &hfi1_sc2vl_attr_10.attr,
+ &hfi1_sc2vl_attr_11.attr,
+ &hfi1_sc2vl_attr_12.attr,
+ &hfi1_sc2vl_attr_13.attr,
+ &hfi1_sc2vl_attr_14.attr,
+ &hfi1_sc2vl_attr_15.attr,
+ &hfi1_sc2vl_attr_16.attr,
+ &hfi1_sc2vl_attr_17.attr,
+ &hfi1_sc2vl_attr_18.attr,
+ &hfi1_sc2vl_attr_19.attr,
+ &hfi1_sc2vl_attr_20.attr,
+ &hfi1_sc2vl_attr_21.attr,
+ &hfi1_sc2vl_attr_22.attr,
+ &hfi1_sc2vl_attr_23.attr,
+ &hfi1_sc2vl_attr_24.attr,
+ &hfi1_sc2vl_attr_25.attr,
+ &hfi1_sc2vl_attr_26.attr,
+ &hfi1_sc2vl_attr_27.attr,
+ &hfi1_sc2vl_attr_28.attr,
+ &hfi1_sc2vl_attr_29.attr,
+ &hfi1_sc2vl_attr_30.attr,
+ &hfi1_sc2vl_attr_31.attr,
+ NULL
+};
+
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_sc2vl_attr *sattr =
+ container_of(attr, struct hfi1_sc2vl_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
+}
+
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+ .show = sc2vl_attr_show,
+};
+
+static struct kobj_type hfi1_sc2vl_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_sc2vl_ops,
+ .default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+#define HFI1_SL2SC_ATTR(N) \
+ static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .sl = N \
+ }
+
+struct hfi1_sl2sc_attr {
+ struct attribute attr;
+ int sl;
+};
+
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+
+static struct attribute *sl2sc_default_attributes[] = {
+ &hfi1_sl2sc_attr_0.attr,
+ &hfi1_sl2sc_attr_1.attr,
+ &hfi1_sl2sc_attr_2.attr,
+ &hfi1_sl2sc_attr_3.attr,
+ &hfi1_sl2sc_attr_4.attr,
+ &hfi1_sl2sc_attr_5.attr,
+ &hfi1_sl2sc_attr_6.attr,
+ &hfi1_sl2sc_attr_7.attr,
+ &hfi1_sl2sc_attr_8.attr,
+ &hfi1_sl2sc_attr_9.attr,
+ &hfi1_sl2sc_attr_10.attr,
+ &hfi1_sl2sc_attr_11.attr,
+ &hfi1_sl2sc_attr_12.attr,
+ &hfi1_sl2sc_attr_13.attr,
+ &hfi1_sl2sc_attr_14.attr,
+ &hfi1_sl2sc_attr_15.attr,
+ &hfi1_sl2sc_attr_16.attr,
+ &hfi1_sl2sc_attr_17.attr,
+ &hfi1_sl2sc_attr_18.attr,
+ &hfi1_sl2sc_attr_19.attr,
+ &hfi1_sl2sc_attr_20.attr,
+ &hfi1_sl2sc_attr_21.attr,
+ &hfi1_sl2sc_attr_22.attr,
+ &hfi1_sl2sc_attr_23.attr,
+ &hfi1_sl2sc_attr_24.attr,
+ &hfi1_sl2sc_attr_25.attr,
+ &hfi1_sl2sc_attr_26.attr,
+ &hfi1_sl2sc_attr_27.attr,
+ &hfi1_sl2sc_attr_28.attr,
+ &hfi1_sl2sc_attr_29.attr,
+ &hfi1_sl2sc_attr_30.attr,
+ &hfi1_sl2sc_attr_31.attr,
+ NULL
+};
+
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_sl2sc_attr *sattr =
+ container_of(attr, struct hfi1_sl2sc_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+ return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
+}
+
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+ .show = sl2sc_attr_show,
+};
+
+static struct kobj_type hfi1_sl2sc_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_sl2sc_ops,
+ .default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+#define HFI1_VL2MTU_ATTR(N) \
+ static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .vl = N \
+ }
+
+struct hfi1_vl2mtu_attr {
+ struct attribute attr;
+ int vl;
+};
+
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+static struct attribute *vl2mtu_default_attributes[] = {
+ &hfi1_vl2mtu_attr_0.attr,
+ &hfi1_vl2mtu_attr_1.attr,
+ &hfi1_vl2mtu_attr_2.attr,
+ &hfi1_vl2mtu_attr_3.attr,
+ &hfi1_vl2mtu_attr_4.attr,
+ &hfi1_vl2mtu_attr_5.attr,
+ &hfi1_vl2mtu_attr_6.attr,
+ &hfi1_vl2mtu_attr_7.attr,
+ &hfi1_vl2mtu_attr_8.attr,
+ &hfi1_vl2mtu_attr_9.attr,
+ &hfi1_vl2mtu_attr_10.attr,
+ &hfi1_vl2mtu_attr_11.attr,
+ &hfi1_vl2mtu_attr_12.attr,
+ &hfi1_vl2mtu_attr_13.attr,
+ &hfi1_vl2mtu_attr_14.attr,
+ &hfi1_vl2mtu_attr_15.attr,
+ NULL
+};
+
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_vl2mtu_attr *vlattr =
+ container_of(attr, struct hfi1_vl2mtu_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
+}
+
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+ .show = vl2mtu_attr_show,
+};
+
+static struct kobj_type hfi1_vl2mtu_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_vl2mtu_ops,
+ .default_attrs = vl2mtu_default_attributes
+};
+
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+
+ return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (!dd->boardname)
+ ret = -EINVAL;
+ else
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+ return ret;
+}
+
+static ssize_t show_boardversion(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+
+static ssize_t show_nctxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /*
+ * Return the smaller of send and receive contexts.
+ * Normally, user level applications would require both a send
+ * and a receive context, so returning the smaller of the two counts
+ * give a more accurate picture of total contexts available.
+ */
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ min(dd->num_rcv_contexts - dd->first_user_ctxt,
+ (u32)dd->sc_sizes[SC_USER].count));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /* Return the number of free user ports (contexts) available. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+
+}
+
+static ssize_t store_chip_reset(struct device *device,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ret = hfi1_reset_device(dd->unit);
+bail:
+ return ret < 0 ? ret : count;
+}
+
+/*
+ * Convert the reported temperature from an integer (reported in
+ * units of 0.25C) to a floating point number.
+ */
+#define temp2str(temp, buf, size, idx) \
+ scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \
+ ((temp) >> 2), ((temp) & 0x3) * 25)
+
+/*
+ * Dump tempsense values, in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ struct hfi1_temp temp;
+ int ret = -ENXIO;
+
+ ret = hfi1_tempsense_rd(dd, &temp);
+ if (!ret) {
+ int idx = 0;
+
+ idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
+ idx += scnprintf(buf + idx, PAGE_SIZE - idx,
+ "%u %u %u\n", temp.triggers & 0x1,
+ temp.triggers & 0x2, temp.triggers & 0x4);
+ ret = idx;
+ }
+ return ret;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *hfi1_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_board_id,
+ &dev_attr_nctxts,
+ &dev_attr_nfreectxts,
+ &dev_attr_serial,
+ &dev_attr_boardversion,
+ &dev_attr_tempsense,
+ &dev_attr_chip_reset,
+};
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj)
+{
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int ret;
+
+ if (!port_num || port_num > dd->num_pports) {
+ dd_dev_err(dd,
+ "Skipping infiniband class with invalid port %u\n",
+ port_num);
+ return -ENODEV;
+ }
+ ppd = &dd->pport[port_num - 1];
+
+ ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+ "sc2vl");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping sc2vl sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail;
+ }
+ kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+ "sl2sc");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping sl2sc sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_sc2vl;
+ }
+ kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+ "vl2mtu");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_sl2sc;
+ }
+ kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+
+ ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+ kobj, "CCMgtA");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_vl2mtu;
+ }
+
+ kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc;
+ }
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+ &cc_table_bin_attr);
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc_entry_bin;
+ }
+
+ dd_dev_info(dd,
+ "IB%u: Congestion Control Agent enabled for port %d\n",
+ dd->unit, port_num);
+
+ return 0;
+
+bail_cc_entry_bin:
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+bail_cc:
+ kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+ kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+ kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+ kobject_put(&ppd->sc2vl_kobj);
+bail:
+ return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+ struct ib_device *dev = &dd->verbs_dev.ibdev;
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
+ ret = device_create_file(&dev->dev, hfi1_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+
+ return 0;
+bail:
+ for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
+ device_remove_file(&dev->dev, hfi1_attributes[i]);
+ return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ for (i = 0; i < dd->num_pports; i++) {
+ ppd = &dd->pport[i];
+
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_table_bin_attr);
+ kobject_put(&ppd->pport_cc_kobj);
+ kobject_put(&ppd->vl2mtu_kobj);
+ kobject_put(&ppd->sl2sc_kobj);
+ kobject_put(&ppd->sc2vl_kobj);
+ }
+}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
new file mode 100644
index 000000000000..70ad7b9fc1ce
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.c
@@ -0,0 +1,221 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+ struct hfi1_other_headers *ohdr;
+ u8 opcode;
+ u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ return hdr_len_by_opcode[opcode] == 0 ?
+ 0 : hdr_len_by_opcode[opcode] - (12 + 8);
+}
+
+#define IMM_PRN "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
+static u64 ib_u64_get(__be32 *p)
+{
+ return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
+}
+
+const char *parse_everbs_hdrs(
+ struct trace_seq *p,
+ u8 opcode,
+ void *ehdrs)
+{
+ union ib_ehdrs *eh = ehdrs;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ switch (opcode) {
+ /* imm */
+ case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+ case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+ case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+ case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+ case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ trace_seq_printf(p, IMM_PRN,
+ be32_to_cpu(eh->imm_data));
+ break;
+ /* reth + imm */
+ case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->rc.reth.vaddr),
+ be32_to_cpu(eh->rc.reth.rkey),
+ be32_to_cpu(eh->rc.reth.length),
+ be32_to_cpu(eh->rc.imm_data));
+ break;
+ /* reth */
+ case OP(RC, RDMA_READ_REQUEST):
+ case OP(RC, RDMA_WRITE_FIRST):
+ case OP(UC, RDMA_WRITE_FIRST):
+ case OP(RC, RDMA_WRITE_ONLY):
+ case OP(UC, RDMA_WRITE_ONLY):
+ trace_seq_printf(p, RETH_PRN,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->rc.reth.vaddr),
+ be32_to_cpu(eh->rc.reth.rkey),
+ be32_to_cpu(eh->rc.reth.length));
+ break;
+ case OP(RC, RDMA_READ_RESPONSE_FIRST):
+ case OP(RC, RDMA_READ_RESPONSE_LAST):
+ case OP(RC, RDMA_READ_RESPONSE_ONLY):
+ case OP(RC, ACKNOWLEDGE):
+ trace_seq_printf(p, AETH_PRN,
+ be32_to_cpu(eh->aeth) >> 24,
+ be32_to_cpu(eh->aeth) & HFI1_QPN_MASK);
+ break;
+ /* aeth + atomicacketh */
+ case OP(RC, ATOMIC_ACKNOWLEDGE):
+ trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+ (be32_to_cpu(eh->at.aeth) >> 24) & 0xff,
+ be32_to_cpu(eh->at.aeth) & HFI1_QPN_MASK,
+ (unsigned long long)ib_u64_get(eh->at.atomic_ack_eth));
+ break;
+ /* atomiceth */
+ case OP(RC, COMPARE_SWAP):
+ case OP(RC, FETCH_ADD):
+ trace_seq_printf(p, ATOMICETH_PRN,
+ (unsigned long long)ib_u64_get(eh->atomic_eth.vaddr),
+ eh->atomic_eth.rkey,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->atomic_eth.swap_data),
+ (unsigned long long) ib_u64_get(
+ (__be32 *)&eh->atomic_eth.compare_data));
+ break;
+ /* deth */
+ case OP(UD, SEND_ONLY):
+ case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+ trace_seq_printf(p, DETH_PRN,
+ be32_to_cpu(eh->ud.deth[0]),
+ be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK);
+ break;
+ }
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+const char *parse_sdma_flags(
+ struct trace_seq *p,
+ u64 desc0, u64 desc1)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
+ flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-';
+ flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ trace_seq_printf(p, "%s", flags);
+ if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+ trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+ (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT)
+ & SDMA_DESC1_HEADER_MODE_MASK),
+ (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT)
+ & SDMA_DESC1_HEADER_INDEX_MASK),
+ (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT)
+ & SDMA_DESC1_HEADER_DWS_MASK));
+ return ret;
+}
+
+const char *print_u32_array(
+ struct trace_seq *p,
+ u32 *arr, int len)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; i < len ; i++)
+ trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+const char *print_u64_array(
+ struct trace_seq *p,
+ u64 *arr, int len)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; i < len; i++)
+ trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
new file mode 100644
index 000000000000..d7851c0a0171
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/trace.h
@@ -0,0 +1,1409 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR hfi1
+
+#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype) \
+__print_symbolic(etype, \
+ packettype_name(EXPECTED), \
+ packettype_name(EAGER), \
+ packettype_name(IB), \
+ packettype_name(ERROR), \
+ packettype_name(BYPASS))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+ TP_PROTO(struct hfi1_devdata *dd,
+ u64 eflags,
+ u32 ctxt,
+ u32 etype,
+ u32 hlen,
+ u32 tlen,
+ u32 updegr,
+ u32 etail),
+ TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u64, eflags)
+ __field(u32, ctxt)
+ __field(u32, etype)
+ __field(u32, hlen)
+ __field(u32, tlen)
+ __field(u32, updegr)
+ __field(u32, etail)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->eflags = eflags;
+ __entry->ctxt = ctxt;
+ __entry->etype = etype;
+ __entry->hlen = hlen;
+ __entry->tlen = tlen;
+ __entry->updegr = updegr;
+ __entry->etail = etail;
+ ),
+ TP_printk(
+"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->eflags,
+ __entry->etype, show_packettype(__entry->etype),
+ __entry->hlen,
+ __entry->tlen,
+ __entry->updegr,
+ __entry->etail
+ )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+ TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+ TP_ARGS(dd, ctxt),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u32, ctxt)
+ __field(u8, slow_path)
+ __field(u8, dma_rtail)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt) {
+ __entry->slow_path = 1;
+ __entry->dma_rtail = 0xFF;
+ } else if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt_dma_rtail){
+ __entry->dma_rtail = 1;
+ __entry->slow_path = 0;
+ } else if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt_nodma_rtail) {
+ __entry->dma_rtail = 0;
+ __entry->slow_path = 0;
+ }
+ ),
+ TP_printk(
+ "[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->slow_path,
+ __entry->dma_rtail
+ )
+);
+
+const char *print_u64_array(struct trace_seq *, u64 *, int);
+
+TRACE_EVENT(hfi1_exp_tid_map,
+ TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
+ unsigned long *maps, u16 count),
+ TP_ARGS(ctxt, subctxt, dir, maps, count),
+ TP_STRUCT__entry(
+ __field(unsigned, ctxt)
+ __field(u16, subctxt)
+ __field(int, dir)
+ __field(u16, count)
+ __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->dir = dir;
+ __entry->count = count;
+ memcpy(__get_dynamic_array(maps), maps,
+ sizeof(*maps) * count);
+ ),
+ TP_printk("[%3u:%02u] %s tidmaps %s",
+ __entry->ctxt,
+ __entry->subctxt,
+ (__entry->dir ? ">" : "<"),
+ print_u64_array(p, __get_dynamic_array(maps),
+ __entry->count)
+ )
+ );
+
+TRACE_EVENT(hfi1_exp_rcv_set,
+ TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+ unsigned long vaddr, u64 phys_addr, void *page),
+ TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
+ TP_STRUCT__entry(
+ __field(unsigned, ctxt)
+ __field(u16, subctxt)
+ __field(u32, tid)
+ __field(unsigned long, vaddr)
+ __field(u64, phys_addr)
+ __field(void *, page)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->tid = tid;
+ __entry->vaddr = vaddr;
+ __entry->phys_addr = phys_addr;
+ __entry->page = page;
+ ),
+ TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->tid,
+ __entry->vaddr,
+ __entry->phys_addr,
+ __entry->page
+ )
+ );
+
+TRACE_EVENT(hfi1_exp_rcv_free,
+ TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+ unsigned long phys, void *page),
+ TP_ARGS(ctxt, subctxt, tid, phys, page),
+ TP_STRUCT__entry(
+ __field(unsigned, ctxt)
+ __field(u16, subctxt)
+ __field(u32, tid)
+ __field(unsigned long, phys)
+ __field(void *, page)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->tid = tid;
+ __entry->phys = phys;
+ __entry->page = page;
+ ),
+ TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->tid,
+ __entry->phys,
+ __entry->page
+ )
+ );
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+ TP_PROTO(struct send_context *sc, int extra),
+ TP_ARGS(sc, extra),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sc->dd)
+ __field(u32, sw_index)
+ __field(u32, hw_context)
+ __field(int, extra)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sc->dd);
+ __entry->sw_index = sc->sw_index;
+ __entry->hw_context = sc->hw_context;
+ __entry->extra = extra;
+ ),
+ TP_printk(
+ "[%s] ctxt %u(%u) extra %d",
+ __get_str(dev),
+ __entry->sw_index,
+ __entry->hw_context,
+ __entry->extra
+ )
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+ TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+ TP_ARGS(sc, needint, credit_ctrl),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sc->dd)
+ __field(u32, sw_index)
+ __field(u32, hw_context)
+ __field(u32, needint)
+ __field(u64, credit_ctrl)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sc->dd);
+ __entry->sw_index = sc->sw_index;
+ __entry->hw_context = sc->hw_context;
+ __entry->needint = needint;
+ __entry->credit_ctrl = credit_ctrl;
+ ),
+ TP_printk(
+ "[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+ __get_str(dev),
+ __entry->sw_index,
+ __entry->hw_context,
+ __entry->needint,
+ (unsigned long long)__entry->credit_ctrl
+ )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+ TP_PROTO(struct hfi1_qp *qp, u32 flags),
+ TP_ARGS(qp, flags),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, flags)
+ __field(u32, s_flags)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->flags = flags;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->flags,
+ __entry->s_flags
+ )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+ TP_PROTO(struct hfi1_qp *qp, u32 flags),
+ TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+ TP_PROTO(struct hfi1_qp *qp, u32 flags),
+ TP_ARGS(qp, flags));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_qphash
+DECLARE_EVENT_CLASS(hfi1_qphash_template,
+ TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, bucket)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->bucket = bucket;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x bucket %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->bucket
+ )
+);
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert,
+ TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket));
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove,
+ TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(
+ struct trace_seq *p,
+ u8 opcode,
+ void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+const char *parse_sdma_flags(
+ struct trace_seq *p,
+ u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh) \
+__print_symbolic(lrh, \
+ lrh_name(LRH_BTH), \
+ lrh_name(LRH_GRH))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode }
+#define show_ib_opcode(opcode) \
+__print_symbolic(opcode, \
+ ib_opcode_name(RC_SEND_FIRST), \
+ ib_opcode_name(RC_SEND_MIDDLE), \
+ ib_opcode_name(RC_SEND_LAST), \
+ ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_SEND_ONLY), \
+ ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_WRITE_FIRST), \
+ ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \
+ ib_opcode_name(RC_RDMA_WRITE_LAST), \
+ ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_WRITE_ONLY), \
+ ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_READ_REQUEST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \
+ ib_opcode_name(RC_ACKNOWLEDGE), \
+ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
+ ib_opcode_name(RC_COMPARE_SWAP), \
+ ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(UC_SEND_FIRST), \
+ ib_opcode_name(UC_SEND_MIDDLE), \
+ ib_opcode_name(UC_SEND_LAST), \
+ ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_SEND_ONLY), \
+ ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_RDMA_WRITE_FIRST), \
+ ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \
+ ib_opcode_name(UC_RDMA_WRITE_LAST), \
+ ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_RDMA_WRITE_ONLY), \
+ ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(UD_SEND_ONLY), \
+ ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE))
+
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+ "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+ "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+ TP_PROTO(struct hfi1_devdata *dd,
+ struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ /* LRH */
+ __field(u8, vl)
+ __field(u8, lver)
+ __field(u8, sl)
+ __field(u8, lnh)
+ __field(u16, dlid)
+ __field(u16, len)
+ __field(u16, slid)
+ /* BTH */
+ __field(u8, opcode)
+ __field(u8, se)
+ __field(u8, m)
+ __field(u8, pad)
+ __field(u8, tver)
+ __field(u16, pkey)
+ __field(u8, f)
+ __field(u8, b)
+ __field(u32, qpn)
+ __field(u8, a)
+ __field(u32, psn)
+ /* extended headers */
+ __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+ ),
+ TP_fast_assign(
+ struct hfi1_other_headers *ohdr;
+
+ DD_DEV_ASSIGN(dd);
+ /* LRH */
+ __entry->vl =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+ __entry->lver =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+ __entry->sl =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+ __entry->lnh =
+ (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+ __entry->dlid =
+ be16_to_cpu(hdr->lrh[1]);
+ /* allow for larger len */
+ __entry->len =
+ be16_to_cpu(hdr->lrh[2]);
+ __entry->slid =
+ be16_to_cpu(hdr->lrh[3]);
+ /* BTH */
+ if (__entry->lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ __entry->opcode =
+ (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+ __entry->se =
+ (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+ __entry->m =
+ (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+ __entry->pad =
+ (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ __entry->tver =
+ (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+ __entry->pkey =
+ be32_to_cpu(ohdr->bth[0]) & 0xffff;
+ __entry->f =
+ (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT)
+ & HFI1_FECN_MASK;
+ __entry->b =
+ (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT)
+ & HFI1_BECN_MASK;
+ __entry->qpn =
+ be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ __entry->a =
+ (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+ /* allow for larger PSN */
+ __entry->psn =
+ be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+ /* extended headers */
+ memcpy(
+ __get_dynamic_array(ehdrs),
+ &ohdr->u,
+ ibhdr_exhdr_len(hdr));
+ ),
+ TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+ __get_str(dev),
+ /* LRH */
+ __entry->vl,
+ __entry->lver,
+ __entry->sl,
+ __entry->lnh, show_lnh(__entry->lnh),
+ __entry->dlid,
+ __entry->len,
+ __entry->slid,
+ /* BTH */
+ __entry->opcode, show_ib_opcode(__entry->opcode),
+ __entry->se,
+ __entry->m,
+ __entry->pad,
+ __entry->tver,
+ __entry->pkey,
+ __entry->f,
+ __entry->b,
+ __entry->qpn,
+ __entry->a,
+ __entry->psn,
+ /* extended headers */
+ __parse_ib_ehdrs(
+ __entry->opcode,
+ (void *)__get_dynamic_array(ehdrs))
+ )
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+#define SNOOP_PRN \
+ "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+ "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_snoop
+
+
+TRACE_EVENT(snoop_capture,
+ TP_PROTO(struct hfi1_devdata *dd,
+ int hdr_len,
+ struct hfi1_ib_header *hdr,
+ int data_len,
+ void *data),
+ TP_ARGS(dd, hdr_len, hdr, data_len, data),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, slid)
+ __field(u16, dlid)
+ __field(u32, qpn)
+ __field(u8, opcode)
+ __field(u8, sl)
+ __field(u16, pkey)
+ __field(u32, hdr_len)
+ __field(u32, data_len)
+ __field(u8, lnh)
+ __dynamic_array(u8, raw_hdr, hdr_len)
+ __dynamic_array(u8, raw_pkt, data_len)
+ ),
+ TP_fast_assign(
+ struct hfi1_other_headers *ohdr;
+
+ __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+ if (__entry->lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ DD_DEV_ASSIGN(dd);
+ __entry->slid = be16_to_cpu(hdr->lrh[3]);
+ __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+ __entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+ __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+ __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+ __entry->hdr_len = hdr_len;
+ __entry->data_len = data_len;
+ memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+ memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+ ),
+ TP_printk("[%s] " SNOOP_PRN,
+ __get_str(dev),
+ __entry->slid,
+ __entry->dlid,
+ __entry->qpn,
+ __entry->opcode,
+ show_ib_opcode(__entry->opcode),
+ __entry->sl,
+ __entry->pkey,
+ __entry->hdr_len,
+ __entry->data_len
+ )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+ "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \
+ "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+ TP_ARGS(dd, uctxt),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(unsigned, ctxt)
+ __field(u32, credits)
+ __field(u64, hw_free)
+ __field(u64, piobase)
+ __field(u16, rcvhdrq_cnt)
+ __field(u64, rcvhdrq_phys)
+ __field(u32, eager_cnt)
+ __field(u64, rcvegr_phys)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = uctxt->ctxt;
+ __entry->credits = uctxt->sc->credits;
+ __entry->hw_free = (u64)uctxt->sc->hw_free;
+ __entry->piobase = (u64)uctxt->sc->base_addr;
+ __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+ __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+ __entry->eager_cnt = uctxt->egrbufs.alloced;
+ __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys;
+ ),
+ TP_printk(
+ "[%s] ctxt %u " UCTXT_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->credits,
+ __entry->hw_free,
+ __entry->piobase,
+ __entry->rcvhdrq_cnt,
+ __entry->rcvhdrq_phys,
+ __entry->eager_cnt,
+ __entry->rcvegr_phys
+ )
+ );
+
+#define CINFO_FMT \
+ "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+ TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
+ struct hfi1_ctxt_info cinfo),
+ TP_ARGS(dd, ctxt, subctxt, cinfo),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(unsigned, ctxt)
+ __field(unsigned, subctxt)
+ __field(u16, egrtids)
+ __field(u16, rcvhdrq_cnt)
+ __field(u16, rcvhdrq_size)
+ __field(u16, sdma_ring_size)
+ __field(u32, rcvegr_size)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->egrtids = cinfo.egrtids;
+ __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+ __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+ __entry->sdma_ring_size = cinfo.sdma_ring_size;
+ __entry->rcvegr_size = cinfo.rcvegr_size;
+ ),
+ TP_printk(
+ "[%s] ctxt %u:%u " CINFO_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->egrtids,
+ __entry->rcvegr_size,
+ __entry->rcvhdrq_cnt,
+ __entry->rcvhdrq_size,
+ __entry->sdma_ring_size
+ )
+ );
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sma
+
+#define BCT_FORMAT \
+ "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+ be16_to_cpu( \
+ ((struct buffer_control *)__get_dynamic_array(bct))->field \
+ )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+ TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+ TP_ARGS(dd, bc),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __dynamic_array(u8, bct, sizeof(*bc))
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ memcpy(
+ __get_dynamic_array(bct),
+ bc,
+ sizeof(*bc));
+ ),
+ TP_printk(BCT_FORMAT,
+ BCT(overall_shared_limit),
+
+ BCT(vl[0].dedicated),
+ BCT(vl[0].shared),
+
+ BCT(vl[1].dedicated),
+ BCT(vl[1].shared),
+
+ BCT(vl[2].dedicated),
+ BCT(vl[2].shared),
+
+ BCT(vl[3].dedicated),
+ BCT(vl[3].shared),
+
+ BCT(vl[4].dedicated),
+ BCT(vl[4].shared),
+
+ BCT(vl[5].dedicated),
+ BCT(vl[5].shared),
+
+ BCT(vl[6].dedicated),
+ BCT(vl[6].shared),
+
+ BCT(vl[7].dedicated),
+ BCT(vl[7].shared),
+
+ BCT(vl[15].dedicated),
+ BCT(vl[15].shared)
+ )
+);
+
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+ TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+ TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+ TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+ TP_ARGS(dd, bc));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sdma
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 desc0,
+ u64 desc1,
+ u16 e,
+ void *descp),
+ TP_ARGS(sde, desc0, desc1, e, descp),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(void *, descp)
+ __field(u64, desc0)
+ __field(u64, desc1)
+ __field(u16, e)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->desc0 = desc0;
+ __entry->desc1 = desc1;
+ __entry->idx = sde->this_idx;
+ __entry->descp = descp;
+ __entry->e = e;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+ __get_str(dev),
+ __entry->idx,
+ __parse_sdma_flags(__entry->desc0, __entry->desc1),
+ (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT)
+ & SDMA_DESC0_PHY_ADDR_MASK,
+ (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT)
+ & SDMA_DESC1_GENERATION_MASK),
+ (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+ & SDMA_DESC0_BYTE_COUNT_MASK),
+ __entry->desc0,
+ __entry->desc1,
+ __entry->descp,
+ __entry->e
+ )
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+ TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+ TP_ARGS(dd, sel, vl, idx),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u32, sel)
+ __field(u8, vl)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->sel = sel;
+ __entry->vl = vl;
+ __entry->idx = idx;
+ ),
+ TP_printk(
+ "[%s] selecting SDE %u sel 0x%x vl %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sel,
+ __entry->vl
+ )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 status
+ ),
+ TP_ARGS(sde, status),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(u64, status)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->status = status;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) status %llx",
+ __get_str(dev),
+ __entry->idx,
+ (unsigned long long)__entry->status
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 status
+ ),
+ TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 status
+ ),
+ TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ int aidx
+ ),
+ TP_ARGS(sde, aidx),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(int, aidx)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->idx = sde->this_idx;
+ __entry->aidx = aidx;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) aidx %d",
+ __get_str(dev),
+ __entry->idx,
+ __entry->aidx
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ int aidx
+ ),
+ TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ int aidx
+ ),
+ TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u16 hwhead,
+ u16 swhead,
+ struct sdma_txreq *txp
+ ),
+ TP_ARGS(sde, hwhead, swhead, txp),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(u64, sn)
+ __field(u16, hwhead)
+ __field(u16, swhead)
+ __field(u16, txnext)
+ __field(u16, tx_tail)
+ __field(u16, tx_head)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->hwhead = hwhead;
+ __entry->swhead = swhead;
+ __entry->tx_tail = sde->tx_tail;
+ __entry->tx_head = sde->tx_head;
+ __entry->txnext = txp ? txp->next_descq_idx : ~0;
+ __entry->idx = sde->this_idx;
+ __entry->sn = txp ? txp->sn : ~0;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sn,
+ __entry->hwhead,
+ __entry->swhead,
+ __entry->txnext,
+ __entry->tx_head,
+ __entry->tx_tail
+ )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u16 hwhead,
+ u16 swhead,
+ struct sdma_txreq *txp
+ ),
+ TP_ARGS(sde, hwhead, swhead, txp),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(u16, hwhead)
+ __field(u16, swhead)
+ __field(u16, txnext)
+ __field(u16, tx_tail)
+ __field(u16, tx_head)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->hwhead = hwhead;
+ __entry->swhead = swhead;
+ __entry->tx_tail = sde->tx_tail;
+ __entry->tx_head = sde->tx_head;
+ __entry->txnext = txp ? txp->next_descq_idx : ~0;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->hwhead,
+ __entry->swhead,
+ __entry->txnext,
+ __entry->tx_head,
+ __entry->tx_tail
+ )
+);
+#endif
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 sn
+ ),
+ TP_ARGS(sde, sn),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __field(u64, sn)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __entry->sn = sn;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) sn %llu",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sn
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 sn
+ ),
+ TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 sn
+ ),
+ TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+ "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+ struct hfi1_pkt_header *hdr, u32 tidval),
+ TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, req)
+ __field(__le32, pbc0)
+ __field(__le32, pbc1)
+ __field(__be32, lrh0)
+ __field(__be32, lrh1)
+ __field(__be32, bth0)
+ __field(__be32, bth1)
+ __field(__be32, bth2)
+ __field(__le32, kdeth0)
+ __field(__le32, kdeth1)
+ __field(__le32, kdeth2)
+ __field(__le32, kdeth3)
+ __field(__le32, kdeth4)
+ __field(__le32, kdeth5)
+ __field(__le32, kdeth6)
+ __field(__le32, kdeth7)
+ __field(__le32, kdeth8)
+ __field(u32, tidval)
+ ),
+ TP_fast_assign(
+ __le32 *pbc = (__le32 *)hdr->pbc;
+ __be32 *lrh = (__be32 *)hdr->lrh;
+ __be32 *bth = (__be32 *)hdr->bth;
+ __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->req = req;
+ __entry->pbc0 = pbc[0];
+ __entry->pbc1 = pbc[1];
+ __entry->lrh0 = be32_to_cpu(lrh[0]);
+ __entry->lrh1 = be32_to_cpu(lrh[1]);
+ __entry->bth0 = be32_to_cpu(bth[0]);
+ __entry->bth1 = be32_to_cpu(bth[1]);
+ __entry->bth2 = be32_to_cpu(bth[2]);
+ __entry->kdeth0 = kdeth[0];
+ __entry->kdeth1 = kdeth[1];
+ __entry->kdeth2 = kdeth[2];
+ __entry->kdeth3 = kdeth[3];
+ __entry->kdeth4 = kdeth[4];
+ __entry->kdeth5 = kdeth[5];
+ __entry->kdeth6 = kdeth[6];
+ __entry->kdeth7 = kdeth[7];
+ __entry->kdeth8 = kdeth[8];
+ __entry->tidval = tidval;
+ ),
+ TP_printk(USDMA_HDR_FORMAT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->req,
+ __entry->pbc1,
+ __entry->pbc0,
+ __entry->lrh0,
+ __entry->lrh1,
+ __entry->bth0,
+ __entry->bth1,
+ __entry->bth2,
+ __entry->kdeth0,
+ __entry->kdeth1,
+ __entry->kdeth2,
+ __entry->kdeth3,
+ __entry->kdeth4,
+ __entry->kdeth5,
+ __entry->kdeth6,
+ __entry->kdeth7,
+ __entry->kdeth8,
+ __entry->tidval
+ )
+ );
+
+#define SDMA_UREQ_FMT \
+ "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+ TP_ARGS(dd, ctxt, subctxt, i),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd);
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u8, ver_opcode)
+ __field(u8, iovcnt)
+ __field(u16, npkts)
+ __field(u16, fragsize)
+ __field(u16, comp_idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->ver_opcode = i[0] & 0xff;
+ __entry->iovcnt = (i[0] >> 8) & 0xff;
+ __entry->npkts = i[1];
+ __entry->fragsize = i[2];
+ __entry->comp_idx = i[3];
+ ),
+ TP_printk(SDMA_UREQ_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->ver_opcode,
+ __entry->iovcnt,
+ __entry->npkts,
+ __entry->fragsize,
+ __entry->comp_idx
+ )
+ );
+
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st) \
+ __print_symbolic(st, \
+ usdma_complete_name(FREE), \
+ usdma_complete_name(QUEUED), \
+ usdma_complete_name(COMPLETE), \
+ usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+ u8 state, int code),
+ TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, idx)
+ __field(u8, state)
+ __field(int, code)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->idx = idx;
+ __entry->state = state;
+ __entry->code = code;
+ ),
+ TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+ __get_str(dev), __entry->ctxt, __entry->subctxt,
+ __entry->idx, show_usdma_complete_state(__entry->state),
+ __entry->code)
+ );
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+ u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+ TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, req)
+ __field(u8, sde)
+ __field(u8, idx)
+ __field(int, len)
+ __field(u32, tidval)
+ __array(u32, ahg, 10)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->req = req;
+ __entry->sde = sde;
+ __entry->idx = ahgidx;
+ __entry->len = len;
+ __entry->tidval = tidval;
+ memcpy(__entry->ahg, ahg, len * sizeof(u32));
+ ),
+ TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->req,
+ __entry->sde,
+ __entry->idx,
+ __entry->len - 1,
+ __print_u32_hex(__entry->ahg, __entry->len),
+ __entry->tidval
+ )
+ );
+
+TRACE_EVENT(hfi1_sdma_state,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ const char *cstate,
+ const char *nstate
+ ),
+ TP_ARGS(sde, cstate, nstate),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(sde->dd)
+ __string(curstate, cstate)
+ __string(newstate, nstate)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(sde->dd);
+ __assign_str(curstate, cstate);
+ __assign_str(newstate, nstate);
+ ),
+ TP_printk("[%s] current state %s new state %s",
+ __get_str(dev),
+ __get_str(curstate),
+ __get_str(newstate)
+ )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_sdma_rc,
+ TP_PROTO(struct hfi1_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, flags)
+ __field(u32, psn)
+ __field(u32, sending_psn)
+ __field(u32, sending_hpsn)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->flags = qp->s_flags;
+ __entry->psn = psn;
+ __entry->sending_psn = qp->s_sending_psn;
+ __entry->sending_hpsn = qp->s_sending_hpsn;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x flags 0x%x psn 0x%x sending_psn 0x%x sending_hpsn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->flags,
+ __entry->psn,
+ __entry->sending_psn,
+ __entry->sending_psn
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_rc, hfi1_rc_sendcomplete,
+ TP_PROTO(struct hfi1_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+ TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+ int src),
+ TP_ARGS(dd, is_entry, src),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __array(char, buf, 64)
+ __field(int, src)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd)
+ is_entry->is_name(__entry->buf, 64, src - is_entry->start);
+ __entry->src = src;
+ ),
+ TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+ __entry->src)
+);
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_trace
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+ TP_PROTO(const char *function, struct va_format *vaf),
+ TP_ARGS(function, vaf),
+ TP_STRUCT__entry(
+ __string(function, function)
+ __dynamic_array(char, msg, MAX_MSG_LEN)
+ ),
+ TP_fast_assign(
+ __assign_str(function, function);
+ WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+ MAX_MSG_LEN, vaf->fmt,
+ *vaf->va) >= MAX_MSG_LEN);
+ ),
+ TP_printk("(%s) %s",
+ __get_str(function),
+ __get_str(msg))
+);
+
+/*
+ * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
+ * actual function to work and can not be in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
+ \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \
+ TP_PROTO(const char *function, struct va_format *vaf), \
+ TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \
+{ \
+ struct va_format vaf = { \
+ .fmt = fmt, \
+ }; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ vaf.va = &args; \
+ trace_hfi1_ ##lvl(func, &vaf); \
+ va_end(args); \
+ return; \
+}
+
+/*
+ * To create a new trace level simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This will create all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
+ * the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+
+#define hfi1_cdbg(which, fmt, ...) \
+ __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+ hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in an enablement for this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+ trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
new file mode 100644
index 000000000000..ea54fd2700ad
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.c
@@ -0,0 +1,518 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * "Two Wire Serial Interface" support.
+ *
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of fully generic (e.g. pretend
+ * we don't know whether '1' is the higher voltage) interface, as
+ * the restrictions of the generic i2c interface (e.g. no access from
+ * driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the hfi1_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip. Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
+{
+ /*
+ * implicit read of EXTStatus is as good as explicit
+ * read of scratch, if all we want to do is flush
+ * writes.
+ */
+ hfi1_gpio_mod(dd, target, 0, 0, 0);
+ rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* BUF_WAIT is time bus must be free between STOP or ACK and to next START.
+ * Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+ u32 mask;
+
+ udelay(1);
+
+ mask = QSFP_HFI0_I2CCLK;
+
+ /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+ hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+ /*
+ * Allow for slow slaves by simple
+ * delay for falling edge, sampling on rise.
+ */
+ if (!bit)
+ udelay(2);
+ else {
+ int rise_usec;
+
+ for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+ if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
+ break;
+ udelay(2);
+ }
+ if (rise_usec <= 0)
+ dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+ SCL_WAIT_USEC);
+ }
+ i2c_wait_for_writes(dd, target);
+}
+
+static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+ u32 mask;
+
+ mask = QSFP_HFI0_I2CDAT;
+
+ /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+ hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+ i2c_wait_for_writes(dd, target);
+ udelay(2);
+}
+
+static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+ u32 read_val, mask;
+
+ mask = QSFP_HFI0_I2CDAT;
+ /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+ hfi1_gpio_mod(dd, target, 0, 0, mask);
+ read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
+ if (wait)
+ i2c_wait_for_writes(dd, target);
+ return (read_val & mask) >> GPIO_SDA_NUM;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the hfi1_ib device
+ */
+static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
+{
+ u8 ack_received;
+
+ /* AT ENTRY SCL = LOW */
+ /* change direction, ignore data */
+ ack_received = sda_in(dd, target, 1);
+ scl_out(dd, target, 1);
+ ack_received = sda_in(dd, target, 1) == 0;
+ scl_out(dd, target, 0);
+ return ack_received;
+}
+
+static void stop_cmd(struct hfi1_devdata *dd, u32 target);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the hfi1_ib device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
+{
+ int bit_cntr, data;
+
+ data = 0;
+
+ for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+ data <<= 1;
+ scl_out(dd, target, 1);
+ data |= sda_in(dd, target, 0);
+ scl_out(dd, target, 0);
+ }
+ if (last) {
+ scl_out(dd, target, 1);
+ stop_cmd(dd, target);
+ } else {
+ sda_out(dd, target, 0);
+ scl_out(dd, target, 1);
+ scl_out(dd, target, 0);
+ sda_out(dd, target, 1);
+ }
+ return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the hfi1_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
+{
+ int bit_cntr;
+ u8 bit;
+
+ for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+ bit = (data >> bit_cntr) & 1;
+ sda_out(dd, target, bit);
+ scl_out(dd, target, 1);
+ scl_out(dd, target, 0);
+ }
+ return (!i2c_ackrcv(dd, target)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct hfi1_devdata *dd, u32 target)
+{
+ sda_out(dd, target, 1);
+ scl_out(dd, target, 1);
+ sda_out(dd, target, 0);
+ udelay(1);
+ scl_out(dd, target, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct hfi1_devdata *dd, u32 target)
+{
+ scl_out(dd, target, 0);
+ sda_out(dd, target, 0);
+ scl_out(dd, target, 1);
+ sda_out(dd, target, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct hfi1_devdata *dd, u32 target)
+{
+ stop_seq(dd, target);
+ udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * hfi1_twsi_reset - reset I2C communication
+ * @dd: the hfi1_ib device
+ */
+
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
+{
+ int clock_cycles_left = 9;
+ int was_high = 0;
+ u32 pins, mask;
+
+ /* Both SCL and SDA should be high. If not, there
+ * is something wrong.
+ */
+ mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
+
+ /*
+ * Force pins to desired innocuous state.
+ * This is the default power-on state with out=0 and dir=0,
+ * So tri-stated and should be floating high (barring HW problems)
+ */
+ hfi1_gpio_mod(dd, target, 0, 0, mask);
+
+ /*
+ * Clock nine times to get all listeners into a sane state.
+ * If SDA does not go high at any point, we are wedged.
+ * One vendor recommends then issuing START followed by STOP.
+ * we cannot use our "normal" functions to do that, because
+ * if SCL drops between them, another vendor's part will
+ * wedge, dropping SDA and keeping it low forever, at the end of
+ * the next transaction (even if it was not the device addressed).
+ * So our START and STOP take place with SCL held high.
+ */
+ while (clock_cycles_left--) {
+ scl_out(dd, target, 0);
+ scl_out(dd, target, 1);
+ /* Note if SDA is high, but keep clocking to sync slave */
+ was_high |= sda_in(dd, target, 0);
+ }
+
+ if (was_high) {
+ /*
+ * We saw a high, which we hope means the slave is sync'd.
+ * Issue START, STOP, pause for T_BUF.
+ */
+
+ pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
+ if ((pins & mask) != mask)
+ dd_dev_err(dd, "GPIO pins not at rest: %d\n",
+ pins & mask);
+ /* Drop SDA to issue START */
+ udelay(1); /* Guarantee .6 uSec setup */
+ sda_out(dd, target, 0);
+ udelay(1); /* Guarantee .6 uSec hold */
+ /* At this point, SCL is high, SDA low. Raise SDA for STOP */
+ sda_out(dd, target, 1);
+ udelay(TWSI_BUF_WAIT_USEC);
+ }
+
+ return !was_high;
+}
+
+#define HFI1_TWSI_START 0x100
+#define HFI1_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
+{
+ int ret = 1;
+
+ if (flags & HFI1_TWSI_START)
+ start_seq(dd, target);
+
+ /* Leaves SCL low (from i2c_ackrcv()) */
+ ret = wr_byte(dd, target, data);
+
+ if (flags & HFI1_TWSI_STOP)
+ stop_cmd(dd, target);
+ return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define HFI1_TEMP_DEV 0x98
+
+/*
+ * hfi1_twsi_blk_rd
+ * General interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+ void *buffer, int len)
+{
+ int ret;
+ u8 *bp = buffer;
+
+ ret = 1;
+
+ if (dev == HFI1_TWSI_NO_DEV) {
+ /* legacy not-really-I2C */
+ addr = (addr << 1) | READ_CMD;
+ ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
+ } else {
+ /* Actual I2C */
+ ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START);
+ if (ret) {
+ stop_cmd(dd, target);
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * SFF spec claims we do _not_ stop after the addr
+ * but simply issue a start with the "read" dev-addr.
+ * Since we are implicitly waiting for ACK here,
+ * we need t_buf (nominally 20uSec) before that start,
+ * and cannot rely on the delay built in to the STOP
+ */
+ ret = twsi_wr(dd, target, addr, 0);
+ udelay(TWSI_BUF_WAIT_USEC);
+
+ if (ret) {
+ dd_dev_err(dd,
+ "Failed to write interface read addr %02X\n",
+ addr);
+ ret = 1;
+ goto bail;
+ }
+ ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
+ }
+ if (ret) {
+ stop_cmd(dd, target);
+ ret = 1;
+ goto bail;
+ }
+
+ /*
+ * block devices keeps clocking data out as long as we ack,
+ * automatically incrementing the address. Some have "pages"
+ * whose boundaries will not be crossed, but the handling
+ * of these is left to the caller, who is in a better
+ * position to know.
+ */
+ while (len-- > 0) {
+ /*
+ * Get and store data, sending ACK if length remaining,
+ * else STOP
+ */
+ *bp++ = rd_byte(dd, target, !len);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/*
+ * hfi1_twsi_blk_wr
+ * General interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+ const void *buffer, int len)
+{
+ int sub_len;
+ const u8 *bp = buffer;
+ int max_wait_time, i;
+ int ret = 1;
+
+ while (len > 0) {
+ if (dev == HFI1_TWSI_NO_DEV) {
+ if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
+ HFI1_TWSI_START)) {
+ goto failed_write;
+ }
+ } else {
+ /* Real I2C */
+ if (twsi_wr(dd, target,
+ dev | WRITE_CMD, HFI1_TWSI_START))
+ goto failed_write;
+ ret = twsi_wr(dd, target, addr, 0);
+ if (ret) {
+ dd_dev_err(dd,
+ "Failed to write interface write addr %02X\n",
+ addr);
+ goto failed_write;
+ }
+ }
+
+ sub_len = min(len, 4);
+ addr += sub_len;
+ len -= sub_len;
+
+ for (i = 0; i < sub_len; i++)
+ if (twsi_wr(dd, target, *bp++, 0))
+ goto failed_write;
+
+ stop_cmd(dd, target);
+
+ /*
+ * Wait for write complete by waiting for a successful
+ * read (the chip replies with a zero after the write
+ * cmd completes, and before it writes to the eeprom.
+ * The startcmd for the read will fail the ack until
+ * the writes have completed. We do this inline to avoid
+ * the debug prints that are in the real read routine
+ * if the startcmd fails.
+ * We also use the proper device address, so it doesn't matter
+ * whether we have real eeprom_dev. Legacy likes any address.
+ */
+ max_wait_time = 100;
+ while (twsi_wr(dd, target,
+ dev | READ_CMD, HFI1_TWSI_START)) {
+ stop_cmd(dd, target);
+ if (!--max_wait_time)
+ goto failed_write;
+ }
+ /* now read (and ignore) the resulting byte */
+ rd_byte(dd, target, 1);
+ }
+
+ ret = 0;
+ goto bail;
+
+failed_write:
+ stop_cmd(dd, target);
+ ret = 1;
+
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
new file mode 100644
index 000000000000..5907e029613d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/twsi.h
@@ -0,0 +1,68 @@
+#ifndef _TWSI_H
+#define _TWSI_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define HFI1_TWSI_NO_DEV 0xFF
+
+struct hfi1_devdata;
+
+/* Bit position of SDA pin in ASIC_QSFP* registers */
+#define GPIO_SDA_NUM 1
+
+/* these functions must be called with qsfp_lock held */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+ void *buffer, int len);
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+ const void *buffer, int len);
+
+
+#endif /* _TWSI_H */
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
new file mode 100644
index 000000000000..b536f397737c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/uc.c
@@ -0,0 +1,585 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "sdma.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct hfi1_qp *qp)
+{
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_swqe *wqe;
+ unsigned long flags;
+ u32 hwords = 5;
+ u32 bth0 = 0;
+ u32 len;
+ u32 pmtu = qp->pmtu;
+ int ret = 0;
+ int middle = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_iowait.sdma_busy)) {
+ qp->s_flags |= HFI1_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ ohdr = &qp->s_hdr->ibh.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+ /* Get the next send request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_wqe = NULL;
+ switch (qp->s_state) {
+ default:
+ if (!(ib_hfi1_state_ops[qp->state] &
+ HFI1_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /* Check if send work queue is empty. */
+ if (qp->s_cur == qp->s_head) {
+ clear_ahg(qp);
+ goto bail;
+ }
+ /*
+ * Start a new request.
+ */
+ wqe->psn = qp->s_next_psn;
+ qp->s_psn = qp->s_next_psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ len = wqe->length;
+ qp->s_len = len;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state =
+ OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ break;
+
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+ mask_psn(qp->s_next_psn++), middle);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct hfi1_qp *qp = packet->qp;
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ u32 opcode;
+ u32 hdrsize = packet->hlen;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ struct ib_reth *reth;
+ int has_grh = rcv_flags & HFI1_HAS_GRH;
+ int ret;
+ u32 bth1;
+ struct ib_grh *grh = NULL;
+
+ opcode = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+ return;
+
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+ if (bth1 & HFI1_BECN_SMASK) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 rqpn, lqpn;
+ u16 rlid = be16_to_cpu(hdr->lrh[3]);
+ u8 sl, sc5;
+
+ lqpn = bth1 & HFI1_QPN_MASK;
+ rqpn = qp->remote_qpn;
+
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ sl = ibp->sc_to_sl[sc5];
+
+ process_becn(ppd, sl, rlid, lqpn, rqpn,
+ IB_CC_SVCTYPE_UC);
+ }
+
+ if (bth1 & HFI1_FECN_SMASK) {
+ u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+ u16 slid = be16_to_cpu(hdr->lrh[3]);
+ u16 dlid = be16_to_cpu(hdr->lrh[1]);
+ u32 src_qp = qp->remote_qpn;
+ u8 sc5;
+
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+ return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+ }
+ }
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode >>= 24;
+
+ /* Compare the PSN verses the expected PSN. */
+ if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+inv:
+ if (qp->r_state == OP(SEND_FIRST) ||
+ qp->r_state == OP(SEND_MIDDLE)) {
+ set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+ } else
+ hfi1_put_ss(&qp->r_sge);
+ qp->r_state = OP(SEND_LAST);
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ goto send_first;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ goto rdma_first;
+
+ default:
+ goto drop;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ default:
+ if (opcode == OP(SEND_FIRST) ||
+ opcode == OP(SEND_ONLY) ||
+ opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_FIRST) ||
+ opcode == OP(RDMA_WRITE_ONLY) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ break;
+ goto inv;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+ qp_comm_est(qp);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+ if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+ qp->r_sge = qp->s_rdma_read_sge;
+ else {
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ /*
+ * qp->s_rdma_read_sge will be the owner
+ * of the mr references.
+ */
+ qp->s_rdma_read_sge = qp->r_sge;
+ }
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto rewind;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto rewind;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 0);
+ break;
+
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST):
+no_immediate_data:
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto rewind;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto rewind;
+ wc.opcode = IB_WC_RECV;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 0);
+ hfi1_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ /*
+ * It seems that IB mandates the presence of an SL in a
+ * work completion only for the UD transport (see section
+ * 11.4.2 of IBTA Vol. 1).
+ *
+ * However, the way the SL is chosen below is consistent
+ * with the way that IB/qib works and is trying avoid
+ * introducing incompatibilities.
+ *
+ * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+ */
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE))) {
+ goto drop;
+ }
+ reth = &ohdr->u.rc.reth;
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey */
+ ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+ vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto drop;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_ONLY))
+ goto rdma_last;
+ else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ goto rdma_last_imm;
+ }
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto drop;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto drop;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+ wc.wc_flags = IB_WC_WITH_IMM;
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+ hfi1_put_ss(&qp->s_rdma_read_sge);
+ else {
+ ret = hfi1_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ }
+ wc.byte_len = qp->r_len;
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+ hfi1_put_ss(&qp->r_sge);
+ goto last_imm;
+
+ case OP(RDMA_WRITE_LAST):
+rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+ hfi1_put_ss(&qp->r_sge);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ goto drop;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ return;
+
+rewind:
+ set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+drop:
+ ibp->n_pkt_drops++;
+ return;
+
+op_err:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return;
+
+}
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
new file mode 100644
index 000000000000..d40d1a1e10aa
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -0,0 +1,885 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ */
+static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe)
+{
+ struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct hfi1_pportdata *ppd;
+ struct hfi1_qp *qp;
+ struct ib_ah_attr *ah_attr;
+ unsigned long flags;
+ struct hfi1_sge_state ssge;
+ struct hfi1_sge *sge;
+ struct ib_wc wc;
+ u32 length;
+ enum ib_qp_type sqptype, dqptype;
+
+ rcu_read_lock();
+
+ qp = hfi1_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+ if (!qp) {
+ ibp->n_pkt_drops++;
+ rcu_read_unlock();
+ return;
+ }
+
+ sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : sqp->ibqp.qp_type;
+ dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : qp->ibqp.qp_type;
+
+ if (dqptype != sqptype ||
+ !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+ ibp->n_pkt_drops++;
+ goto drop;
+ }
+
+ ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+ ppd = ppd_from_ibp(ibp);
+
+ if (qp->ibqp.qp_num > 1) {
+ u16 pkey;
+ u16 slid;
+ u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+ pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+ slid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+ qp->s_pkey_index, slid))) {
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ cpu_to_be16(slid),
+ cpu_to_be16(ah_attr->dlid));
+ goto drop;
+ }
+ }
+
+ /*
+ * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ if (qp->ibqp.qp_num) {
+ u32 qkey;
+
+ qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
+ sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+ if (unlikely(qkey != qp->qkey)) {
+ u16 lid;
+
+ lid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ cpu_to_be16(lid),
+ cpu_to_be16(ah_attr->dlid));
+ goto drop;
+ }
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ length = swqe->length;
+ memset(&wc, 0, sizeof(wc));
+ wc.byte_len = length + sizeof(struct ib_grh);
+
+ if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
+ }
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & HFI1_R_REUSE_SGE)
+ qp->r_flags &= ~HFI1_R_REUSE_SGE;
+ else {
+ int ret;
+
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0) {
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto bail_unlock;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->n_vl15_dropped++;
+ goto bail_unlock;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= HFI1_R_REUSE_SGE;
+ ibp->n_pkt_drops++;
+ goto bail_unlock;
+ }
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
+ sizeof(struct ib_grh), 1);
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ ssge.sg_list = swqe->sg_list + 1;
+ ssge.sge = *swqe->sg_list;
+ ssge.num_sge = swqe->wr.num_sge;
+ sge = &ssge.sge;
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ssge.num_sge)
+ *sge = *ssge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= HFI1_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+ hfi1_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+ goto bail_unlock;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = sqp->ibqp.qp_num;
+ if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+ if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+ sqp->ibqp.qp_type == IB_QPT_SMI)
+ wc.pkey_index = swqe->wr.wr.ud.pkey_index;
+ else
+ wc.pkey_index = sqp->s_pkey_index;
+ } else {
+ wc.pkey_index = 0;
+ }
+ wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+ /* Check for loopback when the port lid is not set */
+ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+ wc.slid = HFI1_PERMISSIVE_LID;
+ wc.sl = ah_attr->sl;
+ wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ swqe->wr.send_flags & IB_SEND_SOLICITED);
+ ibp->n_loop_pkts++;
+bail_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+ rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_ud_req(struct hfi1_qp *qp)
+{
+ struct hfi1_other_headers *ohdr;
+ struct ib_ah_attr *ah_attr;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ struct hfi1_swqe *wqe;
+ unsigned long flags;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u16 lrh0;
+ u16 lid;
+ int ret = 0;
+ int next_cur;
+ u8 sc5;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_iowait.sdma_busy)) {
+ qp->s_flags |= HFI1_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ next_cur = qp->s_cur + 1;
+ if (next_cur >= qp->s_size)
+ next_cur = 0;
+
+ /* Construct the header. */
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE ||
+ ah_attr->dlid == HFI1_PERMISSIVE_LID) {
+ lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+ if (unlikely(!loopback && (lid == ppd->lid ||
+ (lid == HFI1_PERMISSIVE_LID &&
+ qp->ibqp.qp_type == IB_QPT_GSI)))) {
+ /*
+ * If DMAs are in progress, we can't generate
+ * a completion for the loopback packet since
+ * it would be out of order.
+ * Instead of waiting, we could queue a
+ * zero length descriptor so we get a callback.
+ */
+ if (atomic_read(&qp->s_iowait.sdma_busy)) {
+ qp->s_flags |= HFI1_S_WAIT_DMA;
+ goto bail;
+ }
+ qp->s_cur = next_cur;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ud_loopback(qp, wqe);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+ goto done;
+ }
+ }
+
+ qp->s_cur = next_cur;
+ extra_bytes = -wqe->length & 3;
+ nwords = (wqe->length + extra_bytes) >> 2;
+
+ /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+ qp->s_hdrwords = 7;
+ qp->s_cur_size = wqe->length;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_srate = ah_attr->static_rate;
+ qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+ qp->s_wqe = wqe;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ /* Header size in 32-bit words. */
+ qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+ &ah_attr->grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = HFI1_LRH_GRH;
+ ohdr = &qp->s_hdr->ibh.u.l.oth;
+ /*
+ * Don't worry about sending to locally attached multicast
+ * QPs. It is unspecified by the spec. what happens.
+ */
+ } else {
+ /* Header size in 32-bit words. */
+ lrh0 = HFI1_LRH_BTH;
+ ohdr = &qp->s_hdr->ibh.u.oth;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_hdrwords++;
+ ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+ bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+ } else
+ bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+ sc5 = ibp->sl_to_sc[ah_attr->sl];
+ lrh0 |= (ah_attr->sl & 0xf) << 4;
+ if (qp->ibqp.qp_type == IB_QPT_SMI) {
+ lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+ qp->s_sc = 0xf;
+ } else {
+ lrh0 |= (sc5 & 0xf) << 12;
+ qp->s_sc = sc5;
+ }
+ qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
+ qp->s_hdr->ibh.lrh[2] =
+ cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE))
+ qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+ else {
+ lid = ppd->lid;
+ if (lid) {
+ lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+ qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid);
+ } else
+ qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth0 |= extra_bytes << 20;
+ if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+ bth0 |= hfi1_get_pkey(ibp, wqe->wr.wr.ud.pkey_index);
+ else
+ bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++));
+ /*
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+ qp->qkey : wqe->wr.wr.ud.remote_qkey);
+ ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+ /* disarm any ahg */
+ qp->s_hdr->ahgcount = 0;
+ qp->s_hdr->ahgidx = 0;
+ qp->s_hdr->tx_flags = 0;
+ qp->s_hdr->sde = NULL;
+
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/*
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm than the standard pkey check. It
+ * special cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * @returns the index found or -1 if not found
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned i;
+
+ if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+ unsigned lim_idx = -1;
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+ /* here we look for an exact match */
+ if (ppd->pkeys[i] == pkey)
+ return i;
+ if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+ lim_idx = i;
+ }
+
+ /* did not find 0xffff return 0x7fff idx if found */
+ if (pkey == FULL_MGMT_P_KEY)
+ return lim_idx;
+
+ /* no match... */
+ return -1;
+ }
+
+ pkey &= 0x7fff; /* remove limited/full membership bit */
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+ if ((ppd->pkeys[i] & 0x7fff) == pkey)
+ return i;
+
+ /*
+ * Should not get here, this means hardware failed to validate pkeys.
+ */
+ return -1;
+}
+
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+ u32 pkey, u32 slid, u32 dlid, u8 sc5,
+ const struct ib_grh *old_grh)
+{
+ u64 pbc, pbc_flags = 0;
+ u32 bth0, plen, vl, hwords = 5;
+ u16 lrh0;
+ u8 sl = ibp->sc_to_sl[sc5];
+ struct hfi1_ib_header hdr;
+ struct hfi1_other_headers *ohdr;
+ struct pio_buf *pbuf;
+ struct send_context *ctxt = qp_to_send_context(qp, sc5);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ if (old_grh) {
+ struct ib_grh *grh = &hdr.u.l.grh;
+
+ grh->version_tclass_flow = old_grh->version_tclass_flow;
+ grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+ grh->hop_limit = 0xff;
+ grh->sgid = old_grh->dgid;
+ grh->dgid = old_grh->sgid;
+ ohdr = &hdr.u.l.oth;
+ lrh0 = HFI1_LRH_GRH;
+ hwords += sizeof(struct ib_grh) / sizeof(u32);
+ } else {
+ ohdr = &hdr.u.oth;
+ lrh0 = HFI1_LRH_BTH;
+ }
+
+ lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+ bth0 = pkey | (IB_OPCODE_CNP << 24);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+
+ ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+ ohdr->bth[2] = 0; /* PSN 0 */
+
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(slid);
+
+ plen = 2 /* PBC */ + hwords;
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+ vl = sc_to_vlt(ppd->dd, sc5);
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+ if (ctxt) {
+ pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+ if (pbuf)
+ ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+ &hdr, hwords);
+ }
+}
+
+/*
+ * opa_smp_check() - Do the regular pkey checking, and the additional
+ * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
+ * ("SMA Packet Checks").
+ *
+ * Note that:
+ * - Checks are done using the pkey directly from the packet's BTH,
+ * and specifically _not_ the pkey that we attach to the completion,
+ * which may be different.
+ * - These checks are specifically for "non-local" SMPs (i.e., SMPs
+ * which originated on another node). SMPs which are sent from, and
+ * destined to this node are checked in opa_local_smp_check().
+ *
+ * At the point where opa_smp_check() is called, we know:
+ * - destination QP is QP0
+ *
+ * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+ struct hfi1_qp *qp, u16 slid, struct opa_smp *smp)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ /*
+ * I don't think it's possible for us to get here with sc != 0xf,
+ * but check it to be certain.
+ */
+ if (sc5 != 0xf)
+ return 1;
+
+ if (rcv_pkey_check(ppd, pkey, sc5, slid))
+ return 1;
+
+ /*
+ * At this point we know (and so don't need to check again) that
+ * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
+ * (see ingress_pkey_check).
+ */
+ if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
+ smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+
+ /*
+ * SMPs fall into one of four (disjoint) categories:
+ * SMA request, SMA response, trap, or trap repress.
+ * Our response depends, in part, on which type of
+ * SMP we're processing.
+ *
+ * If this is not an SMA request, or trap repress:
+ * - accept MAD if the port is running an SM
+ * - pkey == FULL_MGMT_P_KEY =>
+ * reply with unsupported method (i.e., just mark
+ * the smp's status field here, and let it be
+ * processed normally)
+ * - pkey != LIM_MGMT_P_KEY =>
+ * increment port recv constraint errors, drop MAD
+ * If this is an SMA request or trap repress:
+ * - pkey != FULL_MGMT_P_KEY =>
+ * increment port recv constraint errors, drop MAD
+ */
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (pkey != FULL_MGMT_P_KEY) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+ break;
+ case IB_MGMT_METHOD_SEND:
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ if (ibp->port_cap_flags & IB_PORT_SM)
+ return 0;
+ if (pkey == FULL_MGMT_P_KEY) {
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ return 0;
+ }
+ if (pkey != LIM_MGMT_P_KEY) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ int opcode;
+ u32 hdrsize = packet->hlen;
+ u32 pad;
+ struct ib_wc wc;
+ u32 qkey;
+ u32 src_qp;
+ u16 dlid, pkey;
+ int mgmt_pkey_idx = -1;
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct hfi1_qp *qp = packet->qp;
+ bool has_grh = rcv_flags & HFI1_HAS_GRH;
+ bool sc4_bit = has_sc4_bit(packet);
+ u8 sc;
+ u32 bth1;
+ int is_mcast;
+ struct ib_grh *grh = NULL;
+
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+ dlid = be16_to_cpu(hdr->lrh[1]);
+ is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) &&
+ (dlid != HFI1_PERMISSIVE_LID);
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ if (unlikely(bth1 & HFI1_BECN_SMASK)) {
+ /*
+ * In pre-B0 h/w the CNP_OPCODE is handled via an
+ * error path (errata 291394).
+ */
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+ u8 sl, sc5;
+
+ sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ sc5 |= sc4_bit;
+ sl = ibp->sc_to_sl[sc5];
+
+ process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
+ }
+
+ /*
+ * The opcode is in the low byte when its in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ opcode &= 0xff;
+
+ pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+ if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
+ u16 slid = be16_to_cpu(hdr->lrh[3]);
+ u8 sc5;
+
+ sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ sc5 |= sc4_bit;
+
+ return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+ }
+ /*
+ * Get the number of bytes the message was padded by
+ * and drop incomplete packets.
+ */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+
+ tlen -= hdrsize + pad + 4;
+
+ /*
+ * Check that the permissive LID is only used on QP0
+ * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+ */
+ if (qp->ibqp.qp_num) {
+ if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE))
+ goto drop;
+ if (qp->ibqp.qp_num > 1) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 slid;
+ u8 sc5;
+
+ sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ sc5 |= sc4_bit;
+
+ slid = be16_to_cpu(hdr->lrh[3]);
+ if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+ /*
+ * Traps will not be sent for packets dropped
+ * by the HW. This is fine, as sending trap
+ * for invalid pkeys is optional according to
+ * IB spec (release 1.3, section 10.9.4)
+ */
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ pkey,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) &
+ 0xF,
+ src_qp, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ return;
+ }
+ } else {
+ /* GSI packet */
+ mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+ if (mgmt_pkey_idx < 0)
+ goto drop;
+
+ }
+ if (unlikely(qkey != qp->qkey)) {
+ hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ src_qp, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ return;
+ }
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (unlikely(qp->ibqp.qp_num == 1 &&
+ (tlen > 2048 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+ goto drop;
+ } else {
+ /* Received on QP0, and so by definition, this is an SMP */
+ struct opa_smp *smp = (struct opa_smp *)data;
+ u16 slid = be16_to_cpu(hdr->lrh[3]);
+ u8 sc5;
+
+ sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ sc5 |= sc4_bit;
+
+ if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+ goto drop;
+
+ if (tlen > 2048)
+ goto drop;
+ if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+ smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ goto drop;
+
+ /* look up SMI pkey */
+ mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+ if (mgmt_pkey_idx < 0)
+ goto drop;
+
+ }
+
+ if (qp->ibqp.qp_num > 1 &&
+ opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ tlen -= sizeof(u32);
+ } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+ } else
+ goto drop;
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ wc.byte_len = tlen + sizeof(struct ib_grh);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & HFI1_R_REUSE_SGE)
+ qp->r_flags &= ~HFI1_R_REUSE_SGE;
+ else {
+ int ret;
+
+ ret = hfi1_get_rwqe(qp, 0);
+ if (ret < 0) {
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->n_vl15_dropped++;
+ return;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= HFI1_R_REUSE_SGE;
+ goto drop;
+ }
+ if (has_grh) {
+ hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+ sizeof(struct ib_grh), 1);
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+ hfi1_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+ return;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = src_qp;
+
+ if (qp->ibqp.qp_type == IB_QPT_GSI ||
+ qp->ibqp.qp_type == IB_QPT_SMI) {
+ if (mgmt_pkey_idx < 0) {
+ if (net_ratelimit()) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+ qp->ibqp.qp_type);
+ mgmt_pkey_idx = 0;
+ }
+ }
+ wc.pkey_index = (unsigned)mgmt_pkey_idx;
+ } else
+ wc.pkey_index = 0;
+
+ wc.slid = be16_to_cpu(hdr->lrh[3]);
+ sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+ sc |= sc4_bit;
+ wc.sl = ibp->sc_to_sl[sc];
+
+ /*
+ * Save the LMC lower bits if the destination LID is a unicast LID.
+ */
+ wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 :
+ dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ return;
+
+drop:
+ ibp->n_pkt_drops++;
+}
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
new file mode 100644
index 000000000000..9071afbd7bf4
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_pages.c
@@ -0,0 +1,156 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "hfi.h"
+
+static void __hfi1_release_user_pages(struct page **p, size_t num_pages,
+ int dirty)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages; i++) {
+ if (dirty)
+ set_page_dirty_lock(p[i]);
+ put_page(p[i]);
+ }
+}
+
+/*
+ * Call with current->mm->mmap_sem held.
+ */
+static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ unsigned long lock_limit;
+ size_t got;
+ int ret;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ for (got = 0; got < num_pages; got += ret) {
+ ret = get_user_pages(current, current->mm,
+ start_page + got * PAGE_SIZE,
+ num_pages - got, 1, 1,
+ p + got, NULL);
+ if (ret < 0)
+ goto bail_release;
+ }
+
+ current->mm->pinned_vm += num_pages;
+
+ ret = 0;
+ goto bail;
+
+bail_release:
+ __hfi1_release_user_pages(p, got, 0);
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_map_page - a safety wrapper around pci_map_page()
+ *
+ */
+dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+
+ return phys;
+}
+
+/**
+ * hfi1_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages. For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+
+ ret = __hfi1_get_user_pages(start_page, num_pages, p);
+
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+void hfi1_release_user_pages(struct page **p, size_t num_pages)
+{
+ if (current->mm) /* during close after signal, mm can be NULL */
+ down_write(&current->mm->mmap_sem);
+
+ __hfi1_release_user_pages(p, num_pages, 1);
+
+ if (current->mm) {
+ current->mm->pinned_vm -= num_pages;
+ up_write(&current->mm->mmap_sem);
+ }
+}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
new file mode 100644
index 000000000000..55526613a522
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -0,0 +1,1444 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "sdma.h"
+#include "verbs.h" /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packet to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
+
+#define req_opcode(x) \
+ (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+ (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+ (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT 0
+#define KDETH_OFFSET_MASK 0x7fff
+#define KDETH_OM_SHIFT 15
+#define KDETH_OM_MASK 0x1
+#define KDETH_TID_SHIFT 16
+#define KDETH_TID_MASK 0x3ff
+#define KDETH_TIDCTRL_SHIFT 26
+#define KDETH_TIDCTRL_MASK 0x3
+#define KDETH_INTR_SHIFT 28
+#define KDETH_INTR_MASK 0x1
+#define KDETH_SH_SHIFT 29
+#define KDETH_SH_MASK 0x1
+#define KDETH_HCRC_UPPER_SHIFT 16
+#define KDETH_HCRC_UPPER_MASK 0xff
+#define KDETH_HCRC_LOWER_SHIFT 24
+#define KDETH_HCRC_LOWER_MASK 0xff
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
+
+#define KDETH_GET(val, field) \
+ (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do { \
+ u32 dwval = le32_to_cpu(dw); \
+ dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+ dwval |= (((val) & KDETH_##field##_MASK) << \
+ KDETH_##field##_SHIFT); \
+ dw = cpu_to_le32(dwval); \
+ } while (0)
+
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \
+ do { \
+ if ((idx) < ARRAY_SIZE((arr))) \
+ (arr)[(idx++)] = sdma_build_ahg_descriptor( \
+ (__force u16)(value), (dw), (bit), \
+ (width)); \
+ else \
+ return -ERANGE; \
+ } while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL 4
+#define KDETH_OM_LARGE 64
+#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+/* Last packet in the request */
+#define USER_SDMA_TXREQ_FLAGS_LAST_PKT (1 << 0)
+
+#define SDMA_REQ_IN_USE 0
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE 2
+#define SDMA_REQ_HAVE_AHG 3
+#define SDMA_REQ_HAS_ERROR 4
+#define SDMA_REQ_DONE_ERROR 5
+
+#define SDMA_PKT_Q_INACTIVE (1 << 0)
+#define SDMA_PKT_Q_ACTIVE (1 << 1)
+#define SDMA_PKT_Q_DEFERRED (1 << 2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct user_sdma_iovec {
+ struct iovec iov;
+ /* number of pages in this vector */
+ unsigned npages;
+ /* array of pinned pages for this vector */
+ struct page **pages;
+ /* offset into the virtual address space of the vector at
+ * which we last left off. */
+ u64 offset;
+};
+
+struct user_sdma_request {
+ struct sdma_req_info info;
+ struct hfi1_user_sdma_pkt_q *pq;
+ struct hfi1_user_sdma_comp_q *cq;
+ /* This is the original header from user space */
+ struct hfi1_pkt_header hdr;
+ /*
+ * Pointer to the SDMA engine for this request.
+ * Since different request could be on different VLs,
+ * each request will need it's own engine pointer.
+ */
+ struct sdma_engine *sde;
+ u8 ahg_idx;
+ u32 ahg[9];
+ /*
+ * KDETH.Offset (Eager) field
+ * We need to remember the initial value so the headers
+ * can be updated properly.
+ */
+ u32 koffset;
+ /*
+ * KDETH.OFFSET (TID) field
+ * The offset can cover multiple packets, depending on the
+ * size of the TID entry.
+ */
+ u32 tidoffset;
+ /*
+ * KDETH.OM
+ * Remember this because the header template always sets it
+ * to 0.
+ */
+ u8 omfactor;
+ /*
+ * pointer to the user's task_struct. We are going to
+ * get a reference to it so we can process io vectors
+ * at a later time.
+ */
+ struct task_struct *user_proc;
+ /*
+ * pointer to the user's mm_struct. We are going to
+ * get a reference to it so it doesn't get freed
+ * since we might not be in process context when we
+ * are processing the iov's.
+ * Using this mm_struct, we can get vma based on the
+ * iov's address (find_vma()).
+ */
+ struct mm_struct *user_mm;
+ /*
+ * We copy the iovs for this request (based on
+ * info.iovcnt). These are only the data vectors
+ */
+ unsigned data_iovs;
+ /* total length of the data in the request */
+ u32 data_len;
+ /* progress index moving along the iovs array */
+ unsigned iov_idx;
+ struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+ /* number of elements copied to the tids array */
+ u16 n_tids;
+ /* TID array values copied from the tid_iov vector */
+ u32 *tids;
+ u16 tididx;
+ u32 sent;
+ u64 seqnum;
+ spinlock_t list_lock;
+ struct list_head txps;
+ unsigned long flags;
+};
+
+struct user_sdma_txreq {
+ /* Packet header for the txreq */
+ struct hfi1_pkt_header hdr;
+ struct sdma_txreq txreq;
+ struct user_sdma_request *req;
+ struct user_sdma_iovec *iovec1;
+ struct user_sdma_iovec *iovec2;
+ u16 flags;
+ unsigned busycount;
+ u64 seqnum;
+};
+
+#define SDMA_DBG(req, fmt, ...) \
+ hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+ (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+ ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...) \
+ hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+ (pq)->subctxt, ##__VA_ARGS__)
+
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
+static void user_sdma_free_request(struct user_sdma_request *);
+static int pin_vector_pages(struct user_sdma_request *,
+ struct user_sdma_iovec *);
+static void unpin_vector_pages(struct user_sdma_iovec *);
+static int check_header_template(struct user_sdma_request *,
+ struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+ struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+ struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct user_sdma_request *,
+ enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+static int defer_packet_queue(
+ struct sdma_engine *,
+ struct iowait *,
+ struct sdma_txreq *,
+ unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+
+static inline int iovec_may_free(struct user_sdma_iovec *iovec,
+ void (*free)(struct user_sdma_iovec *))
+{
+ if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+ free(iovec);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
+{
+ iovec->offset = iovec->iov.iov_len;
+}
+
+static int defer_packet_queue(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *txreq,
+ unsigned seq)
+{
+ struct hfi1_user_sdma_pkt_q *pq =
+ container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+ struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
+ struct user_sdma_txreq *tx =
+ container_of(txreq, struct user_sdma_txreq, txreq);
+
+ if (sdma_progress(sde, seq, txreq)) {
+ if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
+ goto eagain;
+ }
+ /*
+ * We are assuming that if the list is enqueued somewhere, it
+ * is to the dmawait list since that is the only place where
+ * it is supposed to be enqueued.
+ */
+ xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
+ write_seqlock(&dev->iowait_lock);
+ if (list_empty(&pq->busy.list))
+ list_add_tail(&pq->busy.list, &sde->dmawait);
+ write_sequnlock(&dev->iowait_lock);
+ return -EBUSY;
+eagain:
+ return -EAGAIN;
+}
+
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+ struct hfi1_user_sdma_pkt_q *pq =
+ container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+ xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+ wake_up(&wait->wait_dma);
+};
+
+static void sdma_kmem_cache_ctor(void *obj)
+{
+ struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj;
+
+ memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+ int ret = 0;
+ unsigned memsize;
+ char buf[64];
+ struct hfi1_devdata *dd;
+ struct hfi1_user_sdma_comp_q *cq;
+ struct hfi1_user_sdma_pkt_q *pq;
+ unsigned long flags;
+
+ if (!uctxt || !fp) {
+ ret = -EBADF;
+ goto done;
+ }
+
+ if (!hfi1_sdma_comp_ring_size) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ dd = uctxt->dd;
+
+ pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+ if (!pq) {
+ dd_dev_err(dd,
+ "[%u:%u] Failed to allocate SDMA request struct\n",
+ uctxt->ctxt, subctxt_fp(fp));
+ goto pq_nomem;
+ }
+ memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
+ pq->reqs = kmalloc(memsize, GFP_KERNEL);
+ if (!pq->reqs) {
+ dd_dev_err(dd,
+ "[%u:%u] Failed to allocate SDMA request queue (%u)\n",
+ uctxt->ctxt, subctxt_fp(fp), memsize);
+ goto pq_reqs_nomem;
+ }
+ INIT_LIST_HEAD(&pq->list);
+ pq->dd = dd;
+ pq->ctxt = uctxt->ctxt;
+ pq->subctxt = subctxt_fp(fp);
+ pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+ pq->state = SDMA_PKT_Q_INACTIVE;
+ atomic_set(&pq->n_reqs, 0);
+
+ iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+ activate_packet_queue);
+ pq->reqidx = 0;
+ snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
+ subctxt_fp(fp));
+ pq->txreq_cache = kmem_cache_create(buf,
+ sizeof(struct user_sdma_txreq),
+ L1_CACHE_BYTES,
+ SLAB_HWCACHE_ALIGN,
+ sdma_kmem_cache_ctor);
+ if (!pq->txreq_cache) {
+ dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+ uctxt->ctxt);
+ goto pq_txreq_nomem;
+ }
+ user_sdma_pkt_fp(fp) = pq;
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ dd_dev_err(dd,
+ "[%u:%u] Failed to allocate SDMA completion queue\n",
+ uctxt->ctxt, subctxt_fp(fp));
+ goto cq_nomem;
+ }
+
+ memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size,
+ PAGE_SIZE);
+ cq->comps = vmalloc_user(memsize);
+ if (!cq->comps) {
+ dd_dev_err(dd,
+ "[%u:%u] Failed to allocate SDMA completion queue entries\n",
+ uctxt->ctxt, subctxt_fp(fp));
+ goto cq_comps_nomem;
+ }
+ cq->nentries = hfi1_sdma_comp_ring_size;
+ user_sdma_comp_fp(fp) = cq;
+
+ spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+ list_add(&pq->list, &uctxt->sdma_queues);
+ spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+ goto done;
+
+cq_comps_nomem:
+ kfree(cq);
+cq_nomem:
+ kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+ kfree(pq->reqs);
+pq_reqs_nomem:
+ kfree(pq);
+ user_sdma_pkt_fp(fp) = NULL;
+pq_nomem:
+ ret = -ENOMEM;
+done:
+ return ret;
+}
+
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_user_sdma_pkt_q *pq;
+ unsigned long flags;
+
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+ uctxt->ctxt, fd->subctxt);
+ pq = fd->pq;
+ if (pq) {
+ u16 i, j;
+
+ spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+ if (!list_empty(&pq->list))
+ list_del_init(&pq->list);
+ spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+ iowait_sdma_drain(&pq->busy);
+ if (pq->reqs) {
+ for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) &&
+ j < pq->n_max_reqs; j++) {
+ struct user_sdma_request *req = &pq->reqs[j];
+
+ if (test_bit(SDMA_REQ_IN_USE, &req->flags)) {
+ set_comp_state(req, ERROR, -ECOMM);
+ user_sdma_free_request(req);
+ i++;
+ }
+ }
+ kfree(pq->reqs);
+ }
+ if (pq->txreq_cache)
+ kmem_cache_destroy(pq->txreq_cache);
+ kfree(pq);
+ fd->pq = NULL;
+ }
+ if (fd->cq) {
+ if (fd->cq->comps)
+ vfree(fd->cq->comps);
+ kfree(fd->cq);
+ fd->cq = NULL;
+ }
+ return 0;
+}
+
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+ unsigned long dim, unsigned long *count)
+{
+ int ret = 0, i = 0, sent;
+ struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+ struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp);
+ struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp);
+ struct hfi1_devdata *dd = pq->dd;
+ unsigned long idx = 0;
+ u8 pcount = initial_pkt_count;
+ struct sdma_req_info info;
+ struct user_sdma_request *req;
+ u8 opcode, sc, vl;
+
+ if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+ hfi1_cdbg(
+ SDMA,
+ "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+ dd->unit, uctxt->ctxt, subctxt_fp(fp),
+ iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+ ret = -EINVAL;
+ goto done;
+ }
+ ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+ if (ret) {
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+ dd->unit, uctxt->ctxt, subctxt_fp(fp), ret);
+ ret = -EFAULT;
+ goto done;
+ }
+ trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp),
+ (u16 *)&info);
+ if (cq->comps[info.comp_idx].status == QUEUED) {
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
+ dd->unit, uctxt->ctxt, subctxt_fp(fp),
+ info.comp_idx);
+ ret = -EBADSLT;
+ goto done;
+ }
+ if (!info.fragsize) {
+ hfi1_cdbg(SDMA,
+ "[%u:%u:%u:%u] Request does not specify fragsize",
+ dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+ ret = -EINVAL;
+ goto done;
+ }
+ /*
+ * We've done all the safety checks that we can up to this point,
+ * "allocate" the request entry.
+ */
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+ uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+ req = pq->reqs + info.comp_idx;
+ memset(req, 0, sizeof(*req));
+ /* Mark the request as IN_USE before we start filling it in. */
+ set_bit(SDMA_REQ_IN_USE, &req->flags);
+ req->data_iovs = req_iovcnt(info.ctrl) - 1;
+ req->pq = pq;
+ req->cq = cq;
+ INIT_LIST_HEAD(&req->txps);
+ spin_lock_init(&req->list_lock);
+ memcpy(&req->info, &info, sizeof(info));
+
+ if (req_opcode(info.ctrl) == EXPECTED)
+ req->data_iovs--;
+
+ if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+ SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+ MAX_VECTORS_PER_REQ);
+ ret = -EINVAL;
+ goto done;
+ }
+ /* Copy the header from the user buffer */
+ ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+ sizeof(req->hdr));
+ if (ret) {
+ SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+ ret = -EFAULT;
+ goto free_req;
+ }
+
+ /* If Static rate control is not enabled, sanitize the header. */
+ if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+ req->hdr.pbc[2] = 0;
+
+ /* Validate the opcode. Do not trust packets from user space blindly. */
+ opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+ if ((opcode & USER_OPCODE_CHECK_MASK) !=
+ USER_OPCODE_CHECK_VAL) {
+ SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+ ret = -EINVAL;
+ goto free_req;
+ }
+ /*
+ * Validate the vl. Do not trust packets from user space blindly.
+ * VL comes from PBC, SC comes from LRH, and the VL needs to
+ * match the SC look up.
+ */
+ vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+ sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+ (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+ if (vl >= dd->pport->vls_operational ||
+ vl != sc_to_vlt(dd, sc)) {
+ SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+ ret = -EINVAL;
+ goto free_req;
+ }
+
+ /*
+ * Also should check the BTH.lnh. If it says the next header is GRH then
+ * the RXE parsing will be off and will land in the middle of the KDETH
+ * or miss it entirely.
+ */
+ if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+ SDMA_DBG(req, "User tried to pass in a GRH");
+ ret = -EINVAL;
+ goto free_req;
+ }
+
+ req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+ /* Calculate the initial TID offset based on the values of
+ KDETH.OFFSET and KDETH.OM that are passed in. */
+ req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+ (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+ KDETH_OM_LARGE : KDETH_OM_SMALL);
+ SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+ idx++;
+
+ /* Save all the IO vector structures */
+ while (i < req->data_iovs) {
+ memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+ req->iovs[i].offset = 0;
+ req->data_len += req->iovs[i++].iov.iov_len;
+ }
+ SDMA_DBG(req, "total data length %u", req->data_len);
+
+ if (pcount > req->info.npkts)
+ pcount = req->info.npkts;
+ /*
+ * Copy any TID info
+ * User space will provide the TID info only when the
+ * request type is EXPECTED. This is true even if there is
+ * only one packet in the request and the header is already
+ * setup. The reason for the singular TID case is that the
+ * driver needs to perform safety checks.
+ */
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+ if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+ ret = -EINVAL;
+ goto free_req;
+ }
+ req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+ if (!req->tids) {
+ ret = -ENOMEM;
+ goto free_req;
+ }
+ /*
+ * We have to copy all of the tids because they may vary
+ * in size and, therefore, the TID count might not be
+ * equal to the pkt count. However, there is no way to
+ * tell at this point.
+ */
+ ret = copy_from_user(req->tids, iovec[idx].iov_base,
+ ntids * sizeof(*req->tids));
+ if (ret) {
+ SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+ ntids, ret);
+ ret = -EFAULT;
+ goto free_req;
+ }
+ req->n_tids = ntids;
+ idx++;
+ }
+
+ /* Have to select the engine */
+ req->sde = sdma_select_engine_vl(dd,
+ (u32)(uctxt->ctxt + subctxt_fp(fp)),
+ vl);
+ if (!req->sde || !sdma_running(req->sde)) {
+ ret = -ECOMM;
+ goto free_req;
+ }
+
+ /* We don't need an AHG entry if the request contains only one packet */
+ if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+ int ahg = sdma_ahg_alloc(req->sde);
+
+ if (likely(ahg >= 0)) {
+ req->ahg_idx = (u8)ahg;
+ set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+ }
+ }
+
+ set_comp_state(req, QUEUED, 0);
+ /* Send the first N packets in the request to buy us some time */
+ sent = user_sdma_send_pkts(req, pcount);
+ if (unlikely(sent < 0)) {
+ if (sent != -EBUSY) {
+ ret = sent;
+ goto send_err;
+ } else
+ sent = 0;
+ }
+ atomic_inc(&pq->n_reqs);
+
+ if (sent < req->info.npkts) {
+ /* Take the references to the user's task and mm_struct */
+ get_task_struct(current);
+ req->user_proc = current;
+
+ /*
+ * This is a somewhat blocking send implementation.
+ * The driver will block the caller until all packets of the
+ * request have been submitted to the SDMA engine. However, it
+ * will not wait for send completions.
+ */
+ while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+ ret = user_sdma_send_pkts(req, pcount);
+ if (ret < 0) {
+ if (ret != -EBUSY)
+ goto send_err;
+ wait_event_interruptible_timeout(
+ pq->busy.wait_dma,
+ (pq->state == SDMA_PKT_Q_ACTIVE),
+ msecs_to_jiffies(
+ SDMA_IOWAIT_TIMEOUT));
+ }
+ }
+
+ }
+ ret = 0;
+ *count += idx;
+ goto done;
+send_err:
+ set_comp_state(req, ERROR, ret);
+free_req:
+ user_sdma_free_request(req);
+done:
+ return ret;
+}
+
+static inline u32 compute_data_length(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx)
+{
+ /*
+ * Determine the proper size of the packet data.
+ * The size of the data of the first packet is in the header
+ * template. However, it includes the header and ICRC, which need
+ * to be subtracted.
+ * The size of the remaining packets is the minimum of the frag
+ * size (MTU) or remaining data in the request.
+ */
+ u32 len;
+
+ if (!req->seqnum) {
+ len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+ (sizeof(tx->hdr) - 4));
+ } else if (req_opcode(req->info.ctrl) == EXPECTED) {
+ u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
+ PAGE_SIZE;
+ /* Get the data length based on the remaining space in the
+ * TID pair. */
+ len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
+ /* If we've filled up the TID pair, move to the next one. */
+ if (unlikely(!len) && ++req->tididx < req->n_tids &&
+ req->tids[req->tididx]) {
+ tidlen = EXP_TID_GET(req->tids[req->tididx],
+ LEN) * PAGE_SIZE;
+ req->tidoffset = 0;
+ len = min_t(u32, tidlen, req->info.fragsize);
+ }
+ /* Since the TID pairs map entire pages, make sure that we
+ * are not going to try to send more data that we have
+ * remaining. */
+ len = min(len, req->data_len - req->sent);
+ } else
+ len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+ SDMA_DBG(req, "Data Length = %u", len);
+ return len;
+}
+
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+ /* (Size of complete header - size of PBC) + 4B ICRC + data length */
+ return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
+}
+
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+ int ret = 0;
+ unsigned npkts = 0;
+ struct user_sdma_txreq *tx = NULL;
+ struct hfi1_user_sdma_pkt_q *pq = NULL;
+ struct user_sdma_iovec *iovec = NULL;
+
+ if (!req->pq) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ pq = req->pq;
+
+ /*
+ * Check if we might have sent the entire request already
+ */
+ if (unlikely(req->seqnum == req->info.npkts)) {
+ if (!list_empty(&req->txps))
+ goto dosend;
+ goto done;
+ }
+
+ if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+ maxpkts = req->info.npkts - req->seqnum;
+
+ while (npkts < maxpkts) {
+ u32 datalen = 0, queued = 0, data_sent = 0;
+ u64 iov_offset = 0;
+
+ /*
+ * Check whether any of the completions have come back
+ * with errors. If so, we are not going to process any
+ * more packets from this request.
+ */
+ if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+ set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ ret = -EFAULT;
+ goto done;
+ }
+
+ tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+ if (!tx) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ tx->flags = 0;
+ tx->req = req;
+ tx->busycount = 0;
+ tx->iovec1 = NULL;
+ tx->iovec2 = NULL;
+
+ if (req->seqnum == req->info.npkts - 1)
+ tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
+
+ /*
+ * Calculate the payload size - this is min of the fragment
+ * (MTU) size or the remaining bytes in the request but only
+ * if we have payload data.
+ */
+ if (req->data_len) {
+ iovec = &req->iovs[req->iov_idx];
+ if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+ if (++req->iov_idx == req->data_iovs) {
+ ret = -EFAULT;
+ goto free_txreq;
+ }
+ iovec = &req->iovs[req->iov_idx];
+ WARN_ON(iovec->offset);
+ }
+
+ /*
+ * This request might include only a header and no user
+ * data, so pin pages only if there is data and it the
+ * pages have not been pinned already.
+ */
+ if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
+ ret = pin_vector_pages(req, iovec);
+ if (ret)
+ goto free_tx;
+ }
+
+ tx->iovec1 = iovec;
+ datalen = compute_data_length(req, tx);
+ if (!datalen) {
+ SDMA_DBG(req,
+ "Request has data but pkt len is 0");
+ ret = -EFAULT;
+ goto free_tx;
+ }
+ }
+
+ if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+ if (!req->seqnum) {
+ u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+ u32 lrhlen = get_lrh_len(req->hdr, datalen);
+ /*
+ * Copy the request header into the tx header
+ * because the HW needs a cacheline-aligned
+ * address.
+ * This copy can be optimized out if the hdr
+ * member of user_sdma_request were also
+ * cacheline aligned.
+ */
+ memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
+ if (PBC2LRH(pbclen) != lrhlen) {
+ pbclen = (pbclen & 0xf000) |
+ LRH2PBC(lrhlen);
+ tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+ }
+ ret = sdma_txinit_ahg(&tx->txreq,
+ SDMA_TXREQ_F_AHG_COPY,
+ sizeof(tx->hdr) + datalen,
+ req->ahg_idx, 0, NULL, 0,
+ user_sdma_txreq_cb);
+ if (ret)
+ goto free_tx;
+ ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+ &tx->hdr,
+ sizeof(tx->hdr));
+ if (ret)
+ goto free_txreq;
+ } else {
+ int changes;
+
+ changes = set_txreq_header_ahg(req, tx,
+ datalen);
+ if (changes < 0)
+ goto free_tx;
+ sdma_txinit_ahg(&tx->txreq,
+ SDMA_TXREQ_F_USE_AHG,
+ datalen, req->ahg_idx, changes,
+ req->ahg, sizeof(req->hdr),
+ user_sdma_txreq_cb);
+ }
+ } else {
+ ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+ datalen, user_sdma_txreq_cb);
+ if (ret)
+ goto free_tx;
+ /*
+ * Modify the header for this packet. This only needs
+ * to be done if we are not going to use AHG. Otherwise,
+ * the HW will do it based on the changes we gave it
+ * during sdma_txinit_ahg().
+ */
+ ret = set_txreq_header(req, tx, datalen);
+ if (ret)
+ goto free_txreq;
+ }
+
+ /*
+ * If the request contains any data vectors, add up to
+ * fragsize bytes to the descriptor.
+ */
+ while (queued < datalen &&
+ (req->sent + data_sent) < req->data_len) {
+ unsigned long base, offset;
+ unsigned pageidx, len;
+
+ base = (unsigned long)iovec->iov.iov_base;
+ offset = ((base + iovec->offset + iov_offset) &
+ ~PAGE_MASK);
+ pageidx = (((iovec->offset + iov_offset +
+ base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+ len = offset + req->info.fragsize > PAGE_SIZE ?
+ PAGE_SIZE - offset : req->info.fragsize;
+ len = min((datalen - queued), len);
+ ret = sdma_txadd_page(pq->dd, &tx->txreq,
+ iovec->pages[pageidx],
+ offset, len);
+ if (ret) {
+ dd_dev_err(pq->dd,
+ "SDMA txreq add page failed %d\n",
+ ret);
+ iovec_set_complete(iovec);
+ goto free_txreq;
+ }
+ iov_offset += len;
+ queued += len;
+ data_sent += len;
+ if (unlikely(queued < datalen &&
+ pageidx == iovec->npages &&
+ req->iov_idx < req->data_iovs - 1)) {
+ iovec->offset += iov_offset;
+ iovec = &req->iovs[++req->iov_idx];
+ if (!iovec->pages) {
+ ret = pin_vector_pages(req, iovec);
+ if (ret)
+ goto free_txreq;
+ }
+ iov_offset = 0;
+ tx->iovec2 = iovec;
+
+ }
+ }
+ /*
+ * The txreq was submitted successfully so we can update
+ * the counters.
+ */
+ req->koffset += datalen;
+ if (req_opcode(req->info.ctrl) == EXPECTED)
+ req->tidoffset += datalen;
+ req->sent += data_sent;
+ if (req->data_len) {
+ if (tx->iovec1 && !tx->iovec2)
+ tx->iovec1->offset += iov_offset;
+ else if (tx->iovec2)
+ tx->iovec2->offset += iov_offset;
+ }
+ /*
+ * It is important to increment this here as it is used to
+ * generate the BTH.PSN and, therefore, can't be bulk-updated
+ * outside of the loop.
+ */
+ tx->seqnum = req->seqnum++;
+ list_add_tail(&tx->txreq.list, &req->txps);
+ npkts++;
+ }
+dosend:
+ ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+ if (list_empty(&req->txps))
+ if (req->seqnum == req->info.npkts) {
+ set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+ /*
+ * The txreq has already been submitted to the HW queue
+ * so we can free the AHG entry now. Corruption will not
+ * happen due to the sequential manner in which
+ * descriptors are processed.
+ */
+ if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+ sdma_ahg_free(req->sde, req->ahg_idx);
+ }
+ goto done;
+free_txreq:
+ sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+ kmem_cache_free(pq->txreq_cache, tx);
+done:
+ return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+ const unsigned long addr = (unsigned long) iov->iov_base;
+ const unsigned long len = iov->iov_len;
+ const unsigned long spage = addr & PAGE_MASK;
+ const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+ return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+static int pin_vector_pages(struct user_sdma_request *req,
+ struct user_sdma_iovec *iovec) {
+ int ret = 0;
+ unsigned pinned;
+
+ iovec->npages = num_user_pages(&iovec->iov);
+ iovec->pages = kzalloc(sizeof(*iovec->pages) *
+ iovec->npages, GFP_KERNEL);
+ if (!iovec->pages) {
+ SDMA_DBG(req, "Failed page array alloc");
+ ret = -ENOMEM;
+ goto done;
+ }
+ /* If called by the kernel thread, use the user's mm */
+ if (current->flags & PF_KTHREAD)
+ use_mm(req->user_proc->mm);
+ pinned = get_user_pages_fast(
+ (unsigned long)iovec->iov.iov_base,
+ iovec->npages, 0, iovec->pages);
+ /* If called by the kernel thread, unuse the user's mm */
+ if (current->flags & PF_KTHREAD)
+ unuse_mm(req->user_proc->mm);
+ if (pinned != iovec->npages) {
+ SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned,
+ iovec->npages);
+ ret = -EFAULT;
+ goto pfree;
+ }
+ goto done;
+pfree:
+ unpin_vector_pages(iovec);
+done:
+ return ret;
+}
+
+static void unpin_vector_pages(struct user_sdma_iovec *iovec)
+{
+ unsigned i;
+
+ if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) {
+ hfi1_cdbg(SDMA,
+ "the complete vector has not been sent yet %llu %zu",
+ iovec->offset, iovec->iov.iov_len);
+ return;
+ }
+ for (i = 0; i < iovec->npages; i++)
+ if (iovec->pages[i])
+ put_page(iovec->pages[i]);
+ kfree(iovec->pages);
+ iovec->pages = NULL;
+ iovec->npages = 0;
+ iovec->offset = 0;
+}
+
+static int check_header_template(struct user_sdma_request *req,
+ struct hfi1_pkt_header *hdr, u32 lrhlen,
+ u32 datalen)
+{
+ /*
+ * Perform safety checks for any type of packet:
+ * - transfer size is multiple of 64bytes
+ * - packet length is multiple of 4bytes
+ * - entire request length is multiple of 4bytes
+ * - packet length is not larger than MTU size
+ *
+ * These checks are only done for the first packet of the
+ * transfer since the header is "given" to us by user space.
+ * For the remainder of the packets we compute the values.
+ */
+ if (req->info.fragsize % PIO_BLOCK_SIZE ||
+ lrhlen & 0x3 || req->data_len & 0x3 ||
+ lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+ return -EINVAL;
+
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ /*
+ * The header is checked only on the first packet. Furthermore,
+ * we ensure that at least one TID entry is copied when the
+ * request is submitted. Therefore, we don't have to verify that
+ * tididx points to something sane.
+ */
+ u32 tidval = req->tids[req->tididx],
+ tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
+ tididx = EXP_TID_GET(tidval, IDX),
+ tidctrl = EXP_TID_GET(tidval, CTRL),
+ tidoff;
+ __le32 kval = hdr->kdeth.ver_tid_offset;
+
+ tidoff = KDETH_GET(kval, OFFSET) *
+ (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+ KDETH_OM_LARGE : KDETH_OM_SMALL);
+ /*
+ * Expected receive packets have the following
+ * additional checks:
+ * - offset is not larger than the TID size
+ * - TIDCtrl values match between header and TID array
+ * - TID indexes match between header and TID array
+ */
+ if ((tidoff + datalen > tidlen) ||
+ KDETH_GET(kval, TIDCTRL) != tidctrl ||
+ KDETH_GET(kval, TID) != tididx)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Correctly set the BTH.PSN field based on type of
+ * transfer - eager packets can just increment the PSN but
+ * expected packets encode generation and sequence in the
+ * BTH.PSN field so just incrementing will result in errors.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+ u32 val = be32_to_cpu(bthpsn),
+ mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+ 0xffffffull),
+ psn = val & mask;
+ if (expct)
+ psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ else
+ psn = psn + frags;
+ return psn & mask;
+}
+
+static int set_txreq_header(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx, u32 datalen)
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct hfi1_pkt_header *hdr = &tx->hdr;
+ u16 pbclen;
+ int ret;
+ u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+
+ /* Copy the header template to the request before modification */
+ memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+ /*
+ * Check if the PBC and LRH length are mismatched. If so
+ * adjust both in the header.
+ */
+ pbclen = le16_to_cpu(hdr->pbc[0]);
+ if (PBC2LRH(pbclen) != lrhlen) {
+ pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+ hdr->pbc[0] = cpu_to_le16(pbclen);
+ hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+ /*
+ * Third packet
+ * This is the first packet in the sequence that has
+ * a "static" size that can be used for the rest of
+ * the packets (besides the last one).
+ */
+ if (unlikely(req->seqnum == 2)) {
+ /*
+ * From this point on the lengths in both the
+ * PBC and LRH are the same until the last
+ * packet.
+ * Adjust the template so we don't have to update
+ * every packet
+ */
+ req->hdr.pbc[0] = hdr->pbc[0];
+ req->hdr.lrh[2] = hdr->lrh[2];
+ }
+ }
+ /*
+ * We only have to modify the header if this is not the
+ * first packet in the request. Otherwise, we use the
+ * header given to us.
+ */
+ if (unlikely(!req->seqnum)) {
+ ret = check_header_template(req, hdr, lrhlen, datalen);
+ if (ret)
+ return ret;
+ goto done;
+
+ }
+
+ hdr->bth[2] = cpu_to_be32(
+ set_pkt_bth_psn(hdr->bth[2],
+ (req_opcode(req->info.ctrl) == EXPECTED),
+ req->seqnum));
+
+ /* Set ACK request on last packet */
+ if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+ hdr->bth[2] |= cpu_to_be32(1UL<<31);
+
+ /* Set the new offset */
+ hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+ /* Expected packets have to fill in the new TID information */
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ tidval = req->tids[req->tididx];
+ /*
+ * If the offset puts us at the end of the current TID,
+ * advance everything.
+ */
+ if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE)) {
+ req->tidoffset = 0;
+ /* Since we don't copy all the TIDs, all at once,
+ * we have to check again. */
+ if (++req->tididx > req->n_tids - 1 ||
+ !req->tids[req->tididx]) {
+ return -EINVAL;
+ }
+ tidval = req->tids[req->tididx];
+ }
+ req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+ KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+ /* Set KDETH.TIDCtrl based on value for this TID. */
+ KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+ EXP_TID_GET(tidval, CTRL));
+ /* Set KDETH.TID based on value for this TID */
+ KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+ EXP_TID_GET(tidval, IDX));
+ /* Clear KDETH.SH only on the last packet */
+ if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+ KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+ /*
+ * Set the KDETH.OFFSET and KDETH.OM based on size of
+ * transfer.
+ */
+ SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+ req->tidoffset, req->tidoffset / req->omfactor,
+ !!(req->omfactor - KDETH_OM_SMALL));
+ KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+ req->tidoffset / req->omfactor);
+ KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+ !!(req->omfactor - KDETH_OM_SMALL));
+ }
+done:
+ trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+ req->info.comp_idx, hdr, tidval);
+ return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx, u32 len)
+{
+ int diff = 0;
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct hfi1_pkt_header *hdr = &req->hdr;
+ u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+ u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+
+ if (PBC2LRH(pbclen) != lrhlen) {
+ /* PBC.PbcLengthDWs */
+ AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+ cpu_to_le16(LRH2PBC(lrhlen)));
+ /* LRH.PktLen (we need the full 16 bits due to byte swap) */
+ AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+ cpu_to_be16(lrhlen >> 2));
+ }
+
+ /*
+ * Do the common updates
+ */
+ /* BTH.PSN and BTH.A */
+ val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+ (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+ if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+ val32 |= 1UL << 31;
+ AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+ AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+ /* KDETH.Offset */
+ AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+ cpu_to_le16(req->koffset & 0xffff));
+ AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+ cpu_to_le16(req->koffset >> 16));
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ __le16 val;
+
+ tidval = req->tids[req->tididx];
+
+ /*
+ * If the offset puts us at the end of the current TID,
+ * advance everything.
+ */
+ if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE)) {
+ req->tidoffset = 0;
+ /* Since we don't copy all the TIDs, all at once,
+ * we have to check again. */
+ if (++req->tididx > req->n_tids - 1 ||
+ !req->tids[req->tididx]) {
+ return -EINVAL;
+ }
+ tidval = req->tids[req->tididx];
+ }
+ req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE) >=
+ KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+ KDETH_OM_SMALL;
+ /* KDETH.OM and KDETH.OFFSET (TID) */
+ AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+ ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+ ((req->tidoffset / req->omfactor) & 0x7fff)));
+ /* KDETH.TIDCtrl, KDETH.TID */
+ val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+ (EXP_TID_GET(tidval, IDX) & 0x3ff));
+ /* Clear KDETH.SH on last packet */
+ if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
+ val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+ INTR) >> 16);
+ val &= cpu_to_le16(~(1U << 13));
+ AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+ } else
+ AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+ }
+
+ trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+ req->info.comp_idx, req->sde->this_idx,
+ req->ahg_idx, req->ahg, diff, tidval);
+ return diff;
+}
+
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
+ int drain)
+{
+ struct user_sdma_txreq *tx =
+ container_of(txreq, struct user_sdma_txreq, txreq);
+ struct user_sdma_request *req = tx->req;
+ struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL;
+ u64 tx_seqnum;
+
+ if (unlikely(!req || !pq))
+ return;
+
+ if (tx->iovec1)
+ iovec_may_free(tx->iovec1, unpin_vector_pages);
+ if (tx->iovec2)
+ iovec_may_free(tx->iovec2, unpin_vector_pages);
+
+ tx_seqnum = tx->seqnum;
+ kmem_cache_free(pq->txreq_cache, tx);
+
+ if (status != SDMA_TXREQ_S_OK) {
+ dd_dev_err(pq->dd, "SDMA completion with error %d", status);
+ set_comp_state(req, ERROR, status);
+ set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+ /* Do not free the request until the sender loop has ack'ed
+ * the error and we've seen all txreqs. */
+ if (tx_seqnum == ACCESS_ONCE(req->seqnum) &&
+ test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
+ atomic_dec(&pq->n_reqs);
+ user_sdma_free_request(req);
+ }
+ } else {
+ if (tx_seqnum == req->info.npkts - 1) {
+ /* We've sent and completed all packets in this
+ * request. Signal completion to the user */
+ atomic_dec(&pq->n_reqs);
+ set_comp_state(req, COMPLETE, 0);
+ user_sdma_free_request(req);
+ }
+ }
+ if (!atomic_read(&pq->n_reqs))
+ xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+}
+
+static void user_sdma_free_request(struct user_sdma_request *req)
+{
+ if (!list_empty(&req->txps)) {
+ struct sdma_txreq *t, *p;
+
+ list_for_each_entry_safe(t, p, &req->txps, list) {
+ struct user_sdma_txreq *tx =
+ container_of(t, struct user_sdma_txreq, txreq);
+ list_del_init(&t->list);
+ sdma_txclean(req->pq->dd, t);
+ kmem_cache_free(req->pq->txreq_cache, tx);
+ }
+ }
+ if (req->data_iovs) {
+ int i;
+
+ for (i = 0; i < req->data_iovs; i++)
+ if (req->iovs[i].npages && req->iovs[i].pages)
+ unpin_vector_pages(&req->iovs[i]);
+ }
+ if (req->user_proc)
+ put_task_struct(req->user_proc);
+ kfree(req->tids);
+ clear_bit(SDMA_REQ_IN_USE, &req->flags);
+}
+
+static inline void set_comp_state(struct user_sdma_request *req,
+ enum hfi1_sdma_comp_state state,
+ int ret)
+{
+ SDMA_DBG(req, "Setting completion status %u %d", state, ret);
+ req->cq->comps[req->info.comp_idx].status = state;
+ if (state == ERROR)
+ req->cq->comps[req->info.comp_idx].errcode = -ret;
+ trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
+ req->pq->subctxt, req->info.comp_idx,
+ state, ret);
+}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
new file mode 100644
index 000000000000..fa4422553e23
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_sdma.h
@@ -0,0 +1,89 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+
+#define EXP_TID_TIDLEN_MASK 0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT 0
+#define EXP_TID_TIDCTRL_MASK 0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK 0x7FFULL
+#define EXP_TID_TIDIDX_SHIFT 22
+#define EXP_TID_GET(tid, field) \
+ (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+extern uint extended_psn;
+
+struct hfi1_user_sdma_pkt_q {
+ struct list_head list;
+ unsigned ctxt;
+ unsigned subctxt;
+ u16 n_max_reqs;
+ atomic_t n_reqs;
+ u16 reqidx;
+ struct hfi1_devdata *dd;
+ struct kmem_cache *txreq_cache;
+ struct user_sdma_request *reqs;
+ struct iowait busy;
+ unsigned state;
+};
+
+struct hfi1_user_sdma_comp_q {
+ u16 nentries;
+ struct hfi1_sdma_comp_entry *comps;
+};
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+ unsigned long *);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
new file mode 100644
index 000000000000..41bb59eb001c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -0,0 +1,2145 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+ S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+ "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+ "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+ "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+ uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+ "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static void verbs_sdma_complete(
+ struct sdma_txreq *cookie,
+ int status,
+ int drained);
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; hfi1_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = HFI1_POST_RECV_OK,
+ [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+ HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK |
+ HFI1_PROCESS_NEXT_SEND_OK,
+ [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+ HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+ HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+ [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV |
+ HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+};
+
+struct hfi1_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext
+ *ibucontext)
+{
+ return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ */
+const u8 hdr_len_by_opcode[256] = {
+ /* RC */
+ [IB_OPCODE_RC_SEND_FIRST] = 12 + 8,
+ [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_SEND_LAST] = 12 + 8,
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_SEND_ONLY] = 12 + 8,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4,
+ [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28,
+ [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
+ [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
+ [IB_OPCODE_UC_SEND_LAST] = 12 + 8,
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_SEND_ONLY] = 12 + 8,
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8,
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12
+};
+
+static const opcode_handler opcode_handler_tbl[256] = {
+ /* RC */
+ [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv,
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv,
+ /* CNP */
+ [IB_OPCODE_CNP] = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void hfi1_copy_sge(
+ struct hfi1_sge_state *ss,
+ void *data, u32 length,
+ int release)
+{
+ struct hfi1_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ memcpy(sge->vaddr, data, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ hfi1_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= HFI1_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release)
+{
+ struct hfi1_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ hfi1_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= HFI1_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
+/**
+ * post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+ struct hfi1_swqe *wqe;
+ u32 next;
+ int i;
+ int j;
+ int acc;
+ struct hfi1_lkey_table *rkt;
+ struct hfi1_pd *pd;
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+
+ /* IB spec says that num_sge == 0 is OK. */
+ if (unlikely(wr->num_sge > qp->s_max_sge))
+ return -EINVAL;
+
+ ppd = &dd->pport[qp->port_num - 1];
+ ibp = &ppd->ibport_data;
+
+ /*
+ * Don't allow RDMA reads or atomic operations on UC or
+ * undefined operations.
+ * Make sure buffer is large enough to hold the result for atomics.
+ */
+ if (wr->opcode == IB_WR_FAST_REG_MR) {
+ return -EINVAL;
+ } else if (qp->ibqp.qp_type == IB_QPT_UC) {
+ if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+ return -EINVAL;
+ } else if (qp->ibqp.qp_type != IB_QPT_RC) {
+ /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
+ if (wr->opcode != IB_WR_SEND &&
+ wr->opcode != IB_WR_SEND_WITH_IMM)
+ return -EINVAL;
+ /* Check UD destination address PD */
+ if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+ return -EINVAL;
+ } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+ return -EINVAL;
+ else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ return -EINVAL;
+ else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+ return -EINVAL;
+
+ next = qp->s_head + 1;
+ if (next >= qp->s_size)
+ next = 0;
+ if (next == qp->s_last)
+ return -ENOMEM;
+
+ rkt = &to_idev(qp->ibqp.device)->lk_table;
+ pd = to_ipd(qp->ibqp.pd);
+ wqe = get_swqe_ptr(qp, qp->s_head);
+ wqe->wr = *wr;
+ wqe->length = 0;
+ j = 0;
+ if (wr->num_sge) {
+ acc = wr->opcode >= IB_WR_RDMA_READ ?
+ IB_ACCESS_LOCAL_WRITE : 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 length = wr->sg_list[i].length;
+ int ok;
+
+ if (length == 0)
+ continue;
+ ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j],
+ &wr->sg_list[i], acc);
+ if (!ok)
+ goto bail_inval_free;
+ wqe->length += length;
+ j++;
+ }
+ wqe->wr.num_sge = j;
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_RC) {
+ if (wqe->length > 0x80000000U)
+ goto bail_inval_free;
+ } else {
+ struct hfi1_ah *ah = to_iah(wr->wr.ud.ah);
+
+ atomic_inc(&ah->refcount);
+ }
+ wqe->ssn = qp->s_ssn++;
+ qp->s_head = next;
+
+ return 0;
+
+bail_inval_free:
+ /* release mr holds */
+ while (j) {
+ struct hfi1_sge *sge = &wqe->sg_list[--j];
+
+ hfi1_put_mr(sge->mr);
+ }
+ return -EINVAL;
+}
+
+/**
+ * post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ int err = 0;
+ int call_send;
+ unsigned long flags;
+ unsigned nreq = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Check that state is OK to post send. */
+ if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return -EINVAL;
+ }
+
+ /* sq empty and not list -> call send */
+ call_send = qp->s_head == qp->s_last && !wr->next;
+
+ for (; wr; wr = wr->next) {
+ err = post_one_send(qp, wr);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto bail;
+ }
+ nreq++;
+ }
+bail:
+ if (nreq && !call_send)
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (nreq && call_send)
+ hfi1_do_send(&qp->s_iowait.iowork);
+ return err;
+}
+
+/**
+ * post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ struct hfi1_rwq *wq = qp->r_rq.wq;
+ unsigned long flags;
+ int ret;
+
+ /* Check that state is OK to post receive. */
+ if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (; wr; wr = wr->next) {
+ struct hfi1_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ next = wq->head + 1;
+ if (next >= qp->r_rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/*
+ * Make sure the QP is ready and able to accept the given opcode.
+ */
+static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp;
+
+ if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK))
+ goto dropit;
+ if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+ (opcode == IB_OPCODE_CNP))
+ return 1;
+dropit:
+ ibp = &packet->rcd->ppd->ibport_data;
+ ibp->n_pkt_drops++;
+ return 0;
+}
+
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+ u16 lid;
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == HFI1_LRH_BTH)
+ packet->ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH) {
+ u32 vtf;
+
+ packet->ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ packet->rcv_flags |= HFI1_HAS_GRH;
+ } else
+ goto drop;
+
+ trace_input_ibhdr(rcd->dd, hdr);
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK;
+ lid = be16_to_cpu(hdr->lrh[1]);
+ if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) &&
+ (lid != HFI1_PERMISSIVE_LID))) {
+ struct hfi1_mcast *mcast;
+ struct hfi1_mcast_qp *p;
+
+ if (lnh != HFI1_LRH_GRH)
+ goto drop;
+ mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid);
+ if (mcast == NULL)
+ goto drop;
+ list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+ packet->qp = p->qp;
+ spin_lock(&packet->qp->r_lock);
+ if (likely((qp_ok(opcode, packet))))
+ opcode_handler_tbl[opcode](packet);
+ spin_unlock(&packet->qp->r_lock);
+ }
+ /*
+ * Notify hfi1_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else {
+ rcu_read_lock();
+ packet->qp = hfi1_lookup_qpn(ibp, qp_num);
+ if (!packet->qp) {
+ rcu_read_unlock();
+ goto drop;
+ }
+ spin_lock(&packet->qp->r_lock);
+ if (likely((qp_ok(opcode, packet))))
+ opcode_handler_tbl[opcode](packet);
+ spin_unlock(&packet->qp->r_lock);
+ rcu_read_unlock();
+ }
+ return;
+
+drop:
+ ibp->n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+ struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+ struct list_head *list = &dev->memwait;
+ struct hfi1_qp *qp = NULL;
+ struct iowait *wait;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ if (!list_empty(list)) {
+ wait = list_first_entry(list, struct iowait, list);
+ qp = container_of(wait, struct hfi1_qp, s_iowait);
+ list_del_init(&qp->s_iowait.list);
+ /* refcount held until actual wake up */
+ if (!list_empty(list))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ }
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+ if (qp)
+ hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM);
+}
+
+void update_sge(struct hfi1_sge_state *ss, u32 length)
+{
+ struct hfi1_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= HFI1_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
+static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+ struct hfi1_qp *qp)
+{
+ struct verbs_txreq *tx;
+ unsigned long flags;
+
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ if (!tx) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ write_seqlock(&dev->iowait_lock);
+ if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK &&
+ list_empty(&qp->s_iowait.list)) {
+ dev->n_txwait++;
+ qp->s_flags |= HFI1_S_WAIT_TX;
+ list_add_tail(&qp->s_iowait.list, &dev->txwait);
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX);
+ atomic_inc(&qp->refcount);
+ }
+ qp->s_flags &= ~HFI1_S_BUSY;
+ write_sequnlock(&dev->iowait_lock);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ tx = ERR_PTR(-EBUSY);
+ }
+ return tx;
+}
+
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+ struct hfi1_qp *qp)
+{
+ struct verbs_txreq *tx;
+
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ if (!tx) {
+ /* call slow path to get the lock */
+ tx = __get_txreq(dev, qp);
+ if (IS_ERR(tx))
+ return tx;
+ }
+ tx->qp = qp;
+ return tx;
+}
+
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+ struct hfi1_ibdev *dev;
+ struct hfi1_qp *qp;
+ unsigned long flags;
+ unsigned int seq;
+
+ qp = tx->qp;
+ dev = to_idev(qp->ibqp.device);
+
+ if (tx->mr) {
+ hfi1_put_mr(tx->mr);
+ tx->mr = NULL;
+ }
+ sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+ /* Free verbs_txreq and return to slab cache */
+ kmem_cache_free(dev->verbs_txreq_cache, tx);
+
+ do {
+ seq = read_seqbegin(&dev->iowait_lock);
+ if (!list_empty(&dev->txwait)) {
+ struct iowait *wait;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ /* Wake up first QP wanting a free struct */
+ wait = list_first_entry(&dev->txwait, struct iowait,
+ list);
+ qp = container_of(wait, struct hfi1_qp, s_iowait);
+ list_del_init(&qp->s_iowait.list);
+ /* refcount held until actual wake up */
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+ hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX);
+ break;
+ }
+ } while (read_seqretry(&dev->iowait_lock, seq));
+}
+
+/*
+ * This is called with progress side lock held.
+ */
+/* New API */
+static void verbs_sdma_complete(
+ struct sdma_txreq *cookie,
+ int status,
+ int drained)
+{
+ struct verbs_txreq *tx =
+ container_of(cookie, struct verbs_txreq, txreq);
+ struct hfi1_qp *qp = tx->qp;
+
+ spin_lock(&qp->s_lock);
+ if (tx->wqe)
+ hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+ else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ struct hfi1_ib_header *hdr;
+
+ hdr = &tx->phdr.hdr;
+ hfi1_rc_send_complete(qp, hdr);
+ }
+ if (drained) {
+ /*
+ * This happens when the send engine notes
+ * a QP in the error state and cannot
+ * do the flush work until that QP's
+ * sdma work has finished.
+ */
+ if (qp->s_flags & HFI1_S_WAIT_DMA) {
+ qp->s_flags &= ~HFI1_S_WAIT_DMA;
+ hfi1_schedule_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+
+ hfi1_put_txreq(tx);
+}
+
+static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+ write_seqlock(&dev->iowait_lock);
+ if (list_empty(&qp->s_iowait.list)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= HFI1_S_WAIT_KMEM;
+ list_add_tail(&qp->s_iowait.list, &dev->memwait);
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM);
+ atomic_inc(&qp->refcount);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~HFI1_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ return ret;
+}
+
+/*
+ * This routine calls txadds for each sg entry.
+ *
+ * Add failures will revert the sge cursor
+ */
+static int build_verbs_ulp_payload(
+ struct sdma_engine *sde,
+ struct hfi1_sge_state *ss,
+ u32 length,
+ struct verbs_txreq *tx)
+{
+ struct hfi1_sge *sg_list = ss->sg_list;
+ struct hfi1_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 len;
+ int ret = 0;
+
+ while (length) {
+ len = ss->sge.length;
+ if (len > length)
+ len = length;
+ if (len > ss->sge.sge_length)
+ len = ss->sge.sge_length;
+ WARN_ON_ONCE(len == 0);
+ ret = sdma_txadd_kvaddr(
+ sde->dd,
+ &tx->txreq,
+ ss->sge.vaddr,
+ len);
+ if (ret)
+ goto bail_txadd;
+ update_sge(ss, len);
+ length -= len;
+ }
+ return ret;
+bail_txadd:
+ /* unwind cursor */
+ ss->sge = sge;
+ ss->num_sge = num_sge;
+ ss->sg_list = sg_list;
+ return ret;
+}
+
+/*
+ * Build the number of DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ * the tx desc is freed without having been submitted to the ring
+ *
+ * This routine insures the following all the helper routine
+ * calls succeed.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+ struct sdma_engine *sde,
+ struct hfi1_sge_state *ss,
+ u32 length,
+ struct verbs_txreq *tx,
+ struct ahg_ib_header *ahdr,
+ u64 pbc)
+{
+ int ret = 0;
+ struct hfi1_pio_header *phdr;
+ u16 hdrbytes = tx->hdr_dwords << 2;
+
+ phdr = &tx->phdr;
+ if (!ahdr->ahgcount) {
+ ret = sdma_txinit_ahg(
+ &tx->txreq,
+ ahdr->tx_flags,
+ hdrbytes + length,
+ ahdr->ahgidx,
+ 0,
+ NULL,
+ 0,
+ verbs_sdma_complete);
+ if (ret)
+ goto bail_txadd;
+ phdr->pbc = cpu_to_le64(pbc);
+ memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc));
+ /* add the header */
+ ret = sdma_txadd_kvaddr(
+ sde->dd,
+ &tx->txreq,
+ &tx->phdr,
+ tx->hdr_dwords << 2);
+ if (ret)
+ goto bail_txadd;
+ } else {
+ struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth;
+ struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth;
+
+ /* needed in rc_send_complete() */
+ phdr->hdr.lrh[0] = ahdr->ibh.lrh[0];
+ if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) {
+ sohdr = &ahdr->ibh.u.l.oth;
+ dohdr = &phdr->hdr.u.l.oth;
+ }
+ /* opcode */
+ dohdr->bth[0] = sohdr->bth[0];
+ /* PSN/ACK */
+ dohdr->bth[2] = sohdr->bth[2];
+ ret = sdma_txinit_ahg(
+ &tx->txreq,
+ ahdr->tx_flags,
+ length,
+ ahdr->ahgidx,
+ ahdr->ahgcount,
+ ahdr->ahgdesc,
+ hdrbytes,
+ verbs_sdma_complete);
+ if (ret)
+ goto bail_txadd;
+ }
+
+ /* add the ulp payload - if any. ss can be NULL for acks */
+ if (ss)
+ ret = build_verbs_ulp_payload(sde, ss, length, tx);
+bail_txadd:
+ return ret;
+}
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc)
+{
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct verbs_txreq *tx;
+ struct sdma_txreq *stx;
+ u64 pbc_flags = 0;
+ struct sdma_engine *sde;
+ u8 sc5 = qp->s_sc;
+ int ret;
+
+ if (!list_empty(&qp->s_iowait.tx_head)) {
+ stx = list_first_entry(
+ &qp->s_iowait.tx_head,
+ struct sdma_txreq,
+ list);
+ list_del_init(&stx->list);
+ tx = container_of(stx, struct verbs_txreq, txreq);
+ ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx);
+ if (unlikely(ret == -ECOMM))
+ goto bail_ecomm;
+ return ret;
+ }
+
+ tx = get_txreq(dev, qp);
+ if (IS_ERR(tx))
+ goto bail_tx;
+
+ if (!qp->s_hdr->sde) {
+ tx->sde = sde = qp_to_sdma_engine(qp, sc5);
+ if (!sde)
+ goto bail_no_sde;
+ } else
+ tx->sde = sde = qp->s_hdr->sde;
+
+ if (likely(pbc == 0)) {
+ u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+ /* No vl15 here */
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+ }
+ tx->wqe = qp->s_wqe;
+ tx->mr = qp->s_rdma_mr;
+ if (qp->s_rdma_mr)
+ qp->s_rdma_mr = NULL;
+ tx->hdr_dwords = hdrwords + 2;
+ ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc);
+ if (unlikely(ret))
+ goto bail_build;
+ trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+ ret = sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq);
+ if (unlikely(ret == -ECOMM))
+ goto bail_ecomm;
+ return ret;
+
+bail_no_sde:
+ hfi1_put_txreq(tx);
+bail_ecomm:
+ /* The current one got "sent" */
+ return 0;
+bail_build:
+ /* kmalloc or mapping fail */
+ hfi1_put_txreq(tx);
+ return wait_kmem(dev, qp);
+bail_tx:
+ return PTR_ERR(tx);
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * Note that as soon as want_buffer() is called and
+ * possibly before it returns, sc_piobufavail()
+ * could be called. Therefore, put QP on the I/O wait list before
+ * enabling the PIO avail interrupt.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+ write_seqlock(&dev->iowait_lock);
+ if (list_empty(&qp->s_iowait.list)) {
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ int was_empty;
+
+ dev->n_piowait++;
+ qp->s_flags |= HFI1_S_WAIT_PIO;
+ was_empty = list_empty(&sc->piowait);
+ list_add_tail(&qp->s_iowait.list, &sc->piowait);
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO);
+ atomic_inc(&qp->refcount);
+ /* counting: only call wantpiobuf_intr if first user */
+ if (was_empty)
+ hfi1_sc_wantpiobuf_intr(sc, 1);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~HFI1_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
+ u8 vl;
+
+ vl = sc_to_vlt(dd, sc5);
+ if (vl >= ppd->vls_supported && vl != 15)
+ return NULL;
+ return dd->vld[vl].sc;
+}
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 *hdr = (u32 *)&ahdr->ibh;
+ u64 pbc_flags = 0;
+ u32 sc5;
+ unsigned long flags = 0;
+ struct send_context *sc;
+ struct pio_buf *pbuf;
+ int wc_status = IB_WC_SUCCESS;
+
+ /* vl15 special case taken care of in ud.c */
+ sc5 = qp->s_sc;
+ sc = qp_to_send_context(qp, sc5);
+
+ if (!sc)
+ return -EINVAL;
+ if (likely(pbc == 0)) {
+ u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+ }
+ pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+ if (unlikely(pbuf == NULL)) {
+ if (ppd->host_link_state != HLS_UP_ACTIVE) {
+ /*
+ * If we have filled the PIO buffers to capacity and are
+ * not in an active state this request is not going to
+ * go out to so just complete it with an error or else a
+ * ULP or the core may be stuck waiting.
+ */
+ hfi1_cdbg(
+ PIO,
+ "alloc failed. state not active, completing");
+ wc_status = IB_WC_GENERAL_ERR;
+ goto pio_bail;
+ } else {
+ /*
+ * This is a normal occurrence. The PIO buffs are full
+ * up but we are still happily sending, well we could be
+ * so lets continue to queue the request.
+ */
+ hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+ return no_bufs_available(qp, sc);
+ }
+ }
+
+ if (len == 0) {
+ pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+ } else {
+ if (ss) {
+ seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4);
+ while (len) {
+ void *addr = ss->sge.vaddr;
+ u32 slen = ss->sge.length;
+
+ if (slen > len)
+ slen = len;
+ update_sge(ss, slen);
+ seg_pio_copy_mid(pbuf, addr, slen);
+ len -= slen;
+ }
+ seg_pio_copy_end(pbuf);
+ }
+ }
+
+ trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+
+ if (qp->s_rdma_mr) {
+ hfi1_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+
+pio_bail:
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(qp, qp->s_wqe, wc_status);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_rc_send_complete(qp, &ahdr->ibh);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ return 0;
+}
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.1l.7.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+ u16 mkey = pkey & PKEY_LOW_15_MASK;
+ u16 ment = ent & PKEY_LOW_15_MASK;
+
+ if (mkey == ment) {
+ /*
+ * If pkey[15] is set (full partition member),
+ * is bit 15 in the corresponding table element
+ * clear (limited member)?
+ */
+ if (pkey & PKEY_MEMBER_MASK)
+ return !!(ent & PKEY_MEMBER_MASK);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * egress_pkey_check - return 0 if hdr's pkey matches according to the
+ * criteria in the OPAv1 spec., section 9.11.7.
+ */
+static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
+ struct hfi1_ib_header *hdr,
+ struct hfi1_qp *qp)
+{
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_devdata *dd;
+ int i = 0;
+ u16 pkey;
+ u8 lnh, sc5 = qp->s_sc;
+
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+ return 0;
+
+ /* locate the pkey within the headers */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ ohdr = &hdr->u.oth;
+
+ pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+
+ /* Is the pkey = 0x0, or 0x8000? */
+ if ((pkey & PKEY_LOW_15_MASK) == 0)
+ goto bad;
+
+ /* The most likely matching pkey has index qp->s_pkey_index */
+ if (unlikely(!egress_pkey_matches_entry(pkey,
+ ppd->pkeys[qp->s_pkey_index]))) {
+ /* no match - try the entire table */
+ for (; i < MAX_PKEY_VALUES; i++) {
+ if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+ break;
+ }
+ }
+
+ if (i < MAX_PKEY_VALUES)
+ return 0;
+bad:
+ incr_cntr64(&ppd->port_xmit_constraint_errors);
+ dd = ppd->dd;
+ if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
+ u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+ dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
+ dd->err_info_xmit_constraint.slid = slid;
+ dd->err_info_xmit_constraint.pkey = pkey;
+ }
+ return 1;
+}
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ahdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ u32 plen;
+ int ret;
+ int pio = 0;
+ unsigned long flags = 0;
+ u32 dwords = (len + 3) >> 2;
+
+ /*
+ * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+ * can defer SDMA restart until link goes ACTIVE without
+ * worrying about just how we got there.
+ */
+ if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
+ !(dd->flags & HFI1_HAS_SEND_DMA))
+ pio = 1;
+
+ ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp);
+ if (unlikely(ret)) {
+ /*
+ * The value we are returning here does not get propagated to
+ * the verbs caller. Thus we need to complete the request with
+ * error otherwise the caller could be sitting waiting on the
+ * completion event. Only do this for PIO. SDMA has its own
+ * mechanism for handling the errors. So for SDMA we can just
+ * return.
+ */
+ if (pio) {
+ hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+ __func__);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ return -EINVAL;
+ }
+
+ /*
+ * Calculate the send buffer trigger address.
+ * The +2 counts for the pbc control qword
+ */
+ plen = hdrwords + dwords + 2;
+
+ if (pio) {
+ ret = dd->process_pio_send(
+ qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+ } else {
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", hdrwords, len);
+#endif
+ ret = dd->process_dma_send(
+ qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+ }
+
+ return ret;
+}
+
+static int query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+ memset(props, 0, sizeof(*props));
+
+ props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+
+ props->page_size_cap = PAGE_SIZE;
+ props->vendor_id =
+ dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+ props->vendor_part_id = dd->pcidev->device;
+ props->hw_ver = dd->minrev;
+ props->sys_image_guid = ib_hfi1_sys_image_guid;
+ props->max_mr_size = ~0ULL;
+ props->max_qp = hfi1_max_qps;
+ props->max_qp_wr = hfi1_max_qp_wrs;
+ props->max_sge = hfi1_max_sges;
+ props->max_sge_rd = hfi1_max_sges;
+ props->max_cq = hfi1_max_cqs;
+ props->max_ah = hfi1_max_ahs;
+ props->max_cqe = hfi1_max_cqes;
+ props->max_mr = dev->lk_table.max;
+ props->max_fmr = dev->lk_table.max;
+ props->max_map_per_fmr = 32767;
+ props->max_pd = hfi1_max_pds;
+ props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+ props->max_qp_init_rd_atom = 255;
+ /* props->max_res_rd_atom */
+ props->max_srq = hfi1_max_srqs;
+ props->max_srq_wr = hfi1_max_srq_wrs;
+ props->max_srq_sge = hfi1_max_srq_sges;
+ /* props->local_ca_ack_delay */
+ props->atomic_cap = IB_ATOMIC_GLOB;
+ props->max_pkeys = hfi1_get_npkeys(dd);
+ props->max_mcast_grp = hfi1_max_mcast_grps;
+ props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+
+ return 0;
+}
+
+static inline u16 opa_speed_to_ib(u16 in)
+{
+ u16 out = 0;
+
+ if (in & OPA_LINK_SPEED_25G)
+ out |= IB_SPEED_EDR;
+ if (in & OPA_LINK_SPEED_12_5G)
+ out |= IB_SPEED_FDR;
+
+ return out;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+ switch (in) {
+ case OPA_LINK_WIDTH_1X:
+ /* map 2x and 3x to 1x as they don't exist in IB */
+ case OPA_LINK_WIDTH_2X:
+ case OPA_LINK_WIDTH_3X:
+ return IB_WIDTH_1X;
+ default: /* link down or unknown, return our largest width */
+ case OPA_LINK_WIDTH_4X:
+ return IB_WIDTH_4X;
+ }
+}
+
+static int query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 lid = ppd->lid;
+
+ memset(props, 0, sizeof(*props));
+ props->lid = lid ? lid : 0;
+ props->lmc = ppd->lmc;
+ props->sm_lid = ibp->sm_lid;
+ props->sm_sl = ibp->sm_sl;
+ /* OPA logical states match IB logical states */
+ props->state = driver_lstate(ppd);
+ props->phys_state = hfi1_ibphys_portstate(ppd);
+ props->port_cap_flags = ibp->port_cap_flags;
+ props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = hfi1_get_npkeys(dd);
+ props->bad_pkey_cntr = ibp->pkey_violations;
+ props->qkey_viol_cntr = ibp->qkey_violations;
+ props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+ /* see rate_show() in ib core/sysfs.c */
+ props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+ props->max_vl_num = ppd->vls_supported;
+ props->init_type_reply = 0;
+
+ /* Once we are a "first class" citizen and have added the OPA MTUs to
+ * the core we can advertise the larger MTU enum to the ULPs, for now
+ * advertise only 4K.
+ *
+ * Those applications which are either OPA aware or pass the MTU enum
+ * from the Path Records to us will get the new 8k MTU. Those that
+ * attempt to process the MTU enum may fail in various ways.
+ */
+ props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
+ 4096 : hfi1_max_mtu), IB_MTU_4096);
+ props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
+ mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+ props->subnet_timeout = ibp->subnet_timeout;
+
+ return 0;
+}
+
+static int port_immutable(struct ib_device *ibdev, u8 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ int err;
+
+ err = query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ memset(immutable, 0, sizeof(*immutable));
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+ immutable->max_mad_size = OPA_MGMT_MAD_SIZE;
+
+ return 0;
+}
+
+static int modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(device);
+ unsigned i;
+ int ret;
+
+ if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ ret = -EOPNOTSUPP;
+ goto bail;
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ memcpy(device->node_desc, device_modify->node_desc, 64);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+ hfi1_node_desc_chg(ibp);
+ }
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+ ib_hfi1_sys_image_guid =
+ cpu_to_be64(device_modify->sys_image_guid);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+ hfi1_sys_guid_chg(ibp);
+ }
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int modify_port(struct ib_device *ibdev, u8 port,
+ int port_modify_mask, struct ib_port_modify *props)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ int ret = 0;
+
+ ibp->port_cap_flags |= props->set_port_cap_mask;
+ ibp->port_cap_flags &= ~props->clr_port_cap_mask;
+ if (props->set_port_cap_mask || props->clr_port_cap_mask)
+ hfi1_cap_mask_chg(ibp);
+ if (port_modify_mask & IB_PORT_SHUTDOWN) {
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+ OPA_LINKDOWN_REASON_UNKNOWN);
+ ret = set_link_state(ppd, HLS_DN_DOWNDEF);
+ }
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ ibp->qkey_violations = 0;
+ return ret;
+}
+
+static int query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int ret = 0;
+
+ if (!port || port > dd->num_pports)
+ ret = -EINVAL;
+ else {
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ gid->global.subnet_prefix = ibp->gid_prefix;
+ if (index == 0)
+ gid->global.interface_id = cpu_to_be64(ppd->guid);
+ else if (index < HFI1_GUIDS_PER_PORT)
+ gid->global.interface_id = ibp->guids[index - 1];
+ else
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static struct ib_pd *alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+ struct hfi1_pd *pd;
+ struct ib_pd *ret;
+
+ /*
+ * This is actually totally arbitrary. Some correctness tests
+ * assume there's a maximum number of PDs that can be allocated.
+ * We don't actually have this limit, but we fail the test if
+ * we allow allocations of more than we report for this value.
+ */
+
+ pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock(&dev->n_pds_lock);
+ if (dev->n_pds_allocated == hfi1_max_pds) {
+ spin_unlock(&dev->n_pds_lock);
+ kfree(pd);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_pds_allocated++;
+ spin_unlock(&dev->n_pds_lock);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = udata != NULL;
+
+ ret = &pd->ibpd;
+
+bail:
+ return ret;
+}
+
+static int dealloc_pd(struct ib_pd *ibpd)
+{
+ struct hfi1_pd *pd = to_ipd(ibpd);
+ struct hfi1_ibdev *dev = to_idev(ibpd->device);
+
+ spin_lock(&dev->n_pds_lock);
+ dev->n_pds_allocated--;
+ spin_unlock(&dev->n_pds_lock);
+
+ kfree(pd);
+
+ return 0;
+}
+
+/*
+ * convert ah port,sl to sc
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
+
+ return ibp->sl_to_sc[ah->sl];
+}
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ u8 sc5;
+
+ /* A multicast address requires a GRH (see ch. 8.4.1). */
+ if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE &&
+ ah_attr->dlid != HFI1_PERMISSIVE_LID &&
+ !(ah_attr->ah_flags & IB_AH_GRH))
+ goto bail;
+ if ((ah_attr->ah_flags & IB_AH_GRH) &&
+ ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT)
+ goto bail;
+ if (ah_attr->dlid == 0)
+ goto bail;
+ if (ah_attr->port_num < 1 ||
+ ah_attr->port_num > ibdev->phys_port_cnt)
+ goto bail;
+ if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
+ ib_rate_to_mbps(ah_attr->static_rate) < 0)
+ goto bail;
+ if (ah_attr->sl >= OPA_MAX_SLS)
+ goto bail;
+ /* test the mapping for validity */
+ ibp = to_iport(ibdev, ah_attr->port_num);
+ ppd = ppd_from_ibp(ibp);
+ sc5 = ibp->sl_to_sc[ah_attr->sl];
+ dd = dd_from_ppd(ppd);
+ if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+ goto bail;
+ return 0;
+bail:
+ return -EINVAL;
+}
+
+/**
+ * create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ struct hfi1_ah *ah;
+ struct ib_ah *ret;
+ struct hfi1_ibdev *dev = to_idev(pd->device);
+ unsigned long flags;
+
+ if (hfi1_check_ah(pd->device, ah_attr)) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
+ if (!ah) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ if (dev->n_ahs_allocated == hfi1_max_ahs) {
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+ kfree(ah);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_ahs_allocated++;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ /* ib_create_ah() will initialize ah->ibah. */
+ ah->attr = *ah_attr;
+ atomic_set(&ah->refcount, 0);
+
+ ret = &ah->ibah;
+
+bail:
+ return ret;
+}
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+ struct ib_ah_attr attr;
+ struct ib_ah *ah = ERR_PTR(-EINVAL);
+ struct hfi1_qp *qp0;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.dlid = dlid;
+ attr.port_num = ppd_from_ibp(ibp)->port;
+ rcu_read_lock();
+ qp0 = rcu_dereference(ibp->qp[0]);
+ if (qp0)
+ ah = ib_create_ah(qp0->ibqp.pd, &attr);
+ rcu_read_unlock();
+ return ah;
+}
+
+/**
+ * destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int destroy_ah(struct ib_ah *ibah)
+{
+ struct hfi1_ibdev *dev = to_idev(ibah->device);
+ struct hfi1_ah *ah = to_iah(ibah);
+ unsigned long flags;
+
+ if (atomic_read(&ah->refcount) != 0)
+ return -EBUSY;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ dev->n_ahs_allocated--;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ kfree(ah);
+
+ return 0;
+}
+
+static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct hfi1_ah *ah = to_iah(ibah);
+
+ if (hfi1_check_ah(ibah->device, ah_attr))
+ return -EINVAL;
+
+ ah->attr = *ah_attr;
+
+ return 0;
+}
+
+static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct hfi1_ah *ah = to_iah(ibah);
+
+ *ah_attr = ah->attr;
+
+ return 0;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+ return ARRAY_SIZE(dd->pport[0].pkeys);
+}
+
+static int query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int ret;
+
+ if (index >= hfi1_get_npkeys(dd)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *pkey = hfi1_get_pkey(to_iport(ibdev, port), index);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the driver
+ */
+
+static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct hfi1_ucontext *context;
+ struct ib_ucontext *ret;
+
+ context = kmalloc(sizeof(*context), GFP_KERNEL);
+ if (!context) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ ret = &context->ibucontext;
+
+bail:
+ return ret;
+}
+
+static int dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(to_iucontext(context));
+ return 0;
+}
+
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
+ int i;
+
+ for (i = 0; i < sz; i++) {
+ ibp->sl_to_sc[i] = i;
+ ibp->sc_to_sl[i] = i;
+ }
+
+ spin_lock_init(&ibp->lock);
+ /* Set the prefix to the default value (see ch. 4.1.1) */
+ ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+ ibp->sm_lid = 0;
+ /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+ ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+ IB_PORT_CAP_MASK_NOTICE_SUP;
+ ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+ RCU_INIT_POINTER(ibp->qp[0], NULL);
+ RCU_INIT_POINTER(ibp->qp[1], NULL);
+}
+
+static void verbs_txreq_kmem_cache_ctor(void *obj)
+{
+ struct verbs_txreq *tx = (struct verbs_txreq *)obj;
+
+ memset(tx, 0, sizeof(*tx));
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ struct ib_device *ibdev = &dev->ibdev;
+ struct hfi1_pportdata *ppd = dd->pport;
+ unsigned i, lk_tab_size;
+ int ret;
+ size_t lcpysz = IB_DEVICE_NAME_MAX;
+ u16 descq_cnt;
+
+ ret = hfi1_qp_init(dev);
+ if (ret)
+ goto err_qp_init;
+
+
+ for (i = 0; i < dd->num_pports; i++)
+ init_ibport(ppd + i);
+
+ /* Only need to initialize non-zero fields. */
+ spin_lock_init(&dev->n_pds_lock);
+ spin_lock_init(&dev->n_ahs_lock);
+ spin_lock_init(&dev->n_cqs_lock);
+ spin_lock_init(&dev->n_qps_lock);
+ spin_lock_init(&dev->n_srqs_lock);
+ spin_lock_init(&dev->n_mcast_grps_lock);
+ init_timer(&dev->mem_timer);
+ dev->mem_timer.function = mem_timer;
+ dev->mem_timer.data = (unsigned long) dev;
+
+ /*
+ * The top hfi1_lkey_table_size bits are used to index the
+ * table. The lower 8 bits can be owned by the user (copied from
+ * the LKEY). The remaining bits act as a generation number or tag.
+ */
+ spin_lock_init(&dev->lk_table.lock);
+ dev->lk_table.max = 1 << hfi1_lkey_table_size;
+ /* ensure generation is at least 4 bits (keys.c) */
+ if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) {
+ dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
+ hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS);
+ hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS;
+ }
+ lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+ dev->lk_table.table = (struct hfi1_mregion __rcu **)
+ vmalloc(lk_tab_size);
+ if (dev->lk_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_lk;
+ }
+ RCU_INIT_POINTER(dev->dma_mr, NULL);
+ for (i = 0; i < dev->lk_table.max; i++)
+ RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
+ INIT_LIST_HEAD(&dev->pending_mmaps);
+ spin_lock_init(&dev->pending_lock);
+ seqlock_init(&dev->iowait_lock);
+ dev->mmap_offset = PAGE_SIZE;
+ spin_lock_init(&dev->mmap_offset_lock);
+ INIT_LIST_HEAD(&dev->txwait);
+ INIT_LIST_HEAD(&dev->memwait);
+
+ descq_cnt = sdma_get_descq_cnt();
+
+ /* SLAB_HWCACHE_ALIGN for AHG */
+ dev->verbs_txreq_cache = kmem_cache_create("hfi1_vtxreq_cache",
+ sizeof(struct verbs_txreq),
+ 0, SLAB_HWCACHE_ALIGN,
+ verbs_txreq_kmem_cache_ctor);
+ if (!dev->verbs_txreq_cache) {
+ ret = -ENOMEM;
+ goto err_verbs_txreq;
+ }
+
+ /*
+ * The system image GUID is supposed to be the same for all
+ * HFIs in a single system but since there can be other
+ * device types in the system, we can't be sure this is unique.
+ */
+ if (!ib_hfi1_sys_image_guid)
+ ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+ lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
+ strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
+ ibdev->owner = THIS_MODULE;
+ ibdev->node_guid = cpu_to_be64(ppd->guid);
+ ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION;
+ ibdev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ ibdev->node_type = RDMA_NODE_IB_CA;
+ ibdev->phys_port_cnt = dd->num_pports;
+ ibdev->num_comp_vectors = 1;
+ ibdev->dma_device = &dd->pcidev->dev;
+ ibdev->query_device = query_device;
+ ibdev->modify_device = modify_device;
+ ibdev->query_port = query_port;
+ ibdev->modify_port = modify_port;
+ ibdev->query_pkey = query_pkey;
+ ibdev->query_gid = query_gid;
+ ibdev->alloc_ucontext = alloc_ucontext;
+ ibdev->dealloc_ucontext = dealloc_ucontext;
+ ibdev->alloc_pd = alloc_pd;
+ ibdev->dealloc_pd = dealloc_pd;
+ ibdev->create_ah = create_ah;
+ ibdev->destroy_ah = destroy_ah;
+ ibdev->modify_ah = modify_ah;
+ ibdev->query_ah = query_ah;
+ ibdev->create_srq = hfi1_create_srq;
+ ibdev->modify_srq = hfi1_modify_srq;
+ ibdev->query_srq = hfi1_query_srq;
+ ibdev->destroy_srq = hfi1_destroy_srq;
+ ibdev->create_qp = hfi1_create_qp;
+ ibdev->modify_qp = hfi1_modify_qp;
+ ibdev->query_qp = hfi1_query_qp;
+ ibdev->destroy_qp = hfi1_destroy_qp;
+ ibdev->post_send = post_send;
+ ibdev->post_recv = post_receive;
+ ibdev->post_srq_recv = hfi1_post_srq_receive;
+ ibdev->create_cq = hfi1_create_cq;
+ ibdev->destroy_cq = hfi1_destroy_cq;
+ ibdev->resize_cq = hfi1_resize_cq;
+ ibdev->poll_cq = hfi1_poll_cq;
+ ibdev->req_notify_cq = hfi1_req_notify_cq;
+ ibdev->get_dma_mr = hfi1_get_dma_mr;
+ ibdev->reg_phys_mr = hfi1_reg_phys_mr;
+ ibdev->reg_user_mr = hfi1_reg_user_mr;
+ ibdev->dereg_mr = hfi1_dereg_mr;
+ ibdev->alloc_mr = hfi1_alloc_mr;
+ ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list;
+ ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list;
+ ibdev->alloc_fmr = hfi1_alloc_fmr;
+ ibdev->map_phys_fmr = hfi1_map_phys_fmr;
+ ibdev->unmap_fmr = hfi1_unmap_fmr;
+ ibdev->dealloc_fmr = hfi1_dealloc_fmr;
+ ibdev->attach_mcast = hfi1_multicast_attach;
+ ibdev->detach_mcast = hfi1_multicast_detach;
+ ibdev->process_mad = hfi1_process_mad;
+ ibdev->mmap = hfi1_mmap;
+ ibdev->dma_ops = &hfi1_dma_mapping_ops;
+ ibdev->get_port_immutable = port_immutable;
+
+ strncpy(ibdev->node_desc, init_utsname()->nodename,
+ sizeof(ibdev->node_desc));
+
+ ret = ib_register_device(ibdev, hfi1_create_port_files);
+ if (ret)
+ goto err_reg;
+
+ ret = hfi1_create_agents(dev);
+ if (ret)
+ goto err_agents;
+
+ ret = hfi1_verbs_register_sysfs(dd);
+ if (ret)
+ goto err_class;
+
+ goto bail;
+
+err_class:
+ hfi1_free_agents(dev);
+err_agents:
+ ib_unregister_device(ibdev);
+err_reg:
+err_verbs_txreq:
+ kmem_cache_destroy(dev->verbs_txreq_cache);
+ vfree(dev->lk_table.table);
+err_lk:
+ hfi1_qp_exit(dev);
+err_qp_init:
+ dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+bail:
+ return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ struct ib_device *ibdev = &dev->ibdev;
+
+ hfi1_verbs_unregister_sysfs(dd);
+
+ hfi1_free_agents(dev);
+
+ ib_unregister_device(ibdev);
+
+ if (!list_empty(&dev->txwait))
+ dd_dev_err(dd, "txwait list not empty!\n");
+ if (!list_empty(&dev->memwait))
+ dd_dev_err(dd, "memwait list not empty!\n");
+ if (dev->dma_mr)
+ dd_dev_err(dd, "DMA MR not NULL!\n");
+
+ hfi1_qp_exit(dev);
+ del_timer_sync(&dev->mem_timer);
+ kmem_cache_destroy(dev->verbs_txreq_cache);
+ vfree(dev->lk_table.table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp)
+{
+ if (hfi1_send_ok(qp)) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
+ }
+}
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+
+ if (packet->qp->ibqp.qp_type == IB_QPT_UC)
+ hfi1_uc_rcv(packet);
+ else if (packet->qp->ibqp.qp_type == IB_QPT_UD)
+ hfi1_ud_rcv(packet);
+ else
+ ibp->n_pkt_drops++;
+}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
new file mode 100644
index 000000000000..ed903a93baf7
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -0,0 +1,1151 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC 16
+#define HFI1_GUIDS_PER_PORT 5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_hfi1_state_ops[]) */
+#define HFI1_POST_SEND_OK 0x01
+#define HFI1_POST_RECV_OK 0x02
+#define HFI1_PROCESS_RECV_OK 0x04
+#define HFI1_PROCESS_SEND_OK 0x08
+#define HFI1_PROCESS_NEXT_SEND_OK 0x10
+#define HFI1_FLUSH_SEND 0x20
+#define HFI1_FLUSH_RECV 0x40
+#define HFI1_PROCESS_OR_FLUSH_SEND \
+ (HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK (1 << 31)
+#define IB_BTH_SOLICITED (1 << 23)
+#define IB_BTH_MIG_REQ (1 << 22)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+
+#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+ HFI1_HAS_GRH = (1 << 0),
+};
+
+struct ib_reth {
+ __be64 vaddr;
+ __be32 rkey;
+ __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+ __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+ __be32 rkey;
+ __be64 swap_data;
+ __be64 compare_data;
+} __packed;
+
+union ib_ehdrs {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be32 atomic_ack_eth[2];
+ } at;
+ __be32 imm_data;
+ __be32 aeth;
+ struct ib_atomic_eth atomic_eth;
+} __packed;
+
+struct hfi1_other_headers {
+ __be32 bth[3];
+ union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct hfi1_ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct hfi1_other_headers oth;
+ } l;
+ struct hfi1_other_headers oth;
+ } u;
+} __packed;
+
+struct ahg_ib_header {
+ struct sdma_engine *sde;
+ u32 ahgdesc[2];
+ u16 tx_flags;
+ u8 ahgcount;
+ u8 ahgidx;
+ struct hfi1_ib_header ibh;
+};
+
+struct hfi1_pio_header {
+ __le64 pbc;
+ struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * used for force cacheline alignment for AHG
+ */
+struct tx_pio_header {
+ struct hfi1_pio_header phdr;
+} ____cacheline_aligned;
+
+/*
+ * There is one struct hfi1_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct hfi1_mcast_qp.
+ */
+struct hfi1_mcast_qp {
+ struct list_head list;
+ struct hfi1_qp *qp;
+};
+
+struct hfi1_mcast {
+ struct rb_node rb_node;
+ union ib_gid mgid;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/* Protection domain */
+struct hfi1_pd {
+ struct ib_pd ibpd;
+ int user; /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct hfi1_ah {
+ struct ib_ah ibah;
+ struct ib_ah_attr attr;
+ atomic_t refcount;
+};
+
+/*
+ * This structure is used by hfi1_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct hfi1_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct hfi1_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ union {
+ /* these are actually size ibcq.cqe + 1 */
+ struct ib_uverbs_wc uqueue[0];
+ struct ib_wc kqueue[0];
+ };
+};
+
+/*
+ * The completion queue structure.
+ */
+struct hfi1_cq {
+ struct ib_cq ibcq;
+ struct kthread_work comptask;
+ struct hfi1_devdata *dd;
+ spinlock_t lock; /* protect changes in this struct */
+ u8 notify;
+ u8 triggered;
+ struct hfi1_cq_wc *queue;
+ struct hfi1_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * Used by the verbs layer.
+ */
+struct hfi1_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of hfi1_segs that fit in a page. */
+#define HFI1_SEGSZ (PAGE_SIZE / sizeof(struct hfi1_seg))
+
+struct hfi1_segarray {
+ struct hfi1_seg segs[HFI1_SEGSZ];
+};
+
+struct hfi1_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of hfi1_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ u8 page_shift; /* 0 - non unform/non powerof2 sizes */
+ u8 lkey_published; /* in global table */
+ struct completion comp; /* complete when refcount goes to zero */
+ atomic_t refcount;
+ struct hfi1_segarray *map[0]; /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct hfi1_sge {
+ struct hfi1_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct hfi1_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct hfi1_mregion mr; /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct hfi1_swqe {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ struct hfi1_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct hfi1_rwqe {
+ u64 wr_id;
+ u8 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct hfi1_rwq {
+ u32 head; /* new work requests posted to the head */
+ u32 tail; /* receives pull requests from here. */
+ struct hfi1_rwqe wq[0];
+};
+
+struct hfi1_rq {
+ struct hfi1_rwq *wq;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+ /* protect changes in this struct */
+ spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct hfi1_srq {
+ struct ib_srq ibsrq;
+ struct hfi1_rq rq;
+ struct hfi1_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+struct hfi1_sge_state {
+ struct hfi1_sge *sg_list; /* next SGE to be used if any */
+ struct hfi1_sge sge; /* progress state for the current SGE */
+ u32 total_len;
+ u8 num_sge;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct hfi1_ack_entry {
+ u8 opcode;
+ u8 sent;
+ u32 psn;
+ u32 lpsn;
+ union {
+ struct hfi1_sge rdma_sge;
+ u64 atomic_data;
+ };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct hfi1_qp {
+ struct ib_qp ibqp;
+ /* read mostly fields above and below */
+ struct ib_ah_attr remote_ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ struct hfi1_qp __rcu *next; /* link list for QPN hash table */
+ struct hfi1_swqe *s_wq; /* send work queue */
+ struct hfi1_mmap_info *ip;
+ struct ahg_ib_header *s_hdr; /* next packet header to send */
+ u8 s_sc; /* SC[0..4] for next packet */
+ unsigned long timeout_jiffies; /* computed from timeout */
+
+ enum ib_mtu path_mtu;
+ int srate_mbps; /* s_srate (below) converted to Mbit/s */
+ u32 remote_qpn;
+ u32 pmtu; /* decoded from path_mtu */
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
+ u32 s_ahgpsn; /* set to the psn in the copy of the header */
+
+ u8 state; /* QP state */
+ u8 allowed_ops; /* high order bits of allowed opcodes */
+ u8 qp_access_flags;
+ u8 alt_timeout; /* Alternate path timeout for this QP */
+ u8 timeout; /* Timeout for this QP */
+ u8 s_srate;
+ u8 s_mig_state;
+ u8 port_num;
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_alt_pkey_index; /* Alternate path PKEY index to use */
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_draining;
+
+ /* start of read/write fields */
+ atomic_t refcount ____cacheline_aligned_in_smp;
+ wait_queue_head_t wait;
+
+
+ struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1]
+ ____cacheline_aligned_in_smp;
+ struct hfi1_sge_state s_rdma_read_sge;
+
+ spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */
+ unsigned long r_aflags;
+ u64 r_wr_id; /* ID for current receive WQE */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_psn; /* expected rcv packet sequence number */
+ u32 r_msn; /* message sequence number */
+
+ u8 r_state; /* opcode of last packet received */
+ u8 r_flags;
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+
+ struct list_head rspwait; /* link for waiting to respond */
+
+ struct hfi1_sge_state r_sge; /* current receive data */
+ struct hfi1_rq r_rq; /* receive work queue */
+
+ spinlock_t s_lock ____cacheline_aligned_in_smp;
+ struct hfi1_sge_state *s_cur_sge;
+ u32 s_flags;
+ struct hfi1_swqe *s_wqe;
+ struct hfi1_sge_state s_sge; /* current send request data */
+ struct hfi1_mregion *s_rdma_mr;
+ struct sdma_engine *s_sde; /* current sde */
+ u32 s_cur_size; /* size of send packet in bytes */
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_sending_psn; /* lowest PSN that is being sent */
+ u32 s_sending_hpsn; /* highest PSN that is being sent */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_head; /* new entries added here */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_acked; /* last un-ACK'ed entry */
+ u32 s_last; /* last completed entry */
+ u32 s_ssn; /* SSN of tail entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ u16 s_hdrwords; /* size of s_hdr in 32 bit words */
+ u16 s_rdma_ack_cnt;
+ s8 s_ahgidx;
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+
+ struct hfi1_sge_state s_ack_rdma_sge;
+ struct timer_list s_timer;
+
+ struct iowait s_iowait;
+
+ struct hfi1_sge r_sg_list[0] /* verified SGEs */
+ ____cacheline_aligned_in_smp;
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define HFI1_R_WRID_VALID 0
+#define HFI1_R_REWIND_SGE 1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define HFI1_R_REUSE_SGE 0x01
+#define HFI1_R_RDMAR_SEQ 0x02
+#define HFI1_R_RSP_NAK 0x04
+#define HFI1_R_RSP_SEND 0x08
+#define HFI1_R_COMM_EST 0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
+ * HFI1_S_BUSY - send tasklet is processing the QP
+ * HFI1_S_TIMER - the RC retry timer is active
+ * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ * before processing the next SWQE
+ * HFI1_S_WAIT_RNR - waiting for RNR timeout
+ * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA
+ * HFI1_S_WAIT_PIO - waiting for a send buffer to be available
+ * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available
+ * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available
+ * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ * HFI1_S_ECN - a BECN was queued to the send engine
+ */
+#define HFI1_S_SIGNAL_REQ_WR 0x0001
+#define HFI1_S_BUSY 0x0002
+#define HFI1_S_TIMER 0x0004
+#define HFI1_S_RESP_PENDING 0x0008
+#define HFI1_S_ACK_PENDING 0x0010
+#define HFI1_S_WAIT_FENCE 0x0020
+#define HFI1_S_WAIT_RDMAR 0x0040
+#define HFI1_S_WAIT_RNR 0x0080
+#define HFI1_S_WAIT_SSN_CREDIT 0x0100
+#define HFI1_S_WAIT_DMA 0x0200
+#define HFI1_S_WAIT_PIO 0x0400
+#define HFI1_S_WAIT_TX 0x0800
+#define HFI1_S_WAIT_DMA_DESC 0x1000
+#define HFI1_S_WAIT_KMEM 0x2000
+#define HFI1_S_WAIT_PSN 0x4000
+#define HFI1_S_WAIT_ACK 0x8000
+#define HFI1_S_SEND_ONE 0x10000
+#define HFI1_S_UNLIMITED_CREDIT 0x20000
+#define HFI1_S_AHG_VALID 0x40000
+#define HFI1_S_AHG_CLEAR 0x80000
+#define HFI1_S_ECN 0x100000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \
+ HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \
+ HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \
+ HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK)
+
+#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND)
+
+#define HFI1_PSN_CREDIT 16
+
+/*
+ * Since struct hfi1_swqe is not a fixed size, we can't simply index into
+ * struct hfi1_qp.s_wq. This function does the array index computation.
+ */
+static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp,
+ unsigned n)
+{
+ return (struct hfi1_swqe *)((char *)qp->s_wq +
+ (sizeof(struct hfi1_swqe) +
+ qp->s_max_sge *
+ sizeof(struct hfi1_sge)) * n);
+}
+
+/*
+ * Since struct hfi1_rwqe is not a fixed size, we can't simply index into
+ * struct hfi1_rwq.wq. This function does the array index computation.
+ */
+static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n)
+{
+ return (struct hfi1_rwqe *)
+ ((char *) rq->wq->wq +
+ (sizeof(struct hfi1_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+#define MAX_LKEY_TABLE_BITS 23
+
+struct hfi1_lkey_table {
+ spinlock_t lock; /* protect changes in this struct */
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+ u32 max; /* size of the table */
+ struct hfi1_mregion __rcu **table;
+};
+
+struct hfi1_opcode_stats {
+ u64 n_packets; /* number of packets */
+ u64 n_bytes; /* total number of bytes */
+};
+
+struct hfi1_opcode_stats_perctx {
+ struct hfi1_opcode_stats stats[256];
+};
+
+static inline void inc_opstats(
+ u32 tlen,
+ struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+ stats->n_bytes += tlen;
+ stats->n_packets++;
+#endif
+}
+
+struct hfi1_ibport {
+ struct hfi1_qp __rcu *qp[2];
+ struct ib_mad_agent *send_agent; /* agent for SMI (traps) */
+ struct hfi1_ah *sm_ah;
+ struct hfi1_ah *smi_ah;
+ struct rb_root mcast_tree;
+ spinlock_t lock; /* protect changes in this struct */
+
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+ unsigned long trap_timeout;
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+ __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */
+ u64 tid; /* TID for traps */
+ u64 n_rc_resends;
+ u64 n_seq_naks;
+ u64 n_rdma_seq;
+ u64 n_rnr_naks;
+ u64 n_other_naks;
+ u64 n_loop_pkts;
+ u64 n_pkt_drops;
+ u64 n_vl15_dropped;
+ u64 n_rc_timeouts;
+ u64 n_dmawait;
+ u64 n_unaligned;
+ u64 n_rc_dupreq;
+ u64 n_rc_seqnak;
+
+ /* Hot-path per CPU counters to avoid cacheline trading to update */
+ u64 z_rc_acks;
+ u64 z_rc_qacks;
+ u64 z_rc_delayed_comp;
+ u64 __percpu *rc_acks;
+ u64 __percpu *rc_qacks;
+ u64 __percpu *rc_delayed_comp;
+
+ u32 port_cap_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 pkey_violations;
+ u16 qkey_violations;
+ u16 mkey_violations;
+ u16 mkey_lease_period;
+ u16 sm_lid;
+ u16 repress_traps;
+ u8 sm_sl;
+ u8 mkeyprot;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+ /* the first 16 entries are sl_to_vl for !OPA */
+ u8 sl_to_sc[32];
+ u8 sc_to_sl[32];
+};
+
+
+struct hfi1_qp_ibdev;
+struct hfi1_ibdev {
+ struct ib_device ibdev;
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock; /* protect mmap_offset */
+ u32 mmap_offset;
+ struct hfi1_mregion __rcu *dma_mr;
+
+ struct hfi1_qp_ibdev *qp_dev;
+
+ /* QP numbers are shared by all IB ports */
+ struct hfi1_lkey_table lk_table;
+ /* protect wait lists */
+ seqlock_t iowait_lock;
+ struct list_head txwait; /* list for wait verbs_txreq */
+ struct list_head memwait; /* list for wait kernel memory */
+ struct list_head txreq_free;
+ struct kmem_cache *verbs_txreq_cache;
+ struct timer_list mem_timer;
+
+ /* other waiters */
+ spinlock_t pending_lock;
+
+ u64 n_piowait;
+ u64 n_txwait;
+ u64 n_kmem_wait;
+
+ u32 n_pds_allocated; /* number of PDs allocated for device */
+ spinlock_t n_pds_lock;
+ u32 n_ahs_allocated; /* number of AHs allocated for device */
+ spinlock_t n_ahs_lock;
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ spinlock_t n_qps_lock;
+ u32 n_srqs_allocated; /* number of SRQs allocated for device */
+ spinlock_t n_srqs_lock;
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+#ifdef CONFIG_DEBUG_FS
+ /* per HFI debugfs */
+ struct dentry *hfi1_ibdev_dbg;
+ /* per HFI symlinks to above */
+ struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+struct hfi1_verbs_counters {
+ u64 symbol_error_counter;
+ u64 link_error_recovery_counter;
+ u64 link_downed_counter;
+ u64 port_rcv_errors;
+ u64 port_rcv_remphys_errors;
+ u64 port_xmit_discards;
+ u64 port_xmit_data;
+ u64 port_rcv_data;
+ u64 port_xmit_packets;
+ u64 port_rcv_packets;
+ u32 local_link_integrity_errors;
+ u32 excessive_buffer_overrun_errors;
+ u32 vl15_dropped;
+};
+
+static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct hfi1_mr, ibmr);
+}
+
+static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct hfi1_pd, ibpd);
+}
+
+static inline struct hfi1_ah *to_iah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct hfi1_ah, ibah);
+}
+
+static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct hfi1_cq, ibcq);
+}
+
+static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct hfi1_srq, ibsrq);
+}
+
+static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct hfi1_qp, ibqp);
+}
+
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct hfi1_ibdev, ibdev);
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct hfi1_qp *qp)
+{
+ return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+ (qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp);
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+ struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
+int hfi1_create_agents(struct hfi1_ibdev *dev);
+void hfi1_free_agents(struct hfi1_ibdev *dev);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between to PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/* Number of bits to pay attention to in the opcode for checking qp type */
+#define OPCODE_QP_MASK 0xE0
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << 8;
+}
+
+/*
+ * Compare two PSNs
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << PSN_SHIFT;
+}
+
+/*
+ * Return masked PSN
+ */
+static inline u32 mask_psn(u32 a)
+{
+ return a & PSN_MASK;
+}
+
+/*
+ * Return delta between two PSNs
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+ return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
+}
+
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid);
+
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp);
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len);
+
+void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length,
+ int release);
+
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+ struct hfi1_ctxtdata *rcd,
+ struct hfi1_ib_header *hdr,
+ u32 rcv_flags,
+ struct hfi1_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region);
+
+void hfi1_free_lkey(struct hfi1_mregion *mr);
+
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+ struct hfi1_sge *isge, struct ib_sge *sge, int acc);
+
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata);
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int hfi1_destroy_srq(struct ib_srq *ibsrq);
+
+int hfi1_cq_init(struct hfi1_devdata *dd);
+
+void hfi1_cq_exit(struct hfi1_devdata *dd);
+
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig);
+
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *hfi1_create_cq(
+ struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int hfi1_destroy_cq(struct ib_cq *ibcq);
+
+int hfi1_req_notify_cq(
+ struct ib_cq *ibcq,
+ enum ib_cq_notify_flags notify_flags);
+
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+
+int hfi1_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_entries);
+
+struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list(
+ struct ib_device *ibdev, int page_list_len);
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
+
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr);
+
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+
+int hfi1_unmap_fmr(struct list_head *fmr_list);
+
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr);
+
+static inline void hfi1_get_mr(struct hfi1_mregion *mr)
+{
+ atomic_inc(&mr->refcount);
+}
+
+static inline void hfi1_put_mr(struct hfi1_mregion *mr)
+{
+ if (unlikely(atomic_dec_and_test(&mr->refcount)))
+ complete(&mr->comp);
+}
+
+static inline void hfi1_put_ss(struct hfi1_sge_state *ss)
+{
+ while (ss->num_sge) {
+ hfi1_put_mr(ss->sge.mr);
+ if (--ss->num_sge)
+ ss->sge = *ss->sg_list++;
+ }
+}
+
+void hfi1_release_mmap_info(struct kref *ref);
+
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size,
+ struct ib_ucontext *context,
+ void *obj);
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+ u32 size, void *obj);
+
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct hfi1_qp *qp);
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+ int has_grh, struct hfi1_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void clear_ahg(struct hfi1_qp *qp);
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+ u32 bth0, u32 bth2, int middle);
+
+void hfi1_do_send(struct work_struct *work);
+
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+ enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct hfi1_qp *qp);
+
+int hfi1_make_uc_req(struct hfi1_qp *qp);
+
+int hfi1_make_ud_req(struct hfi1_qp *qp);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc);
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+ u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+ u32 plen, u32 dwords, u64 pbc);
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5);
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_hfi1_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid; /* in network order */
+
+extern unsigned int hfi1_lkey_table_size;
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops;
+
+#endif /* HFI1_VERBS_H */
diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/staging/rdma/hfi1/verbs_mcast.c
new file mode 100644
index 000000000000..afc6b4c61a1d
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/verbs_mcast.c
@@ -0,0 +1,385 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/rculist.h>
+
+#include "hfi.h"
+
+/**
+ * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp)
+{
+ struct hfi1_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+ if (!mqp)
+ goto bail;
+
+ mqp->qp = qp;
+ atomic_inc(&qp->refcount);
+
+bail:
+ return mqp;
+}
+
+static void mcast_qp_free(struct hfi1_mcast_qp *mqp)
+{
+ struct hfi1_qp *qp = mqp->qp;
+
+ /* Notify hfi1_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+
+ kfree(mqp);
+}
+
+/**
+ * mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid)
+{
+ struct hfi1_mcast *mcast;
+
+ mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
+ if (!mcast)
+ goto bail;
+
+ mcast->mgid = *mgid;
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+ mcast->n_attached = 0;
+
+bail:
+ return mcast;
+}
+
+static void mcast_free(struct hfi1_mcast *mcast)
+{
+ struct hfi1_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/**
+ * hfi1_mcast_find - search the global table for the given multicast GID
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+ struct hfi1_mcast *mcast;
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ n = ibp->mcast_tree.rb_node;
+ while (n) {
+ int ret;
+
+ mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else {
+ atomic_inc(&mcast->refcount);
+ spin_unlock_irqrestore(&ibp->lock, flags);
+ goto bail;
+ }
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ mcast = NULL;
+
+bail:
+ return mcast;
+}
+
+/**
+ * mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added. Return EEXIST if the GID was already in
+ * the table but the QP was added. Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp,
+ struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp)
+{
+ struct rb_node **n = &ibp->mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ int ret;
+
+ spin_lock_irq(&ibp->lock);
+
+ while (*n) {
+ struct hfi1_mcast *tmcast;
+ struct hfi1_mcast_qp *p;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct hfi1_mcast, rb_node);
+
+ ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ ret = ESRCH;
+ goto bail;
+ }
+ }
+ if (tmcast->n_attached == hfi1_max_mcast_qp_attached) {
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ tmcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ ret = EEXIST;
+ goto bail;
+ }
+
+ spin_lock(&dev->n_mcast_grps_lock);
+ if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) {
+ spin_unlock(&dev->n_mcast_grps_lock);
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ dev->n_mcast_grps_allocated++;
+ spin_unlock(&dev->n_mcast_grps_lock);
+
+ mcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+ ret = 0;
+
+bail:
+ spin_unlock_irq(&ibp->lock);
+
+ return ret;
+}
+
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ struct hfi1_ibdev *dev = to_idev(ibqp->device);
+ struct hfi1_ibport *ibp;
+ struct hfi1_mcast *mcast;
+ struct hfi1_mcast_qp *mqp;
+ int ret;
+
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Allocate data structures since its better to do this outside of
+ * spin locks and it will most likely be needed.
+ */
+ mcast = mcast_alloc(gid);
+ if (mcast == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ mqp = mcast_qp_alloc(qp);
+ if (mqp == NULL) {
+ mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ ibp = to_iport(ibqp->device, qp->port_num);
+ switch (mcast_add(dev, ibp, mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: OK to attach the same QP twice. */
+ mcast_qp_free(mqp);
+ mcast_free(mcast);
+ break;
+
+ case EEXIST: /* The mcast wasn't used */
+ mcast_free(mcast);
+ break;
+
+ case ENOMEM:
+ /* Exceeded the maximum number of mcast groups. */
+ mcast_qp_free(mqp);
+ mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+
+ default:
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct hfi1_qp *qp = to_iqp(ibqp);
+ struct hfi1_ibdev *dev = to_idev(ibqp->device);
+ struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+ struct hfi1_mcast *mcast = NULL;
+ struct hfi1_mcast_qp *p, *tmp;
+ struct rb_node *n;
+ int last = 0;
+ int ret;
+
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irq(&ibp->lock);
+
+ /* Find the GID in the mcast table. */
+ n = ibp->mcast_tree.rb_node;
+ while (1) {
+ if (n == NULL) {
+ spin_unlock_irq(&ibp->lock);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward
+ * link until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&ibp->lock);
+
+ if (p) {
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ mcast_qp_free(p);
+ }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ mcast_free(mcast);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
+ dev->n_mcast_grps_allocated--;
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp)
+{
+ return ibp->mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/staging/rdma/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig
new file mode 100644
index 000000000000..041ce0634968
--- /dev/null
+++ b/drivers/staging/rdma/ipath/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_IPATH
+ tristate "QLogic HTX HCA support"
+ depends on 64BIT && NET && HT_IRQ
+ ---help---
+ This is a driver for the deprecated QLogic Hyper-Transport
+ IB host channel adapter (model QHT7140),
+ including InfiniBand verbs support. This driver allows these
+ devices to be used with both kernel upper level protocols such
+ as IP-over-InfiniBand as well as with userspace applications
+ (in conjunction with InfiniBand userspace access).
+ For QLogic PCIe QLE based cards, use the QIB driver instead.
+
+ If you have this hardware you will need to boot with PAT disabled
+ on your x86-64 systems, use the nopat kernel parameter.
+
+ Note that this driver will soon be removed entirely from the kernel.
diff --git a/drivers/staging/rdma/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile
new file mode 100644
index 000000000000..4496f2820c92
--- /dev/null
+++ b/drivers/staging/rdma/ipath/Makefile
@@ -0,0 +1,37 @@
+ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
+ -DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ib_ipath-y := \
+ ipath_cq.o \
+ ipath_diag.o \
+ ipath_dma.o \
+ ipath_driver.o \
+ ipath_eeprom.o \
+ ipath_file_ops.o \
+ ipath_fs.o \
+ ipath_init_chip.o \
+ ipath_intr.o \
+ ipath_keys.o \
+ ipath_mad.o \
+ ipath_mmap.o \
+ ipath_mr.o \
+ ipath_qp.o \
+ ipath_rc.o \
+ ipath_ruc.o \
+ ipath_sdma.o \
+ ipath_srq.o \
+ ipath_stats.o \
+ ipath_sysfs.o \
+ ipath_uc.o \
+ ipath_ud.o \
+ ipath_user_pages.o \
+ ipath_user_sdma.o \
+ ipath_verbs_mcast.o \
+ ipath_verbs.o
+
+ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
+
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO
new file mode 100644
index 000000000000..cb00158d64c8
--- /dev/null
+++ b/drivers/staging/rdma/ipath/TODO
@@ -0,0 +1,5 @@
+The ipath driver has been moved to staging in preparation for its removal in a
+few releases. The driver will be deleted during the 4.6 merge window.
+
+Contact Dennis Dalessandro <dennis.dalessandro@intel.com> and
+Cc: linux-rdma@vger.kernel.org
diff --git a/drivers/staging/rdma/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h
new file mode 100644
index 000000000000..28cfe97cf1e9
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_common.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in faspath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN 0
+#define IPATH_IB_LINKARM 1
+#define IPATH_IB_LINKACTIVE 2
+#define IPATH_IB_LINKDOWN_ONLY 3
+#define IPATH_IB_LINKDOWN_SLEEP 4
+#define IPATH_IB_LINKDOWN_DISABLE 5
+#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */
+#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */
+#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */
+
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They
+ * are also the the possible values for ipath_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
+/*
+ * stats maintained by the driver. For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+ /* number of interrupts taken */
+ __u64 sps_ints;
+ /* number of interrupts for errors */
+ __u64 sps_errints;
+ /* number of errors from chip (not incl. packet errors or CRC) */
+ __u64 sps_errs;
+ /* number of packet errors from chip other than CRC */
+ __u64 sps_pkterrs;
+ /* number of packets with CRC errors (ICRC and VCRC) */
+ __u64 sps_crcerrs;
+ /* number of hardware errors reported (parity, etc.) */
+ __u64 sps_hwerrs;
+ /* number of times IB link changed state unexpectedly */
+ __u64 sps_iblink;
+ __u64 sps_unused; /* was fastrcvint, no longer implemented */
+ /* number of kernel (port0) packets received */
+ __u64 sps_port0pkts;
+ /* number of "ethernet" packets sent by driver */
+ __u64 sps_ether_spkts;
+ /* number of "ethernet" packets received by driver */
+ __u64 sps_ether_rpkts;
+ /* number of SMA packets sent by driver. Obsolete. */
+ __u64 sps_sma_spkts;
+ /* number of SMA packets received by driver. Obsolete. */
+ __u64 sps_sma_rpkts;
+ /* number of times all ports rcvhdrq was full and packet dropped */
+ __u64 sps_hdrqfull;
+ /* number of times all ports egrtid was full and packet dropped */
+ __u64 sps_etidfull;
+ /*
+ * number of times we tried to send from driver, but no pio buffers
+ * avail
+ */
+ __u64 sps_nopiobufs;
+ /* number of ports currently open */
+ __u64 sps_ports;
+ /* list of pkeys (other than default) accepted (0 means not set) */
+ __u16 sps_pkeys[4];
+ __u16 sps_unused16[4]; /* available; maintaining compatible layout */
+ /* number of user ports per chip (not IB ports) */
+ __u32 sps_nports;
+ /* not our interrupt, or already handled */
+ __u32 sps_nullintr;
+ /* max number of packets handled per receive call */
+ __u32 sps_maxpkts_call;
+ /* avg number of packets handled per receive call */
+ __u32 sps_avgpkts_call;
+ /* total number of pages locked */
+ __u64 sps_pagelocks;
+ /* total number of pages unlocked */
+ __u64 sps_pageunlocks;
+ /*
+ * Number of packets dropped in kernel other than errors (ether
+ * packets if ipath not configured, etc.)
+ */
+ __u64 sps_krdrops;
+ __u64 sps_txeparity; /* PIO buffer parity error, recovered */
+ /* pad for future growth */
+ __u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */
+#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED 0x4
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY 0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF 0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE 0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR 0x200
+
+/*
+ * The list of usermode accessible registers. Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+ /* (RO) DMA RcvHdr to be used next. */
+ ur_rcvhdrtail = 0,
+ /* (RW) RcvHdr entry to be processed next by host. */
+ ur_rcvhdrhead = 1,
+ /* (RO) Index of next Eager index to use. */
+ ur_rcvegrindextail = 2,
+ /* (RW) Eager TID to be processed next */
+ ur_rcvegrindexhead = 3,
+ /* For internal use only; max register number. */
+ _IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT 0x1
+#define IPATH_RUNTIME_PCIE 0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4
+#define IPATH_RUNTIME_RCVHDR_COPY 0x8
+#define IPATH_RUNTIME_MASTER 0x10
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SDMA 0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explict pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit * bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+ /* version of hardware, for feature checking. */
+ __u32 spi_hw_version;
+ /* version of software, for feature checking. */
+ __u32 spi_sw_version;
+ /* InfiniPath port assigned, goes into sent packets */
+ __u16 spi_port;
+ __u16 spi_subport;
+ /*
+ * IB MTU, packets IB data must be less than this.
+ * The MTU is in bytes, and will be a multiple of 4 bytes.
+ */
+ __u32 spi_mtu;
+ /*
+ * Size of a PIO buffer. Any given packet's total size must be less
+ * than this (in words). Included is the starting control word, so
+ * if 513 is returned, then total pkt size is 512 words or less.
+ */
+ __u32 spi_piosize;
+ /* size of the TID cache in infinipath, in entries */
+ __u32 spi_tidcnt;
+ /* size of the TID Eager list in infinipath, in entries */
+ __u32 spi_tidegrcnt;
+ /* size of a single receive header queue entry in words. */
+ __u32 spi_rcvhdrent_size;
+ /*
+ * Count of receive header queue entries allocated.
+ * This may be less than the spu_rcvhdrcnt passed in!.
+ */
+ __u32 spi_rcvhdr_cnt;
+
+ /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+ __u32 spi_runtime_flags;
+
+ /* address where receive buffer queue is mapped into */
+ __u64 spi_rcvhdr_base;
+
+ /* user program. */
+
+ /* base address of eager TID receive buffers. */
+ __u64 spi_rcv_egrbufs;
+
+ /* Allocated by initialization code, not by protocol. */
+
+ /*
+ * Size of each TID buffer in host memory, starting at
+ * spi_rcv_egrbufs. The buffers are virtually contiguous.
+ */
+ __u32 spi_rcv_egrbufsize;
+ /*
+ * The special QP (queue pair) value that identifies an infinipath
+ * protocol packet from standard IB packets. More, probably much
+ * more, to be added.
+ */
+ __u32 spi_qpair;
+
+ /*
+ * User register base for init code, not to be used directly by
+ * protocol or applications.
+ */
+ __u64 __spi_uregbase;
+ /*
+ * Maximum buffer size in bytes that can be used in a single TID
+ * entry (assuming the buffer is aligned to this boundary). This is
+ * the minimum of what the hardware and software support Guaranteed
+ * to be a power of 2.
+ */
+ __u32 spi_tid_maxsize;
+ /*
+ * alignment of each pio send buffer (byte count
+ * to add to spi_piobufbase to get to second buffer)
+ */
+ __u32 spi_pioalign;
+ /*
+ * The index of the first pio buffer available to this process;
+ * needed to do lookup in spi_pioavailaddr; not added to
+ * spi_piobufbase.
+ */
+ __u32 spi_pioindex;
+ /* number of buffers mapped for this process */
+ __u32 spi_piocnt;
+
+ /*
+ * Base address of writeonly pio buffers for this process.
+ * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+ * boundaries. spi_piocnt buffers are mapped from this address
+ */
+ __u64 spi_piobufbase;
+
+ /*
+ * Base address of readonly memory copy of the pioavail registers.
+ * There are 2 bits for each buffer.
+ */
+ __u64 spi_pioavailaddr;
+
+ /*
+ * Address where driver updates a copy of the interface and driver
+ * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a
+ * string indicating hardware error, if there was one.
+ */
+ __u64 spi_status;
+
+ /* number of chip ports available to user processes */
+ __u32 spi_nports;
+ /* unit number of chip we are using */
+ __u32 spi_unit;
+ /* num bufs in each contiguous set */
+ __u32 spi_rcv_egrperchunk;
+ /* size in bytes of each contiguous set */
+ __u32 spi_rcv_egrchunksize;
+ /* total size of mmap to cover full rcvegrbuffers */
+ __u32 spi_rcv_egrbuftotlen;
+ __u32 spi_filler_for_align;
+ /* address of readonly memory copy of the rcvhdrq tail register. */
+ __u64 spi_rcvhdr_tailaddr;
+
+ /* shared memory pages for subports if port is shared */
+ __u64 spi_subport_uregbase;
+ __u64 spi_subport_rcvegrbuf;
+ __u64 spi_subport_rcvhdr_base;
+
+ /* shared memory page for hardware port if it is shared */
+ __u64 spi_port_uregbase;
+ __u64 spi_port_rcvegrbuf;
+ __u64 spi_port_rcvhdr_base;
+ __u64 spi_port_rcvhdr_tailaddr;
+
+} __attribute__ ((aligned(8)));
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way. The driver must be the same or higher
+ * for initialization to succeed. In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * a within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 6
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc. The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility. It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+ /*
+ * version of user software, to detect compatibility issues.
+ * Should be set to IPATH_USER_SWVERSION.
+ */
+ __u32 spu_userversion;
+
+ /* desired number of receive header queue entries */
+ __u32 spu_rcvhdrcnt;
+
+ /* size of struct base_info to write to */
+ __u32 spu_base_info_size;
+
+ /*
+ * number of words in KD protocol header
+ * This tells InfiniPath how many words to copy to rcvhdrq. If 0,
+ * kernel uses a default. Once set, attempts to set any other value
+ * are an error (EAGAIN) until driver is reloaded.
+ */
+ __u32 spu_rcvhdrsize;
+
+ /*
+ * If two or more processes wish to share a port, each process
+ * must set the spu_subport_cnt and spu_subport_id to the same
+ * values. The only restriction on the spu_subport_id is that
+ * it be unique for a given node.
+ */
+ __u16 spu_subport_cnt;
+ __u16 spu_subport_id;
+
+ __u32 spu_unused; /* kept for compatible layout */
+
+ /*
+ * address of struct base_info to write to
+ */
+ __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN 16
+
+#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */
+#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */
+#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */
+#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */
+#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */
+#define IPATH_CMD_USER_INIT 24 /* set up userspace */
+#define IPATH_CMD_UNUSED_1 25
+#define IPATH_CMD_UNUSED_2 26
+#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */
+#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_URGENT 0x01
+#define IPATH_POLL_TYPE_OVERFLOW 0x02
+
+struct ipath_port_info {
+ __u32 num_active; /* number of active units */
+ __u32 unit; /* unit (chip) assigned to caller */
+ __u16 port; /* port on unit assigned to caller */
+ __u16 subport; /* subport on unit assigned to caller */
+ __u16 num_ports; /* number of ports available on unit */
+ __u16 num_subports; /* number of subports opened on port */
+};
+
+struct ipath_tid_info {
+ __u32 tidcnt;
+ /* make structure same size in 32 and 64 bit */
+ __u32 tid__unused;
+ /* virtual address of first page in transfer */
+ __u64 tidvaddr;
+ /* pointer (same size 32/64 bit) to __u16 tid array */
+ __u64 tidlist;
+
+ /*
+ * pointer (same size 32/64 bit) to bitmap of TIDs used
+ * for this call; checked for being large enough at open
+ */
+ __u64 tidmap;
+};
+
+struct ipath_cmd {
+ __u32 type; /* command type */
+ union {
+ struct ipath_tid_info tid_info;
+ struct ipath_user_info user_info;
+
+ /*
+ * address in userspace where we should put the sdma
+ * inflight counter
+ */
+ __u64 sdma_inflight;
+ /*
+ * address in userspace where we should put the sdma
+ * completion counter
+ */
+ __u64 sdma_complete;
+ /* address in userspace of struct ipath_port_info to
+ write result to */
+ __u64 port_info;
+ /* enable/disable receipt of packets */
+ __u32 recv_ctrl;
+ /* enable/disable armlaunch errors (non-zero to enable) */
+ __u32 armlaunch_ctrl;
+ /* partition key to set */
+ __u16 part_key;
+ /* user address of __u32 bitmask of active slaves */
+ __u64 slave_mask_addr;
+ /* type of polling we want */
+ __u16 poll_type;
+ } cmd;
+};
+
+struct ipath_iovec {
+ /* Pointer to data, but same size 32 and 64 bit */
+ __u64 iov_base;
+
+ /*
+ * Length of data; don't need 64 bits, but want
+ * ipath_sendpkt to remain same size as before 32 bit changes, so...
+ */
+ __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send. Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+ __u32 sps_flags; /* flags for packet (TBD) */
+ __u32 sps_cnt; /* number of entries to use in sps_iov */
+ /* array of iov's describing packet. TEMPORARY */
+ struct ipath_iovec sps_iov[4];
+};
+
+/*
+ * diagnostics can send a packet by "writing" one of the following
+ * two structs to diag data special file
+ * The first is the legacy version for backward compatibility
+ */
+struct ipath_diag_pkt {
+ __u32 unit;
+ __u64 data;
+ __u32 len;
+};
+
+/* The second diag_pkt struct is the expanded version that allows
+ * more control over the packet, specifically, by allowing a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+struct ipath_diag_xpkt {
+ __u64 data;
+ __u64 pbc_wd;
+ __u32 unit;
+ __u32 len;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+ /* flash layout version (IPATH_FLASH_VERSION) */
+ __u8 if_fversion;
+ /* checksum protecting if_length bytes */
+ __u8 if_csum;
+ /*
+ * valid length (in use, protected by if_csum), including
+ * if_fversion and if_csum themselves)
+ */
+ __u8 if_length;
+ /* the GUID, in network order */
+ __u8 if_guid[8];
+ /* number of GUIDs to use, starting from if_guid */
+ __u8 if_numguid;
+ /* the (last 10 characters of) board serial number, in ASCII */
+ char if_serial[12];
+ /* board mfg date (YYYYMMDD ASCII) */
+ char if_mfgdate[8];
+ /* last board rework/test date (YYYYMMDD ASCII) */
+ char if_testdate[8];
+ /* logging of error counts, TBD */
+ __u8 if_errcntp[4];
+ /* powered on hours, updated at driver unload */
+ __u8 if_powerhour[2];
+ /* ASCII free-form comment field */
+ char if_comment[32];
+ /* Backwards compatible prefix for longer QLogic Serial Numbers */
+ char if_sprefix[4];
+ /* 82 bytes used, min flash size is 128 bytes */
+ __u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+ __u64 LBIntCnt;
+ __u64 LBFlowStallCnt;
+ __u64 TxSDmaDescCnt; /* was Reserved1 */
+ __u64 TxUnsupVLErrCnt;
+ __u64 TxDataPktCnt;
+ __u64 TxFlowPktCnt;
+ __u64 TxDwordCnt;
+ __u64 TxLenErrCnt;
+ __u64 TxMaxMinLenErrCnt;
+ __u64 TxUnderrunCnt;
+ __u64 TxFlowStallCnt;
+ __u64 TxDroppedPktCnt;
+ __u64 RxDroppedPktCnt;
+ __u64 RxDataPktCnt;
+ __u64 RxFlowPktCnt;
+ __u64 RxDwordCnt;
+ __u64 RxLenErrCnt;
+ __u64 RxMaxMinLenErrCnt;
+ __u64 RxICRCErrCnt;
+ __u64 RxVCRCErrCnt;
+ __u64 RxFlowCtrlErrCnt;
+ __u64 RxBadFormatCnt;
+ __u64 RxLinkProblemCnt;
+ __u64 RxEBPCnt;
+ __u64 RxLPCRCErrCnt;
+ __u64 RxBufOvflCnt;
+ __u64 RxTIDFullErrCnt;
+ __u64 RxTIDValidErrCnt;
+ __u64 RxPKeyMismatchCnt;
+ __u64 RxP0HdrEgrOvflCnt;
+ __u64 RxP1HdrEgrOvflCnt;
+ __u64 RxP2HdrEgrOvflCnt;
+ __u64 RxP3HdrEgrOvflCnt;
+ __u64 RxP4HdrEgrOvflCnt;
+ __u64 RxP5HdrEgrOvflCnt;
+ __u64 RxP6HdrEgrOvflCnt;
+ __u64 RxP7HdrEgrOvflCnt;
+ __u64 RxP8HdrEgrOvflCnt;
+ __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */
+ __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */
+ __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 IBStatusChangeCnt;
+ __u64 IBLinkErrRecoveryCnt;
+ __u64 IBLinkDownedCnt;
+ __u64 IBSymbolErrCnt;
+ /* The following are new for IBA7220 */
+ __u64 RxVL15DroppedPktCnt;
+ __u64 RxOtherLocalPhyErrCnt;
+ __u64 PcieRetryBufDiagQwordCnt;
+ __u64 ExcessBufferOvflCnt;
+ __u64 LocalLinkIntegrityErrCnt;
+ __u64 RxVlErrCnt;
+ __u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR 0x80000000
+#define INFINIPATH_RHF_H_VCRCERR 0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR 0x10000000
+#define INFINIPATH_RHF_H_MTUERR 0x08000000
+#define INFINIPATH_RHF_H_IHDRERR 0x04000000
+#define INFINIPATH_RHF_H_TIDERR 0x02000000
+#define INFINIPATH_RHF_H_MKERR 0x01000000
+#define INFINIPATH_RHF_H_IBERR 0x00800000
+#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000
+#define INFINIPATH_RHF_L_USE_EGR 0x80000000
+#define INFINIPATH_RHF_L_SWA 0x00008000
+#define INFINIPATH_RHF_L_SWB 0x00004000
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT 4
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_TEST 0x40
+#define INFINIPATH_SP_TESTEBP 0x20
+#define INFINIPATH_SP_TRIGGER_SHIFT 15
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* infinipath header format */
+struct ipath_header {
+ /*
+ * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+ * 14 bits before ECO change ~28 Dec 03. After that, Vers 4,
+ * Port 4, TID 11, offset 13.
+ */
+ __le32 ver_port_tid_offset;
+ __le16 chksum;
+ __le16 pkt_flags;
+};
+
+/* infinipath user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ infinipath.
+ */
+struct ipath_message_header {
+ __be16 lrh[4];
+ __be32 bth[3];
+ /* fields below this point are in host byte order */
+ struct ipath_header iph;
+ __u8 sub_opcode;
+};
+
+/* infinipath ethernet header format */
+struct ether_header {
+ __be16 lrh[4];
+ __be32 bth[3];
+ struct ipath_header iph;
+ __u8 sub_opcode;
+ __u8 cmd;
+ __be16 lid;
+ __u16 mac[3];
+ __u8 frag_num;
+ __u8 seq_num;
+ __le32 len;
+ /* MUST be of word size due to PIO write requirements */
+ __le32 csum;
+ __le16 csum_offset;
+ __le16 flags;
+ __u16 first_2_bytes;
+ __u8 unused[2]; /* currently unused */
+};
+
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+
+/* sub OpCodes - ith4x */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+#define IPATH_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
+{
+ return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
+}
+
+static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+ & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
+{
+ return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+ & INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+ & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
+ & INFINIPATH_RHF_SEQ_MASK;
+}
+
+static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
+ & INFINIPATH_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
+}
+
+static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
+{
+ return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+ & INFINIPATH_I_VERS_MASK;
+}
+
+#endif /* _IPATH_COMMON_H */
diff --git a/drivers/staging/rdma/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c
new file mode 100644
index 000000000000..e9dd9112e718
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_cq.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicitated entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+ struct ipath_cq_wc *wc;
+ unsigned long flags;
+ u32 head;
+ u32 next;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ /*
+ * Note that the head pointer might be writable by user processes.
+ * Take care to verify it is a sane value.
+ */
+ wc = cq->queue;
+ head = wc->head;
+ if (head >= (unsigned) cq->ibcq.cqe) {
+ head = cq->ibcq.cqe;
+ next = 0;
+ } else
+ next = head + 1;
+ if (unlikely(next == wc->tail)) {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return;
+ }
+ if (cq->ip) {
+ wc->uqueue[head].wr_id = entry->wr_id;
+ wc->uqueue[head].status = entry->status;
+ wc->uqueue[head].opcode = entry->opcode;
+ wc->uqueue[head].vendor_err = entry->vendor_err;
+ wc->uqueue[head].byte_len = entry->byte_len;
+ wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
+ wc->uqueue[head].qp_num = entry->qp->qp_num;
+ wc->uqueue[head].src_qp = entry->src_qp;
+ wc->uqueue[head].wc_flags = entry->wc_flags;
+ wc->uqueue[head].pkey_index = entry->pkey_index;
+ wc->uqueue[head].slid = entry->slid;
+ wc->uqueue[head].sl = entry->sl;
+ wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ wc->uqueue[head].port_num = entry->port_num;
+ /* Make sure entry is written before the head index. */
+ smp_wmb();
+ } else
+ wc->kqueue[head] = *entry;
+ wc->head = next;
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED && solicited)) {
+ cq->notify = IB_CQ_NONE;
+ cq->triggered++;
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ tasklet_hi_schedule(&cq->comptask);
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (entry->status != IB_WC_SUCCESS)
+ to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context. Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ struct ipath_cq_wc *wc;
+ unsigned long flags;
+ int npolled;
+ u32 tail;
+
+ /* The kernel can only poll a kernel completion queue */
+ if (cq->ip) {
+ npolled = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ wc = cq->queue;
+ tail = wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (tail == wc->head)
+ break;
+ /* The kernel doesn't need a RMB since it has the lock. */
+ *entry = wc->kqueue[tail];
+ if (tail >= cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ wc->tail = tail;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+ return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+ struct ipath_cq *cq = (struct ipath_cq *)data;
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, tasklet_hi_schedule()
+ * won't call us again until we return so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: unused by the InfiniPath driver
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ int entries = attr->cqe;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cq *cq;
+ struct ipath_cq_wc *wc;
+ struct ib_cq *ret;
+ u32 sz;
+
+ if (attr->flags)
+ return ERR_PTR(-EINVAL);
+
+ if (entries < 1 || entries > ib_ipath_max_cqes) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ /* Allocate the completion queue structure. */
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Allocate the completion queue entries and head/tail pointers.
+ * This is allocated separately so that it can be resized and
+ * also mapped into user space.
+ * We need to use vmalloc() in order to support mmap and large
+ * numbers of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+ else
+ sz += sizeof(struct ib_wc) * (entries + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_cq;
+ }
+
+ /*
+ * Return the address of the WC as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+
+ cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
+ if (!cq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wc;
+ }
+
+ err = ib_copy_to_udata(udata, &cq->ip->offset,
+ sizeof(cq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ cq->ip = NULL;
+
+ spin_lock(&dev->n_cqs_lock);
+ if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+ spin_unlock(&dev->n_cqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_cqs_allocated++;
+ spin_unlock(&dev->n_cqs_lock);
+
+ if (cq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested or return
+ * an error.
+ */
+ cq->ibcq.cqe = entries;
+ cq->notify = IB_CQ_NONE;
+ cq->triggered = 0;
+ spin_lock_init(&cq->lock);
+ tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+ wc->head = 0;
+ wc->tail = 0;
+ cq->queue = wc;
+
+ ret = &cq->ibcq;
+
+ goto done;
+
+bail_ip:
+ kfree(cq->ip);
+bail_wc:
+ vfree(wc);
+bail_cq:
+ kfree(cq);
+done:
+ return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+ struct ipath_ibdev *dev = to_idev(ibcq->device);
+ struct ipath_cq *cq = to_icq(ibcq);
+
+ tasklet_kill(&cq->comptask);
+ spin_lock(&dev->n_cqs_lock);
+ dev->n_cqs_allocated--;
+ spin_unlock(&dev->n_cqs_lock);
+ if (cq->ip)
+ kref_put(&cq->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(cq->queue);
+ kfree(cq);
+
+ return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context. Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+ if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+ cq->queue->head != cq->queue->tail)
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return ret;
+}
+
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ struct ipath_cq_wc *old_wc;
+ struct ipath_cq_wc *wc;
+ u32 head, tail, n;
+ int ret;
+ u32 sz;
+
+ if (cqe < 1 || cqe > ib_ipath_max_cqes) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ else
+ sz += sizeof(struct ib_wc) * (cqe + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&cq->lock);
+ /*
+ * Make sure head and tail are sane since they
+ * might be user writable.
+ */
+ old_wc = cq->queue;
+ head = old_wc->head;
+ if (head > (u32) cq->ibcq.cqe)
+ head = (u32) cq->ibcq.cqe;
+ tail = old_wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ if (head < tail)
+ n = cq->ibcq.cqe + 1 + head - tail;
+ else
+ n = head - tail;
+ if (unlikely((u32)cqe < n)) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ for (n = 0; tail != head; n++) {
+ if (cq->ip)
+ wc->uqueue[n] = old_wc->uqueue[tail];
+ else
+ wc->kqueue[n] = old_wc->kqueue[tail];
+ if (tail == (u32) cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ cq->ibcq.cqe = cqe;
+ wc->head = n;
+ wc->tail = 0;
+ cq->queue = wc;
+ spin_unlock_irq(&cq->lock);
+
+ vfree(old_wc);
+
+ if (cq->ip) {
+ struct ipath_ibdev *dev = to_idev(ibcq->device);
+ struct ipath_mmap_info *ip = cq->ip;
+
+ ipath_update_mmap_info(dev, ip, sz, wc);
+
+ /*
+ * Return the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&cq->lock);
+bail_free:
+ vfree(wc);
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h
new file mode 100644
index 000000000000..65926cd35759
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_debug.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING /* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging. The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically. This can be set at modprobe time also:
+ * modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO 0x1 /* generic low verbosity stuff */
+#define __IPATH_DBG 0x2 /* generic debug */
+#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG 0x40 /* very verbose debug */
+#define __IPATH_PKTDBG 0x80 /* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG 0x100
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG 0x200
+#define __IPATH_ERRPKTDBG 0x400
+#define __IPATH_USER_SEND 0x1000 /* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */
+#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */
+#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */
+
+#else /* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO 0x0 /* generic low verbosity stuff */
+#define __IPATH_DBG 0x0 /* generic debug */
+#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */
+#define __IPATH_VERBDBG 0x0 /* very verbose debug */
+#define __IPATH_PKTDBG 0x0 /* print packet data */
+#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG 0x0
+#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */
+#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */
+#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */
+#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */
+#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */
+#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */
+#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */
+
+#endif /* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif /* _IPATH_DEBUG_H */
diff --git a/drivers/staging/rdma/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c
new file mode 100644
index 000000000000..45802e97332e
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_diag.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions. It is accessed by
+ * opening the ipath_diag device, normally minor number 129. Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <asm/uaccess.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_diag_write,
+ .read = ipath_diag_read,
+ .open = ipath_diag_open,
+ .release = ipath_diag_release,
+ .llseek = default_llseek,
+};
+
+static ssize_t ipath_diagpkt_write(struct file *fp,
+ const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_diagpkt_write,
+ .llseek = noop_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_dev;
+
+int ipath_diag_add(struct ipath_devdata *dd)
+{
+ char name[16];
+ int ret = 0;
+
+ if (atomic_inc_return(&diagpkt_count) == 1) {
+ ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
+ "ipath_diagpkt", &diagpkt_file_ops,
+ &diagpkt_cdev, &diagpkt_dev);
+
+ if (ret) {
+ ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
+ "device: %d", ret);
+ goto done;
+ }
+ }
+
+ snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
+
+ ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
+ &diag_file_ops, &dd->diag_cdev,
+ &dd->diag_dev);
+ if (ret)
+ ipath_dev_err(dd, "Couldn't create %s device: %d",
+ name, ret);
+
+done:
+ return ret;
+}
+
+void ipath_diag_remove(struct ipath_devdata *dd)
+{
+ if (atomic_dec_and_test(&diagpkt_count))
+ ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
+
+ ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip. This is usually used for a single qword
+ *
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+ const void __iomem *caddr, size_t count)
+{
+ const u64 __iomem *reg_addr = caddr;
+ const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+ int ret;
+
+ /* not very efficient, but it works for now */
+ if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u64 data = readq(reg_addr);
+ if (copy_to_user(uaddr, &data, sizeof(u64))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+ const void __user *uaddr, size_t count)
+{
+ u64 __iomem *reg_addr = caddr;
+ const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+ int ret;
+
+ /* not very efficient, but it works for now */
+ if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u64 data;
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writeq(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+ const void __iomem *caddr, size_t count)
+{
+ const u32 __iomem *reg_addr = caddr;
+ const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+ int ret;
+
+ if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+ reg_end > (u32 __iomem *) dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* not very efficient, but it works for now */
+ while (reg_addr < reg_end) {
+ u32 data = readl(reg_addr);
+ if (copy_to_user(uaddr, &data, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+ const void __user *uaddr, size_t count)
+{
+ u32 __iomem *reg_addr = caddr;
+ const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+ int ret;
+
+ if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+ reg_end > (u32 __iomem *) dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u32 data;
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writel(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+ int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
+ struct ipath_devdata *dd;
+ int ret;
+
+ mutex_lock(&ipath_mutex);
+
+ if (ipath_diag_inuse) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ dd = ipath_lookup(unit);
+
+ if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
+ !dd->ipath_kregbase) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ fp->private_data = dd;
+ ipath_diag_inuse = -2;
+ diag_set_link = 0;
+ ret = 0;
+
+ /* Only expose a way to reset the device if we
+ make it into diag mode. */
+ ipath_expose_reset(&dd->pcidev->dev);
+
+bail:
+ mutex_unlock(&ipath_mutex);
+
+ return ret;
+}
+
+/**
+ * ipath_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: ipath_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t ipath_diagpkt_write(struct file *fp,
+ const char __user *data,
+ size_t count, loff_t *off)
+{
+ u32 __iomem *piobuf;
+ u32 plen, pbufn, maxlen_reserve;
+ struct ipath_diag_pkt odp;
+ struct ipath_diag_xpkt dp;
+ u32 *tmpbuf = NULL;
+ struct ipath_devdata *dd;
+ ssize_t ret = 0;
+ u64 val;
+ u32 l_state, lt_state; /* LinkState, LinkTrainingState */
+
+
+ if (count == sizeof(dp)) {
+ if (copy_from_user(&dp, data, sizeof(dp))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ } else if (count == sizeof(odp)) {
+ if (copy_from_user(&odp, data, sizeof(odp))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ dp.len = odp.len;
+ dp.unit = odp.unit;
+ dp.data = odp.data;
+ dp.pbc_wd = 0;
+ } else {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* send count must be an exact number of dwords */
+ if (dp.len & 3) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ plen = dp.len >> 2;
+
+ dd = ipath_lookup(dp.unit);
+ if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
+ !dd->ipath_kregbase) {
+ ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
+ dp.unit);
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (ipath_diag_inuse && !diag_set_link &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ diag_set_link = 1;
+ ipath_cdbg(VERBOSE, "Trying to set to set link active for "
+ "diag pkt\n");
+ ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+ ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+ }
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
+ ret = -ENODEV;
+ goto bail;
+ }
+ /*
+ * Want to skip check for l_state if using custom PBC,
+ * because we might be trying to force an SM packet out.
+ * first-cut, skip _all_ state checking in that case.
+ */
+ val = ipath_ib_state(dd, dd->ipath_lastibcstat);
+ lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+ l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+ if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
+ (val != dd->ib_init && val != dd->ib_arm &&
+ val != dd->ib_active))) {
+ ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
+ dd->ipath_unit, (unsigned long long) val);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * need total length before first word written, plus 2 Dwords. One Dword
+ * is for padding so we get the full user data when not aligned on
+ * a word boundary. The other Dword is to make sure we have room for the
+ * ICRC which gets tacked on later.
+ */
+ maxlen_reserve = 2 * sizeof(u32);
+ if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
+ ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
+ dp.len, dd->ipath_ibmaxlen);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ plen = sizeof(u32) + dp.len;
+
+ tmpbuf = vmalloc(plen);
+ if (!tmpbuf) {
+ dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
+ "failing\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmpbuf,
+ (const void __user *) (unsigned long) dp.data,
+ dp.len)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ plen >>= 2; /* in dwords */
+
+ piobuf = ipath_getpiobuf(dd, plen, &pbufn);
+ if (!piobuf) {
+ ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
+ dd->ipath_unit);
+ ret = -EBUSY;
+ goto bail;
+ }
+ /* disarm it just to be extra sure */
+ ipath_disarm_piobufs(dd, pbufn, 1);
+
+ if (ipath_debug & __IPATH_PKTDBG)
+ ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
+ dd->ipath_unit, plen - 1, pbufn);
+
+ if (dp.pbc_wd == 0)
+ dp.pbc_wd = plen;
+ writeq(dp.pbc_wd, piobuf);
+ /*
+ * Copy all by the trigger word, then flush, so it's written
+ * to chip before trigger word, then write trigger word, then
+ * flush again, so packet is sent.
+ */
+ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
+ ipath_flush_wc();
+ __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+ } else
+ __iowrite32_copy(piobuf + 2, tmpbuf, plen);
+
+ ipath_flush_wc();
+
+ ret = sizeof(dp);
+
+bail:
+ vfree(tmpbuf);
+ return ret;
+}
+
+static int ipath_diag_release(struct inode *in, struct file *fp)
+{
+ mutex_lock(&ipath_mutex);
+ ipath_diag_inuse = 0;
+ fp->private_data = NULL;
+ mutex_unlock(&ipath_mutex);
+ return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off)
+{
+ struct ipath_devdata *dd = fp->private_data;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ kreg_base = dd->ipath_kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if (ipath_diag_inuse < 1 && (*off || count != 8))
+ ret = -EINVAL; /* prevent cat /dev/ipath_diag* */
+ else if ((count % 8) || (*off % 8))
+ /* address or length not 64-bit aligned; do 32-bit reads */
+ ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+ else
+ ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (ipath_diag_inuse == -2)
+ ipath_diag_inuse++;
+ }
+
+ return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ struct ipath_devdata *dd = fp->private_data;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ kreg_base = dd->ipath_kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
+ ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */
+ ret = -EINVAL; /* before any other write allowed */
+ else if ((count % 8) || (*off % 8))
+ /* address or length not 64-bit aligned; do 32-bit writes */
+ ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+ else
+ ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (ipath_diag_inuse == -1)
+ ipath_diag_inuse = 1; /* all read/write OK now */
+ }
+
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c
new file mode 100644
index 000000000000..123a8c053539
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_dma.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 ipath_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+ return (u64) cpu_addr;
+}
+
+static void ipath_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 ipath_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ if (offset + size > PAGE_SIZE) {
+ addr = BAD_DMA_ADDRESS;
+ goto done;
+ }
+
+ addr = (u64) page_address(page);
+ if (addr)
+ addr += offset;
+ /* TODO: handle highmem pages */
+
+done:
+ return addr;
+}
+
+static void ipath_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (u64) page_address(sg_page(sg));
+ /* TODO: handle highmem pages */
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+ return ret;
+}
+
+static void ipath_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static void ipath_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void ipath_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+ if (dma_handle)
+ *dma_handle = (u64) addr;
+ return addr;
+}
+
+static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
+ .mapping_error = ipath_mapping_error,
+ .map_single = ipath_dma_map_single,
+ .unmap_single = ipath_dma_unmap_single,
+ .map_page = ipath_dma_map_page,
+ .unmap_page = ipath_dma_unmap_page,
+ .map_sg = ipath_map_sg,
+ .unmap_sg = ipath_unmap_sg,
+ .sync_single_for_cpu = ipath_sync_single_for_cpu,
+ .sync_single_for_device = ipath_sync_single_for_device,
+ .alloc_coherent = ipath_dma_alloc_coherent,
+ .free_coherent = ipath_dma_free_coherent
+};
diff --git a/drivers/staging/rdma/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c
new file mode 100644
index 000000000000..871dbe56216a
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_driver.c
@@ -0,0 +1,2789 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#ifdef CONFIG_X86_64
+#include <asm/pat.h>
+#endif
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+ static char iname[16];
+ snprintf(iname, sizeof iname, "infinipath%u", unit);
+ return iname;
+}
+
+#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_ipath_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
+module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
+MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
+
+static unsigned ipath_hol_timeout_ms = 13000;
+module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+ "duration of user app suspension after link failure");
+
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("QLogic <support@qlogic.com>");
+MODULE_DESCRIPTION("QLogic InfiniPath driver");
+
+/*
+ * Table to translate the LINKTRAININGSTATE portion of
+ * IBCStatus to a human-readable form.
+ */
+const char *ipath_ibcstatus_str[] = {
+ "Disabled",
+ "LinkUp",
+ "PollActive",
+ "PollQuiet",
+ "SleepDelay",
+ "SleepQuiet",
+ "LState6", /* unused */
+ "LState7", /* unused */
+ "CfgDebounce",
+ "CfgRcvfCfg",
+ "CfgWaitRmt",
+ "CfgIdle",
+ "RecovRetrain",
+ "CfgTxRevLane", /* unused before IBA7220 */
+ "RecovWaitRmt",
+ "RecovIdle",
+ /* below were added for IBA7220 */
+ "CfgEnhanced",
+ "CfgTest",
+ "CfgWaitRmtTest",
+ "CfgWaitCfgEnhanced",
+ "SendTS_T",
+ "SendTstIdles",
+ "RcvTS_T",
+ "SendTst_TS1s",
+ "LTState18", "LTState19", "LTState1A", "LTState1B",
+ "LTState1C", "LTState1D", "LTState1E", "LTState1F"
+};
+
+static void ipath_remove_one(struct pci_dev *);
+static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+
+/* Number of seconds before our card status check... */
+#define STATUS_TIMEOUT 60
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+ .name = IPATH_DRV_NAME,
+ .probe = ipath_init_one,
+ .remove = ipath_remove_one,
+ .id_table = ipath_pci_tbl,
+ .driver = {
+ .groups = ipath_driver_attr_groups,
+ },
+};
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+ u32 *bar0, u32 *bar1)
+{
+ int ret;
+
+ ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+ if (ret)
+ ipath_dev_err(dd, "failed to read bar0 before enable: "
+ "error %d\n", -ret);
+
+ ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+ if (ret)
+ ipath_dev_err(dd, "failed to read bar1 before enable: "
+ "error %d\n", -ret);
+
+ ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+ struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ pci_set_drvdata(pdev, NULL);
+
+ if (dd->ipath_unit != -1) {
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ idr_remove(&unit_table, dd->ipath_unit);
+ list_del(&dd->ipath_list);
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ }
+ vfree(dd);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+ unsigned long flags;
+ struct ipath_devdata *dd;
+ int ret;
+
+ dd = vzalloc(sizeof(*dd));
+ if (!dd) {
+ dd = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ dd->ipath_unit = -1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate unit ID: error %d\n", -ret);
+ ipath_free_devdata(pdev, dd);
+ dd = ERR_PTR(ret);
+ goto bail_unlock;
+ }
+ dd->ipath_unit = ret;
+
+ dd->pcidev = pdev;
+ pci_set_drvdata(pdev, dd);
+
+ list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ idr_preload_end();
+bail:
+ return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+ return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+ struct ipath_devdata *dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ dd = __ipath_lookup(unit);
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
+{
+ int nunits, npresent, nup;
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ int maxports;
+
+ nunits = npresent = nup = maxports = 0;
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+ nunits++;
+ if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+ npresent++;
+ if (dd->ipath_lid &&
+ !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+ | IPATH_LINKUNK)))
+ nup++;
+ if (dd->ipath_cfgports > maxports)
+ maxports = dd->ipath_cfgports;
+ }
+
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ if (npresentp)
+ *npresentp = npresent;
+ if (nupp)
+ *nupp = nup;
+ if (maxportsp)
+ *maxportsp = maxports;
+
+ return nunits;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining. If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+ return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration. Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire). On chips that use an address-based
+ * trigger to send packets to the wire, this is easy. On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+ u32 pbnum, cnt, lcnt;
+ u32 __iomem *piobuf;
+ u32 *addr;
+ u64 msecs, emsecs;
+
+ piobuf = ipath_getpiobuf(dd, 0, &pbnum);
+ if (!piobuf) {
+ dev_info(&dd->pcidev->dev,
+ "No PIObufs for checking perf, skipping\n");
+ return;
+ }
+
+ /*
+ * Enough to give us a reasonable test, less than piobuf size, and
+ * likely multiple of store buffer length.
+ */
+ cnt = 1024;
+
+ addr = vmalloc(cnt);
+ if (!addr) {
+ dev_info(&dd->pcidev->dev,
+ "Couldn't get memory for checking PIO perf,"
+ " skipping\n");
+ goto done;
+ }
+
+ preempt_disable(); /* we want reasonably accurate elapsed time */
+ msecs = 1 + jiffies_to_msecs(jiffies);
+ for (lcnt = 0; lcnt < 10000U; lcnt++) {
+ /* wait until we cross msec boundary */
+ if (jiffies_to_msecs(jiffies) >= msecs)
+ break;
+ udelay(1);
+ }
+
+ ipath_disable_armlaunch(dd);
+
+ /*
+ * length 0, no dwords actually sent, and mark as VL15
+ * on chips where that may matter (due to IB flowcontrol)
+ */
+ if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+ writeq(1UL << 63, piobuf);
+ else
+ writeq(0, piobuf);
+ ipath_flush_wc();
+
+ /*
+ * this is only roughly accurate, since even with preempt we
+ * still take interrupts that could take a while. Running for
+ * >= 5 msec seems to get us "close enough" to accurate values
+ */
+ msecs = jiffies_to_msecs(jiffies);
+ for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+ __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
+ emsecs = jiffies_to_msecs(jiffies) - msecs;
+ }
+
+ /* 1 GiB/sec, slightly over IB SDR line rate */
+ if (lcnt < (emsecs * 1024U))
+ ipath_dev_err(dd,
+ "Performance problem: bandwidth to PIO buffers is "
+ "only %u MiB/sec\n",
+ lcnt / (u32) emsecs);
+ else
+ ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+ lcnt / (u32) emsecs);
+
+ preempt_enable();
+
+ vfree(addr);
+
+done:
+ /* disarm piobuf, so it's available again */
+ ipath_disarm_piobufs(dd, pbnum, 1);
+ ipath_enable_armlaunch(dd);
+}
+
+static void cleanup_device(struct ipath_devdata *dd);
+
+static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret, len, j;
+ struct ipath_devdata *dd;
+ unsigned long long addr;
+ u32 bar0 = 0, bar1 = 0;
+
+#ifdef CONFIG_X86_64
+ if (pat_enabled()) {
+ pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+#endif
+
+ dd = ipath_alloc_devdata(pdev);
+ if (IS_ERR(dd)) {
+ ret = PTR_ERR(dd);
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate devdata: error %d\n", -ret);
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ /* This can happen iff:
+ *
+ * We did a chip reset, and then failed to reprogram the
+ * BAR, or the chip reset due to an internal error. We then
+ * unloaded the driver and reloaded it.
+ *
+ * Both reset cases set the BAR back to initial state. For
+ * the latter case, the AER sticky error bit at offset 0x718
+ * should be set, but the Linux kernel doesn't yet know
+ * about that, it appears. If the original BAR was retained
+ * in the kernel data structures, this may be OK.
+ */
+ ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+ dd->ipath_unit, -ret);
+ goto bail_devdata;
+ }
+ addr = pci_resource_start(pdev, 0);
+ len = pci_resource_len(pdev, 0);
+ ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
+ "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+ ent->device, ent->driver_data);
+
+ read_bars(dd, pdev, &bar0, &bar1);
+
+ if (!bar1 && !(bar0 & ~0xf)) {
+ if (addr) {
+ dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+ "rewriting as %llx\n", addr);
+ ret = pci_write_config_dword(
+ pdev, PCI_BASE_ADDRESS_0, addr);
+ if (ret) {
+ ipath_dev_err(dd, "rewrite of BAR0 "
+ "failed: err %d\n", -ret);
+ goto bail_disable;
+ }
+ ret = pci_write_config_dword(
+ pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+ if (ret) {
+ ipath_dev_err(dd, "rewrite of BAR1 "
+ "failed: err %d\n", -ret);
+ goto bail_disable;
+ }
+ } else {
+ ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+ "not usable until reboot\n");
+ ret = -ENODEV;
+ goto bail_disable;
+ }
+ }
+
+ ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+ if (ret) {
+ dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+ "err %d\n", dd->ipath_unit, -ret);
+ goto bail_disable;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ /*
+ * if the 64 bit setup fails, try 32 bit. Some systems
+ * do not setup 64 bit maps on systems with 2GB or less
+ * memory installed.
+ */
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_info(&pdev->dev,
+ "Unable to set DMA mask for unit %u: %d\n",
+ dd->ipath_unit, ret);
+ goto bail_regions;
+ }
+ else {
+ ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret)
+ dev_info(&pdev->dev,
+ "Unable to set DMA consistent mask "
+ "for unit %u: %d\n",
+ dd->ipath_unit, ret);
+
+ }
+ }
+ else {
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret)
+ dev_info(&pdev->dev,
+ "Unable to set DMA consistent mask "
+ "for unit %u: %d\n",
+ dd->ipath_unit, ret);
+ }
+
+ pci_set_master(pdev);
+
+ /*
+ * Save BARs to rewrite after device reset. Save all 64 bits of
+ * BAR, just in case.
+ */
+ dd->ipath_pcibar0 = addr;
+ dd->ipath_pcibar1 = addr >> 32;
+ dd->ipath_deviceid = ent->device; /* save for later use */
+ dd->ipath_vendorid = ent->vendor;
+
+ /* setup the chip-specific functions, as early as possible. */
+ switch (ent->device) {
+ case PCI_DEVICE_ID_INFINIPATH_HT:
+ ipath_init_iba6110_funcs(dd);
+ break;
+
+ default:
+ ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
+ "failing\n", ent->device);
+ return -ENODEV;
+ }
+
+ for (j = 0; j < 6; j++) {
+ if (!pdev->resource[j].start)
+ continue;
+ ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
+ j, &pdev->resource[j],
+ (unsigned long long)pci_resource_len(pdev, j));
+ }
+
+ if (!addr) {
+ ipath_dev_err(dd, "No valid address in BAR 0!\n");
+ ret = -ENODEV;
+ goto bail_regions;
+ }
+
+ dd->ipath_pcirev = pdev->revision;
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ dd->ipath_kregbase = __ioremap(addr, len,
+ (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
+#else
+ /* XXX: split this properly to enable on PAT */
+ dd->ipath_kregbase = ioremap_nocache(addr, len);
+#endif
+
+ if (!dd->ipath_kregbase) {
+ ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+ addr);
+ ret = -ENOMEM;
+ goto bail_iounmap;
+ }
+ dd->ipath_kregend = (u64 __iomem *)
+ ((void __iomem *)dd->ipath_kregbase + len);
+ dd->ipath_physaddr = addr; /* used for io_remap, etc. */
+ /* for user mmap */
+ ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
+ addr, dd->ipath_kregbase);
+
+ if (dd->ipath_f_bus(dd, pdev))
+ ipath_dev_err(dd, "Failed to setup config space; "
+ "continuing anyway\n");
+
+ /*
+ * set up our interrupt handler; IRQF_SHARED probably not needed,
+ * since MSI interrupts shouldn't be shared but won't hurt for now.
+ * check 0 irq after we return from chip-specific bus setup, since
+ * that can affect this due to setup
+ */
+ if (!dd->ipath_irq)
+ ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't "
+ "work\n");
+ else {
+ ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+ IPATH_DRV_NAME, dd);
+ if (ret) {
+ ipath_dev_err(dd, "Couldn't setup irq handler, "
+ "irq=%d: %d\n", dd->ipath_irq, ret);
+ goto bail_iounmap;
+ }
+ }
+
+ ret = ipath_init_chip(dd, 0); /* do the chip-specific init */
+ if (ret)
+ goto bail_irqsetup;
+
+ ret = ipath_enable_wc(dd);
+
+ if (ret)
+ ret = 0;
+
+ ipath_verify_pioperf(dd);
+
+ ipath_device_create_group(&pdev->dev, dd);
+ ipathfs_add_device(dd);
+ ipath_user_add(dd);
+ ipath_diag_add(dd);
+ ipath_register_ib_device(dd);
+
+ goto bail;
+
+bail_irqsetup:
+ cleanup_device(dd);
+
+ if (dd->ipath_irq)
+ dd->ipath_f_free_irq(dd);
+
+ if (dd->ipath_f_cleanup)
+ dd->ipath_f_cleanup(dd);
+
+bail_iounmap:
+ iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+ pci_release_regions(pdev);
+
+bail_disable:
+ pci_disable_device(pdev);
+
+bail_devdata:
+ ipath_free_devdata(pdev, dd);
+
+bail:
+ return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+ int port;
+ struct ipath_portdata **tmp;
+ unsigned long flags;
+
+ if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+ /* can't do anything more with chip; needs re-init */
+ *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+ if (dd->ipath_kregbase) {
+ /*
+ * if we haven't already cleaned up before these are
+ * to ensure any register reads/writes "fail" until
+ * re-init
+ */
+ dd->ipath_kregbase = NULL;
+ dd->ipath_uregbase = 0;
+ dd->ipath_sregbase = 0;
+ dd->ipath_cregbase = 0;
+ dd->ipath_kregsize = 0;
+ }
+ ipath_disable_wc(dd);
+ }
+
+ if (dd->ipath_spectriggerhit)
+ dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
+ dd->ipath_spectriggerhit);
+
+ if (dd->ipath_pioavailregs_dma) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *) dd->ipath_pioavailregs_dma,
+ dd->ipath_pioavailregs_phys);
+ dd->ipath_pioavailregs_dma = NULL;
+ }
+ if (dd->ipath_dummy_hdrq) {
+ dma_free_coherent(&dd->pcidev->dev,
+ dd->ipath_pd[0]->port_rcvhdrq_size,
+ dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+ dd->ipath_dummy_hdrq = NULL;
+ }
+
+ if (dd->ipath_pageshadow) {
+ struct page **tmpp = dd->ipath_pageshadow;
+ dma_addr_t *tmpd = dd->ipath_physshadow;
+ int i, cnt = 0;
+
+ ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+ "locked\n");
+ for (port = 0; port < dd->ipath_cfgports; port++) {
+ int port_tidbase = port * dd->ipath_rcvtidcnt;
+ int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+ for (i = port_tidbase; i < maxtid; i++) {
+ if (!tmpp[i])
+ continue;
+ pci_unmap_page(dd->pcidev, tmpd[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages(&tmpp[i], 1);
+ tmpp[i] = NULL;
+ cnt++;
+ }
+ }
+ if (cnt) {
+ ipath_stats.sps_pageunlocks += cnt;
+ ipath_cdbg(VERBOSE, "There were still %u expTID "
+ "entries locked\n", cnt);
+ }
+ if (ipath_stats.sps_pagelocks ||
+ ipath_stats.sps_pageunlocks)
+ ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+ "unlocked via ipath_m{un}lock\n",
+ (unsigned long long)
+ ipath_stats.sps_pagelocks,
+ (unsigned long long)
+ ipath_stats.sps_pageunlocks);
+
+ ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+ dd->ipath_pageshadow);
+ tmpp = dd->ipath_pageshadow;
+ dd->ipath_pageshadow = NULL;
+ vfree(tmpp);
+
+ dd->ipath_egrtidbase = NULL;
+ }
+
+ /*
+ * free any resources still in use (usually just kernel ports)
+ * at unload; we do for portcnt, because that's what we allocate.
+ * We acquire lock to be really paranoid that ipath_pd isn't being
+ * accessed from some interrupt-related code (that should not happen,
+ * but best to be sure).
+ */
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ tmp = dd->ipath_pd;
+ dd->ipath_pd = NULL;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ for (port = 0; port < dd->ipath_portcnt; port++) {
+ struct ipath_portdata *pd = tmp[port];
+ tmp[port] = NULL; /* debugging paranoia */
+ ipath_free_pddata(dd, pd);
+ }
+ kfree(tmp);
+}
+
+static void ipath_remove_one(struct pci_dev *pdev)
+{
+ struct ipath_devdata *dd = pci_get_drvdata(pdev);
+
+ ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+ /*
+ * disable the IB link early, to be sure no new packets arrive, which
+ * complicates the shutdown process
+ */
+ ipath_shutdown_device(dd);
+
+ flush_workqueue(ib_wq);
+
+ if (dd->verbs_dev)
+ ipath_unregister_ib_device(dd->verbs_dev);
+
+ ipath_diag_remove(dd);
+ ipath_user_remove(dd);
+ ipathfs_remove_device(dd);
+ ipath_device_remove_group(&pdev->dev, dd);
+
+ ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+ "unit %u\n", dd, (u32) dd->ipath_unit);
+
+ cleanup_device(dd);
+
+ /*
+ * turn off rcv, send, and interrupts for all ports, all drivers
+ * should also hard reset the chip here?
+ * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+ * for all versions of the driver, if they were allocated
+ */
+ if (dd->ipath_irq) {
+ ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+ dd->ipath_unit, dd->ipath_irq);
+ dd->ipath_f_free_irq(dd);
+ } else
+ ipath_dbg("irq is 0, not doing free_irq "
+ "for unit %u\n", dd->ipath_unit);
+ /*
+ * we check for NULL here, because it's outside
+ * the kregbase check, and we need to call it
+ * after the free_irq. Thus it's possible that
+ * the function pointers were never initialized.
+ */
+ if (dd->ipath_f_cleanup)
+ /* clean up chip-specific stuff */
+ dd->ipath_f_cleanup(dd);
+
+ ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+ iounmap((volatile void __iomem *) dd->ipath_kregbase);
+ pci_release_regions(pdev);
+ ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+ pci_disable_device(pdev);
+
+ ipath_free_devdata(pdev, dd);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered. Used at init to ensure buffer state, and also user
+ * process close, in case it died while writing to a PIO buffer
+ * Also after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+ unsigned cnt)
+{
+ unsigned i, last = first + cnt;
+ unsigned long flags;
+
+ ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+ for (i = first; i < last; i++) {
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /*
+ * The disarm-related bits are write-only, so it
+ * is ok to OR them in with our copy of sendctrl
+ * while we hold the lock.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+ (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+ /* can't disarm bufs back-to-back per iba7220 spec */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+ /* on some older chips, update may not happen after cancel */
+ ipath_force_pio_avail_update(dd);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for IB link state change to occur for
+ * now, take the easy polling route. Currently used only by
+ * ipath_set_linkstate. Returns 0 if state reached, otherwise
+ * -ETIMEDOUT state can have multiple states set, for any of several
+ * transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+ dd->ipath_state_wanted = state;
+ wait_event_interruptible_timeout(ipath_state_wait,
+ (dd->ipath_flags & state),
+ msecs_to_jiffies(msecs));
+ dd->ipath_state_wanted = 0;
+
+ if (!(dd->ipath_flags & state)) {
+ u64 val;
+ ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
+ " ms\n",
+ /* test INIT ahead of DOWN, both can be set */
+ (state & IPATH_LINKINIT) ? "INIT" :
+ ((state & IPATH_LINKDOWN) ? "DOWN" :
+ ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+ msecs);
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+ ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+ (unsigned long long) ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_ibcctrl),
+ (unsigned long long) val,
+ ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
+ }
+ return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
+ char *buf, size_t blen)
+{
+ static const struct {
+ ipath_err_t err;
+ const char *msg;
+ } errs[] = {
+ { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
+ { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
+ { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
+ { INFINIPATH_E_SDMABASE, "SDmaBase" },
+ { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
+ { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
+ { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
+ { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
+ { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
+ { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
+ { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
+ { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
+ };
+ int i;
+ int expected;
+ size_t bidx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(errs); i++) {
+ expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
+ test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ if ((err & errs[i].err) && !expected)
+ bidx += snprintf(buf + bidx, blen - bidx,
+ "%s ", errs[i].msg);
+ }
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print * it or not depending on "normal packet errors" vs everything
+ * else. Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+ ipath_err_t err)
+{
+ int iserr = 1;
+ *buf = '\0';
+ if (err & INFINIPATH_E_PKTERRS) {
+ if (!(err & ~INFINIPATH_E_PKTERRS))
+ iserr = 0; // if only packet errors.
+ if (ipath_debug & __IPATH_ERRPKTDBG) {
+ if (err & INFINIPATH_E_REBP)
+ strlcat(buf, "EBP ", blen);
+ if (err & INFINIPATH_E_RVCRC)
+ strlcat(buf, "VCRC ", blen);
+ if (err & INFINIPATH_E_RICRC) {
+ strlcat(buf, "CRC ", blen);
+ // clear for check below, so only once
+ err &= INFINIPATH_E_RICRC;
+ }
+ if (err & INFINIPATH_E_RSHORTPKTLEN)
+ strlcat(buf, "rshortpktlen ", blen);
+ if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+ strlcat(buf, "sdroppeddatapkt ", blen);
+ if (err & INFINIPATH_E_SPKTLEN)
+ strlcat(buf, "spktlen ", blen);
+ }
+ if ((err & INFINIPATH_E_RICRC) &&
+ !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
+ strlcat(buf, "CRC ", blen);
+ if (!iserr)
+ goto done;
+ }
+ if (err & INFINIPATH_E_RHDRLEN)
+ strlcat(buf, "rhdrlen ", blen);
+ if (err & INFINIPATH_E_RBADTID)
+ strlcat(buf, "rbadtid ", blen);
+ if (err & INFINIPATH_E_RBADVERSION)
+ strlcat(buf, "rbadversion ", blen);
+ if (err & INFINIPATH_E_RHDR)
+ strlcat(buf, "rhdr ", blen);
+ if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
+ strlcat(buf, "sendspecialtrigger ", blen);
+ if (err & INFINIPATH_E_RLONGPKTLEN)
+ strlcat(buf, "rlongpktlen ", blen);
+ if (err & INFINIPATH_E_RMAXPKTLEN)
+ strlcat(buf, "rmaxpktlen ", blen);
+ if (err & INFINIPATH_E_RMINPKTLEN)
+ strlcat(buf, "rminpktlen ", blen);
+ if (err & INFINIPATH_E_SMINPKTLEN)
+ strlcat(buf, "sminpktlen ", blen);
+ if (err & INFINIPATH_E_RFORMATERR)
+ strlcat(buf, "rformaterr ", blen);
+ if (err & INFINIPATH_E_RUNSUPVL)
+ strlcat(buf, "runsupvl ", blen);
+ if (err & INFINIPATH_E_RUNEXPCHAR)
+ strlcat(buf, "runexpchar ", blen);
+ if (err & INFINIPATH_E_RIBFLOW)
+ strlcat(buf, "ribflow ", blen);
+ if (err & INFINIPATH_E_SUNDERRUN)
+ strlcat(buf, "sunderrun ", blen);
+ if (err & INFINIPATH_E_SPIOARMLAUNCH)
+ strlcat(buf, "spioarmlaunch ", blen);
+ if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+ strlcat(buf, "sunexperrpktnum ", blen);
+ if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+ strlcat(buf, "sdroppedsmppkt ", blen);
+ if (err & INFINIPATH_E_SMAXPKTLEN)
+ strlcat(buf, "smaxpktlen ", blen);
+ if (err & INFINIPATH_E_SUNSUPVL)
+ strlcat(buf, "sunsupVL ", blen);
+ if (err & INFINIPATH_E_INVALIDADDR)
+ strlcat(buf, "invalidaddr ", blen);
+ if (err & INFINIPATH_E_RRCVEGRFULL)
+ strlcat(buf, "rcvegrfull ", blen);
+ if (err & INFINIPATH_E_RRCVHDRFULL)
+ strlcat(buf, "rcvhdrfull ", blen);
+ if (err & INFINIPATH_E_IBSTATUSCHANGED)
+ strlcat(buf, "ibcstatuschg ", blen);
+ if (err & INFINIPATH_E_RIBLOSTLINK)
+ strlcat(buf, "riblostlink ", blen);
+ if (err & INFINIPATH_E_HARDWARE)
+ strlcat(buf, "hardware ", blen);
+ if (err & INFINIPATH_E_RESET)
+ strlcat(buf, "reset ", blen);
+ if (err & INFINIPATH_E_SDMAERRS)
+ decode_sdma_errs(dd, err, buf, blen);
+ if (err & INFINIPATH_E_INVALIDEEPCMD)
+ strlcat(buf, "invalideepromcmd ", blen);
+done:
+ return iserr;
+}
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used one place now, may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+ /* if no errors, and so don't need to check what's first */
+ *msg = '\0';
+
+ if (err & INFINIPATH_RHF_H_ICRCERR)
+ strlcat(msg, "icrcerr ", len);
+ if (err & INFINIPATH_RHF_H_VCRCERR)
+ strlcat(msg, "vcrcerr ", len);
+ if (err & INFINIPATH_RHF_H_PARITYERR)
+ strlcat(msg, "parityerr ", len);
+ if (err & INFINIPATH_RHF_H_LENERR)
+ strlcat(msg, "lenerr ", len);
+ if (err & INFINIPATH_RHF_H_MTUERR)
+ strlcat(msg, "mtuerr ", len);
+ if (err & INFINIPATH_RHF_H_IHDRERR)
+ /* infinipath hdr checksum error */
+ strlcat(msg, "ipathhdrerr ", len);
+ if (err & INFINIPATH_RHF_H_TIDERR)
+ strlcat(msg, "tiderr ", len);
+ if (err & INFINIPATH_RHF_H_MKERR)
+ /* bad port, offset, etc. */
+ strlcat(msg, "invalid ipathhdr ", len);
+ if (err & INFINIPATH_RHF_H_IBERR)
+ strlcat(msg, "iberr ", len);
+ if (err & INFINIPATH_RHF_L_SWA)
+ strlcat(msg, "swA ", len);
+ if (err & INFINIPATH_RHF_L_SWB)
+ strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
+{
+ return dd->ipath_port0_skbinfo ?
+ (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the sk_buff SFP mask
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+ u32 len;
+
+ /*
+ * Only fully supported way to handle this is to allocate lots
+ * extra, align as needed, and then do skb_reserve(). That wastes
+ * a lot of memory... I'll have to hack this into infinipath_copy
+ * also.
+ */
+
+ /*
+ * We need 2 extra bytes for ipath_ether data sent in the
+ * key header. In order to keep everything dword aligned,
+ * we'll reserve 4 bytes.
+ */
+ len = dd->ipath_ibmaxlen + 4;
+
+ if (dd->ipath_flags & IPATH_4BYTE_TID) {
+ /* We need a 2KB multiple alignment, and there is no way
+ * to do it except to allocate extra and then skb_reserve
+ * enough to bring it up to the right alignment.
+ */
+ len += 2047;
+ }
+
+ skb = __dev_alloc_skb(len, gfp_mask);
+ if (!skb) {
+ ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+ len);
+ goto bail;
+ }
+
+ skb_reserve(skb, 4);
+
+ if (dd->ipath_flags & IPATH_4BYTE_TID) {
+ u32 una = (unsigned long)skb->data & 2047;
+ if (una)
+ skb_reserve(skb, 2048 - una);
+ }
+
+bail:
+ return skb;
+}
+
+static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
+ u32 eflags,
+ u32 l,
+ u32 etail,
+ __le32 *rhf_addr,
+ struct ipath_message_header *hdr)
+{
+ char emsg[128];
+
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+ "tlen=%x opcode=%x egridx=%x: %s\n",
+ eflags, l,
+ ipath_hdrget_rcv_type(rhf_addr),
+ ipath_hdrget_length_in_bytes(rhf_addr),
+ be32_to_cpu(hdr->bth[0]) >> 24,
+ etail, emsg);
+
+ /* Count local link integrity errors. */
+ if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
+ u8 n = (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+
+ if (++dd->ipath_lli_counter > n) {
+ dd->ipath_lli_counter = 0;
+ dd->ipath_lli_errors++;
+ }
+ }
+}
+
+/*
+ * ipath_kreceive - receive a packet
+ * @pd: the infinipath port
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ __le32 *rhf_addr;
+ void *ebuf;
+ const u32 rsize = dd->ipath_rcvhdrentsize; /* words */
+ const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
+ u32 etail = -1, l, hdrqtail;
+ struct ipath_message_header *hdr;
+ u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
+ static u64 totcalls; /* stats, may eventually remove */
+ int last;
+
+ l = pd->port_head;
+ rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+ u32 seq = ipath_hdrget_seq(rhf_addr);
+
+ if (seq != pd->port_seq_cnt)
+ goto bail;
+ hdrqtail = 0;
+ } else {
+ hdrqtail = ipath_get_rcvhdrtail(pd);
+ if (l == hdrqtail)
+ goto bail;
+ smp_rmb();
+ }
+
+reloop:
+ for (last = 0, i = 1; !last; i += !last) {
+ hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
+ eflags = ipath_hdrget_err_flags(rhf_addr);
+ etype = ipath_hdrget_rcv_type(rhf_addr);
+ /* total length */
+ tlen = ipath_hdrget_length_in_bytes(rhf_addr);
+ ebuf = NULL;
+ if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
+ ipath_hdrget_use_egr_buf(rhf_addr) :
+ (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+ /*
+ * It turns out that the chip uses an eager buffer
+ * for all non-expected packets, whether it "needs"
+ * one or not. So always get the index, but don't
+ * set ebuf (so we try to copy data) unless the
+ * length requires it.
+ */
+ etail = ipath_hdrget_index(rhf_addr);
+ updegr = 1;
+ if (tlen > sizeof(*hdr) ||
+ etype == RCVHQ_RCV_TYPE_NON_KD)
+ ebuf = ipath_get_egrbuf(dd, etail);
+ }
+
+ /*
+ * both tiderr and ipathhdrerr are set for all plain IB
+ * packets; only ipathhdrerr should be set.
+ */
+
+ if (etype != RCVHQ_RCV_TYPE_NON_KD &&
+ etype != RCVHQ_RCV_TYPE_ERROR &&
+ ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+ IPS_PROTO_VERSION)
+ ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+ "%x\n", etype);
+
+ if (unlikely(eflags))
+ ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
+ else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+ ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
+ if (dd->ipath_lli_counter)
+ dd->ipath_lli_counter--;
+ } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+ u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
+ u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
+ ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+ "qp=%x), len %x; ignored\n",
+ etype, opcode, qp, tlen);
+ }
+ else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+ ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+ be32_to_cpu(hdr->bth[0]) >> 24);
+ else {
+ /*
+ * error packet, type of error unknown.
+ * Probably type 3, but we don't know, so don't
+ * even try to print the opcode, etc.
+ * Usually caused by a "bad packet", that has no
+ * BTH, when the LRH says it should.
+ */
+ ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
+ " %x, len %x hdrq+%x rhf: %Lx\n",
+ etail, tlen, l, (unsigned long long)
+ le64_to_cpu(*(__le64 *) rhf_addr));
+ if (ipath_debug & __IPATH_ERRPKTDBG) {
+ u32 j, *d, dw = rsize-2;
+ if (rsize > (tlen>>2))
+ dw = tlen>>2;
+ d = (u32 *)hdr;
+ printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
+ dw);
+ for (j = 0; j < dw; j++)
+ printk(KERN_DEBUG "%8x%s", d[j],
+ (j%8) == 7 ? "\n" : " ");
+ printk(KERN_DEBUG ".\n");
+ }
+ }
+ l += rsize;
+ if (l >= maxcnt)
+ l = 0;
+ rhf_addr = (__le32 *) pd->port_rcvhdrq +
+ l + dd->ipath_rhf_offset;
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+ u32 seq = ipath_hdrget_seq(rhf_addr);
+
+ if (++pd->port_seq_cnt > 13)
+ pd->port_seq_cnt = 1;
+ if (seq != pd->port_seq_cnt)
+ last = 1;
+ } else if (l == hdrqtail)
+ last = 1;
+ /*
+ * update head regs on last packet, and every 16 packets.
+ * Reduce bus traffic, while still trying to prevent
+ * rcvhdrq overflows, for when the queue is nearly full
+ */
+ if (last || !(i & 0xf)) {
+ u64 lval = l;
+
+ /* request IBA6120 and 7220 interrupt only on last */
+ if (last)
+ lval |= dd->ipath_rhdrhead_intr_off;
+ ipath_write_ureg(dd, ur_rcvhdrhead, lval,
+ pd->port_port);
+ if (updegr) {
+ ipath_write_ureg(dd, ur_rcvegrindexhead,
+ etail, pd->port_port);
+ updegr = 0;
+ }
+ }
+ }
+
+ if (!dd->ipath_rhdrhead_intr_off && !reloop &&
+ !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ /* IBA6110 workaround; we can have a race clearing chip
+ * interrupt with another interrupt about to be delivered,
+ * and can clear it before it is delivered on the GPIO
+ * workaround. By doing the extra check here for the
+ * in-memory tail register updating while we were doing
+ * earlier packets, we "almost" guarantee we have covered
+ * that case.
+ */
+ u32 hqtail = ipath_get_rcvhdrtail(pd);
+ if (hqtail != hdrqtail) {
+ hdrqtail = hqtail;
+ reloop = 1; /* loop 1 extra time at most */
+ goto reloop;
+ }
+ }
+
+ pkttot += i;
+
+ pd->port_head = l;
+
+ if (pkttot > ipath_stats.sps_maxpkts_call)
+ ipath_stats.sps_maxpkts_call = pkttot;
+ ipath_stats.sps_port0pkts += pkttot;
+ ipath_stats.sps_avgpkts_call =
+ ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int i;
+ const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+ /* If the generation (check) bits have changed, then we update the
+ * busy bit for the corresponding PIO buffer. This algorithm will
+ * modify positions to the value they already have in some cases
+ * (i.e., no change), but it's faster than changing only the bits
+ * that have changed.
+ *
+ * We would like to do this atomicly, to avoid spinlocks in the
+ * critical send path, but that's not really possible, given the
+ * type of changes, and that this routine could be called on
+ * multiple cpu's simultaneously, so we lock in this routine only,
+ * to avoid conflicting updates; all we change is the shadow, and
+ * it's a single 64 bit memory location, so by definition the update
+ * is atomic in terms of what other cpu's can see in testing the
+ * bits. The spin_lock overhead isn't too bad, since it only
+ * happens when all buffers are in use, so only cpu overhead, not
+ * latency or bandwidth is affected.
+ */
+ if (!dd->ipath_pioavailregs_dma) {
+ ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+ return;
+ }
+ if (ipath_debug & __IPATH_VERBDBG) {
+ /* only if packet debug and verbose */
+ volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+
+ ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+ "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+ "s3=%lx\n",
+ (unsigned long long) le64_to_cpu(dma[0]),
+ shadow[0],
+ (unsigned long long) le64_to_cpu(dma[1]),
+ shadow[1],
+ (unsigned long long) le64_to_cpu(dma[2]),
+ shadow[2],
+ (unsigned long long) le64_to_cpu(dma[3]),
+ shadow[3]);
+ if (piobregs > 4)
+ ipath_cdbg(
+ PKT, "2nd group, dma4=%llx shad4=%lx, "
+ "d5=%llx s5=%lx, d6=%llx s6=%lx, "
+ "d7=%llx s7=%lx\n",
+ (unsigned long long) le64_to_cpu(dma[4]),
+ shadow[4],
+ (unsigned long long) le64_to_cpu(dma[5]),
+ shadow[5],
+ (unsigned long long) le64_to_cpu(dma[6]),
+ shadow[6],
+ (unsigned long long) le64_to_cpu(dma[7]),
+ shadow[7]);
+ }
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (i = 0; i < piobregs; i++) {
+ u64 pchbusy, pchg, piov, pnew;
+ /*
+ * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+ */
+ if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+ piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+ else
+ piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+ pchg = dd->ipath_pioavailkernel[i] &
+ ~(dd->ipath_pioavailshadow[i] ^ piov);
+ pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+ if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+ pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+ pnew |= piov & pchbusy;
+ dd->ipath_pioavailshadow[i] = pnew;
+ }
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/*
+ * used to force update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily due to exitting freeze mode after recovering
+ * from errors. Done lazily, because it's safer (known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+ int i, im;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (i = 0; i < dd->ipath_pioavregs; i++) {
+ u64 val, oldval;
+ /* deal with 6110 chip bug on high register #s */
+ im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+ i ^ 1 : i;
+ val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+ /*
+ * busy out the buffers not in the kernel avail list,
+ * without changing the generation bits.
+ */
+ oldval = dd->ipath_pioavailshadow[i];
+ dd->ipath_pioavailshadow[i] = val |
+ ((~dd->ipath_pioavailkernel[i] <<
+ INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+ 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+ if (oldval != dd->ipath_pioavailshadow[i])
+ ipath_dbg("shadow[%d] was %Lx, now %lx\n",
+ i, (unsigned long long) oldval,
+ dd->ipath_pioavailshadow[i]);
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+ int ret = 0;
+
+ if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+ if (dd->ipath_rcvhdrsize != rhdrsize) {
+ dev_info(&dd->pcidev->dev,
+ "Error: can't set protocol header "
+ "size %u, already %u\n",
+ rhdrsize, dd->ipath_rcvhdrsize);
+ ret = -EAGAIN;
+ } else
+ ipath_cdbg(VERBOSE, "Reuse same protocol header "
+ "size %u\n", dd->ipath_rcvhdrsize);
+ } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+ (sizeof(u64) / sizeof(u32)))) {
+ ipath_dbg("Error: can't set protocol header size %u "
+ "(> max %u)\n", rhdrsize,
+ dd->ipath_rcvhdrentsize -
+ (u32) (sizeof(u64) / sizeof(u32)));
+ ret = -EOVERFLOW;
+ } else {
+ dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+ dd->ipath_rcvhdrsize = rhdrsize;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+ ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+ dd->ipath_rcvhdrsize);
+ }
+ return ret;
+}
+
+/*
+ * debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_pio_bufs(struct ipath_devdata *dd)
+{
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+ __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
+
+ dd->ipath_upd_pio_shadow = 1;
+
+ /*
+ * not atomic, but if we lose a stat count in a while, that's OK
+ */
+ ipath_stats.sps_nopiobufs++;
+ if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+ ipath_force_pio_avail_update(dd); /* at start */
+ ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+ "%llx %llx %llx %llx\n"
+ "ipath shadow: %lx %lx %lx %lx\n",
+ dd->ipath_consec_nopiobuf,
+ (unsigned long)get_cycles(),
+ (unsigned long long) le64_to_cpu(dma[0]),
+ (unsigned long long) le64_to_cpu(dma[1]),
+ (unsigned long long) le64_to_cpu(dma[2]),
+ (unsigned long long) le64_to_cpu(dma[3]),
+ shadow[0], shadow[1], shadow[2], shadow[3]);
+ /*
+ * 4 buffers per byte, 4 registers above, cover rest
+ * below
+ */
+ if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+ (sizeof(shadow[0]) * 4 * 4))
+ ipath_dbg("2nd group: dmacopy: "
+ "%llx %llx %llx %llx\n"
+ "ipath shadow: %lx %lx %lx %lx\n",
+ (unsigned long long)le64_to_cpu(dma[4]),
+ (unsigned long long)le64_to_cpu(dma[5]),
+ (unsigned long long)le64_to_cpu(dma[6]),
+ (unsigned long long)le64_to_cpu(dma[7]),
+ shadow[4], shadow[5], shadow[6], shadow[7]);
+
+ /* at end, so update likely happened */
+ ipath_reset_availshadow(dd);
+ }
+}
+
+/*
+ * common code for normal driver pio buffer allocation, and reserved
+ * allocation.
+ *
+ * do appropriate marking as busy, etc.
+ * returns buffer number if one found (>=0), negative number is error.
+ */
+static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
+ u32 *pbufnum, u32 first, u32 last, u32 firsti)
+{
+ int i, j, updated = 0;
+ unsigned piobcnt;
+ unsigned long flags;
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+ u32 __iomem *buf;
+
+ piobcnt = last - first;
+ if (dd->ipath_upd_pio_shadow) {
+ /*
+ * Minor optimization. If we had no buffers on last call,
+ * start out by doing the update; continue and do scan even
+ * if no buffers were updated, to be paranoid
+ */
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ } else
+ i = firsti;
+rescan:
+ /*
+ * while test_and_set_bit() is atomic, we do that and then the
+ * change_bit(), and the pair is not. See if this is the cause
+ * of the remaining armlaunch errors.
+ */
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (j = 0; j < piobcnt; j++, i++) {
+ if (i >= last)
+ i = first;
+ if (__test_and_set_bit((2 * i) + 1, shadow))
+ continue;
+ /* flip generation bit */
+ __change_bit(2 * i, shadow);
+ break;
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+ if (j == piobcnt) {
+ if (!updated) {
+ /*
+ * first time through; shadow exhausted, but may be
+ * buffers available, try an update and then rescan.
+ */
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ goto rescan;
+ } else if (updated == 1 && piobcnt <=
+ ((dd->ipath_sendctrl
+ >> INFINIPATH_S_UPDTHRESH_SHIFT) &
+ INFINIPATH_S_UPDTHRESH_MASK)) {
+ /*
+ * for chips supporting and using the update
+ * threshold we need to force an update of the
+ * in-memory copy if the count is less than the
+ * thershold, then check one more time.
+ */
+ ipath_force_pio_avail_update(dd);
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ goto rescan;
+ }
+
+ no_pio_bufs(dd);
+ buf = NULL;
+ } else {
+ if (i < dd->ipath_piobcnt2k)
+ buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+ i * dd->ipath_palign);
+ else
+ buf = (u32 __iomem *)
+ (dd->ipath_pio4kbase +
+ (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+ if (pbufnum)
+ *pbufnum = i;
+ }
+
+ return buf;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @plen: the size of the PIO buffer needed in 32-bit words
+ * @pbufnum: the buffer number is placed here
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
+{
+ u32 __iomem *buf;
+ u32 pnum, nbufs;
+ u32 first, lasti;
+
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
+ first = dd->ipath_piobcnt2k;
+ lasti = dd->ipath_lastpioindexl;
+ } else {
+ first = 0;
+ lasti = dd->ipath_lastpioindex;
+ }
+ nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
+
+ if (buf) {
+ /*
+ * Set next starting place. It's just an optimization,
+ * it doesn't matter who wins on this, so no locking
+ */
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+ dd->ipath_lastpioindexl = pnum + 1;
+ else
+ dd->ipath_lastpioindex = pnum + 1;
+ if (dd->ipath_upd_pio_shadow)
+ dd->ipath_upd_pio_shadow = 0;
+ if (dd->ipath_consec_nopiobuf)
+ dd->ipath_consec_nopiobuf = 0;
+ ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+ pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+ if (pbufnum)
+ *pbufnum = pnum;
+
+ }
+ return buf;
+}
+
+/**
+ * ipath_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the infinipath device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+ unsigned len, int avail)
+{
+ unsigned long flags;
+ unsigned end, cnt = 0;
+
+ /* There are two bits per send buffer (busy and generation) */
+ start *= 2;
+ end = start + len * 2;
+
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ /* Set or clear the busy bit in the shadow. */
+ while (start < end) {
+ if (avail) {
+ unsigned long dma;
+ int i, im;
+ /*
+ * the BUSY bit will never be set, because we disarm
+ * the user buffers before we hand them back to the
+ * kernel. We do have to make sure the generation
+ * bit is set correctly in shadow, since it could
+ * have changed many times while allocated to user.
+ * We can't use the bitmap functions on the full
+ * dma array because it is always little-endian, so
+ * we have to flip to host-order first.
+ * BITS_PER_LONG is slightly wrong, since it's
+ * always 64 bits per register in chip...
+ * We only work on 64 bit kernels, so that's OK.
+ */
+ /* deal with 6110 chip bug on high register #s */
+ i = start / BITS_PER_LONG;
+ im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+ i ^ 1 : i;
+ __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ dma = (unsigned long) le64_to_cpu(
+ dd->ipath_pioavailregs_dma[im]);
+ if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start) % BITS_PER_LONG, &dma))
+ __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ else
+ __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ __set_bit(start, dd->ipath_pioavailkernel);
+ } else {
+ __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
+ dd->ipath_pioavailshadow);
+ __clear_bit(start, dd->ipath_pioavailkernel);
+ }
+ start += 2;
+ }
+
+ if (dd->ipath_pioupd_thresh) {
+ end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+ cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+ /*
+ * When moving buffers from kernel to user, if number assigned to
+ * the user is less than the pio update threshold, and threshold
+ * is supported (cnt was computed > 0), drop the update threshold
+ * so we update at least once per allocated number of buffers.
+ * In any case, if the kernel buffers are less than the threshold,
+ * drop the threshold. We don't bother increasing it, having once
+ * decreased it, since it would typically just cycle back and forth.
+ * If we don't decrease below buffers in use, we can wait a long
+ * time for an update, until some other context uses PIO buffers.
+ */
+ if (!avail && len < cnt)
+ cnt = len;
+ if (cnt < dd->ipath_pioupd_thresh) {
+ dd->ipath_pioupd_thresh = cnt;
+ ipath_dbg("Decreased pio update threshold to %u\n",
+ dd->ipath_pioupd_thresh);
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+ << INFINIPATH_S_UPDTHRESH_SHIFT);
+ dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+ << INFINIPATH_S_UPDTHRESH_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+}
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+ struct ipath_portdata *pd)
+{
+ int ret = 0;
+
+ if (!pd->port_rcvhdrq) {
+ dma_addr_t phys_hdrqtail;
+ gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+ int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE);
+
+ pd->port_rcvhdrq = dma_alloc_coherent(
+ &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+ gfp_flags);
+
+ if (!pd->port_rcvhdrq) {
+ ipath_dev_err(dd, "attempt to allocate %d bytes "
+ "for port %u rcvhdrq failed\n",
+ amt, pd->port_port);
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+ GFP_KERNEL);
+ if (!pd->port_rcvhdrtail_kvaddr) {
+ ipath_dev_err(dd, "attempt to allocate 1 page "
+ "for port %u rcvhdrqtailaddr "
+ "failed\n", pd->port_port);
+ ret = -ENOMEM;
+ dma_free_coherent(&dd->pcidev->dev, amt,
+ pd->port_rcvhdrq,
+ pd->port_rcvhdrq_phys);
+ pd->port_rcvhdrq = NULL;
+ goto bail;
+ }
+ pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
+ ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
+ "physical\n", pd->port_port,
+ (unsigned long long) phys_hdrqtail);
+ }
+
+ pd->port_rcvhdrq_size = amt;
+
+ ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+ "for port %u rcvhdr Q\n",
+ amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+ (unsigned long) pd->port_rcvhdrq_phys,
+ (unsigned long) pd->port_rcvhdrq_size,
+ pd->port_port);
+ }
+ else
+ ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
+ "hdrtailaddr@%p %llx physical\n",
+ pd->port_port, pd->port_rcvhdrq,
+ (unsigned long long) pd->port_rcvhdrq_phys,
+ pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+ pd->port_rcvhdrqtailaddr_phys);
+
+ /* clear for security and sanity on each use */
+ memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
+ if (pd->port_rcvhdrtail_kvaddr)
+ memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+
+ /*
+ * tell chip each time we init it, even if we are re-using previous
+ * memory (we zero the register at process close)
+ */
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+ pd->port_port, pd->port_rcvhdrqtailaddr_phys);
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+ pd->port_port, pd->port_rcvhdrq_phys);
+
+bail:
+ return ret;
+}
+
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent. Used whenever we need to be
+ * sure the send side is idle. Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo. The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if normal send had happened.
+ */
+void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
+{
+ unsigned long flags;
+
+ if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+ ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
+ goto bail;
+ }
+ /*
+ * If we have SDMA, and it's not disabled, we have to kick off the
+ * abort state machine, provided we aren't already aborting.
+ * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
+ * we skip the rest of this routine. It is already "in progress"
+ */
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
+ int skip_cancel;
+ unsigned long *statp = &dd->ipath_sdma_status;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ skip_cancel =
+ test_and_set_bit(IPATH_SDMA_ABORTING, statp)
+ && !test_bit(IPATH_SDMA_DISABLED, statp);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (skip_cancel)
+ goto bail;
+ }
+
+ ipath_dbg("Cancelling all in-progress send buffers\n");
+
+ /* skip armlaunch errs for a while */
+ dd->ipath_lastcancel = jiffies + HZ / 2;
+
+ /*
+ * The abort bit is auto-clearing. We also don't want pioavail
+ * update happening during this, and we don't want any other
+ * sends going out, so turn those off for the duration. We read
+ * the scratch register to be sure that cancels and the abort
+ * have taken effect in the chip. Otherwise two parts are same
+ * as ipath_force_pio_avail_update()
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
+ | INFINIPATH_S_PIOENABLE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl | INFINIPATH_S_ABORT);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* disarm all send buffers */
+ ipath_disarm_piobufs(dd, 0,
+ dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+
+ if (restore_sendctrl) {
+ /* else done by caller later if needed */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
+ INFINIPATH_S_PIOENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ /* and again, be sure all have hit the chip */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+
+ if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
+ !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
+ test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ /* only wait so long for intr */
+ dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
+ dd->ipath_sdma_reset_wait = 200;
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ }
+bail:;
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons. We read the scratch register
+ * to make it highly likely that the update will have happened by the
+ * time we return. If already off (as in cancel_sends above), this
+ * routine is a nop, on the assumption that the caller will "do the
+ * right thing".
+ */
+void ipath_force_pio_avail_update(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
+ int linitcmd)
+{
+ u64 mod_wd;
+ static const char *what[4] = {
+ [0] = "NOP",
+ [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
+ [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+ [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+ };
+
+ if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
+ /*
+ * If we are told to disable, note that so link-recovery
+ * code does not attempt to bring us back up.
+ */
+ preempt_disable();
+ dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+ preempt_enable();
+ } else if (linitcmd) {
+ /*
+ * Any other linkinitcmd will lead to LINKDOWN and then
+ * to INIT (if all is well), so clear flag to let
+ * link-recovery code attempt to bring us back up.
+ */
+ preempt_disable();
+ dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+ preempt_enable();
+ }
+
+ mod_wd = (linkcmd << dd->ibcc_lc_shift) |
+ (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+ ipath_cdbg(VERBOSE,
+ "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
+ dd->ipath_unit, what[linkcmd], linitcmd,
+ ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl | mod_wd);
+ /* read from chip so write is flushed */
+ (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+}
+
+int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+ u32 lstate;
+ int ret;
+
+ switch (newstate) {
+ case IPATH_IB_LINKDOWN_ONLY:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_POLL);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN_SLEEP:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_SLEEP);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN_DISABLE:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKARM:
+ if (dd->ipath_flags & IPATH_LINKARMED) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(dd->ipath_flags &
+ (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
+
+ /*
+ * Since the port can transition to ACTIVE by receiving
+ * a non VL 15 packet, wait for either state.
+ */
+ lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+ break;
+
+ case IPATH_IB_LINKACTIVE:
+ if (dd->ipath_flags & IPATH_LINKACTIVE) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
+ lstate = IPATH_LINKACTIVE;
+ break;
+
+ case IPATH_IB_LINK_LOOPBACK:
+ dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
+ dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+
+ /* turn heartbeat off, as it causes loopback to fail */
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_OFF);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINK_EXTERNAL:
+ dev_info(&dd->pcidev->dev,
+ "Disabling IB local loopback (normal)\n");
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_ON);
+ dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ /*
+ * Heartbeat can be explicitly enabled by the user via
+ * "hrtbt_enable" "file", and if disabled, trying to enable here
+ * will have no effect. Implicit changes (heartbeat off when
+ * loopback on, and vice versa) are included to ease testing.
+ */
+ case IPATH_IB_LINK_HRTBT:
+ ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_ON);
+ goto bail;
+
+ case IPATH_IB_LINK_NO_HRTBT:
+ ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_OFF);
+ goto bail;
+
+ default:
+ ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+ ret = -EINVAL;
+ goto bail;
+ }
+ ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * we can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size. For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+ u32 piosize;
+ int changed = 0;
+ int ret;
+
+ /*
+ * mtu is IB data payload max. It's the largest power of 2 less
+ * than piosize (or even larger, since it only really controls the
+ * largest we can receive; we can send the max of the mtu and
+ * piosize). We check that it's one of the valid IB sizes.
+ */
+ if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+ (arg != 4096 || !ipath_mtu4096)) {
+ ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (dd->ipath_ibmtu == arg) {
+ ret = 0; /* same as current */
+ goto bail;
+ }
+
+ piosize = dd->ipath_ibmaxlen;
+ dd->ipath_ibmtu = arg;
+
+ if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+ /* Only if it's not the initial value (or reset to it) */
+ if (piosize != dd->ipath_init_ibmaxlen) {
+ if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+ piosize = dd->ipath_init_ibmaxlen;
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+ } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+ piosize = arg + IPATH_PIO_MAXIBHDR;
+ ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+ "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+ arg);
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+
+ if (changed) {
+ u64 ibc = dd->ipath_ibcctrl, ibdw;
+ /*
+ * update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+ */
+ dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+ ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
+ ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+ dd->ibcc_mpl_shift);
+ ibc |= ibdw << dd->ibcc_mpl_shift;
+ dd->ipath_ibcctrl = ibc;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ dd->ipath_f_tidtemplate(dd);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+ dd->ipath_lid = lid;
+ dd->ipath_lmc = lmc;
+
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+ (~((1U << lmc) - 1)) << 16);
+
+ dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+ return 0;
+}
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that vary with the chip implementation constants (port)
+ * use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+ unsigned port, u64 value)
+{
+ u16 where;
+
+ if (port < dd->ipath_portcnt &&
+ (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+ regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+ where = regno + port;
+ else
+ where = -1;
+
+ ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDS, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+ int timeoff;
+ int pidx;
+ u64 lstate, ltstate, val;
+
+ if (!(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ pidx = dd->ipath_led_override_phase++ & 1;
+ dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+ timeoff = dd->ipath_led_override_timeoff;
+
+ /*
+ * below potentially restores the LED values per current status,
+ * should also possibly setup the traffic-blink register,
+ * but leave that to per-chip functions.
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+ ltstate = ipath_ib_linktrstate(dd, val);
+ lstate = ipath_ib_linkstate(dd, val);
+
+ dd->ipath_f_setextled(dd, lstate, ltstate);
+ mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
+}
+
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
+{
+ int timeoff, freq;
+
+ if (!(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ /* First check if we are blinking. If not, use 1HZ polling */
+ timeoff = HZ;
+ freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+ if (freq) {
+ /* For blink, set each phase from one nybble of val */
+ dd->ipath_led_override_vals[0] = val & 0xF;
+ dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
+ timeoff = (HZ << 4)/freq;
+ } else {
+ /* Non-blink set both phases the same. */
+ dd->ipath_led_override_vals[0] = val & 0xF;
+ dd->ipath_led_override_vals[1] = val & 0xF;
+ }
+ dd->ipath_led_override_timeoff = timeoff;
+
+ /*
+ * If the timer has not already been started, do so. Use a "quick"
+ * timeout so the function will be called soon, to look at our request.
+ */
+ if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
+ /* Need to start timer */
+ init_timer(&dd->ipath_led_override_timer);
+ dd->ipath_led_override_timer.function =
+ ipath_run_led_override;
+ dd->ipath_led_override_timer.data = (unsigned long) dd;
+ dd->ipath_led_override_timer.expires = jiffies + 1;
+ add_timer(&dd->ipath_led_override_timer);
+ } else
+ atomic_dec(&dd->ipath_led_override_timer_active);
+}
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled. It does not free any data structures.
+ * Everything it does has to be setup again by ipath_init_chip(dd,1)
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ ipath_dbg("Shutting down the device\n");
+
+ ipath_hol_up(dd); /* make sure user processes aren't suspended */
+
+ dd->ipath_flags |= IPATH_LINKUNK;
+ dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+ IPATH_LINKINIT | IPATH_LINKARMED |
+ IPATH_LINKACTIVE);
+ *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+ IPATH_STATUS_IB_READY);
+
+ /* mask interrupts, but not errors */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+ dd->ipath_rcvctrl = 0;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ teardown_sdma(dd);
+
+ /*
+ * gracefully stop all sends allowing any in progress to trickle out
+ * first.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl = 0;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ /* flush it */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /*
+ * enough for anything that's going to trickle out to have actually
+ * done so.
+ */
+ udelay(5);
+
+ dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
+
+ ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+ ipath_cancel_sends(dd, 0);
+
+ /*
+ * we are shutting down, so tell components that care. We don't do
+ * this on just a link state change, much like ethernet, a cable
+ * unplug, etc. doesn't change driver state
+ */
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
+ /* disable IBC */
+ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control | INFINIPATH_C_FREEZEMODE);
+
+ /*
+ * clear SerdesEnable and turn the leds off; do this here because
+ * we are unloading, so don't count on interrupts to move along
+ * Turn the LEDs off explicitly for the same reason.
+ */
+ dd->ipath_f_quiet_serdes(dd);
+
+ /* stop all the timers that might still be running */
+ del_timer_sync(&dd->ipath_hol_timer);
+ if (dd->ipath_stats_timer_active) {
+ del_timer_sync(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer_active = 0;
+ }
+ if (dd->ipath_intrchk_timer.data) {
+ del_timer_sync(&dd->ipath_intrchk_timer);
+ dd->ipath_intrchk_timer.data = 0;
+ }
+ if (atomic_read(&dd->ipath_led_override_timer_active)) {
+ del_timer_sync(&dd->ipath_led_override_timer);
+ atomic_set(&dd->ipath_led_override_timer_active, 0);
+ }
+
+ /*
+ * clear all interrupts and errors, so that the next time the driver
+ * is loaded or device is enabled, we know that whatever is set
+ * happened while we were unloaded
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+ ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
+ ipath_update_eeprom_log(dd);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @pd: the portdata structure
+ *
+ * free up any allocated data for a port
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of port data, because it is called after ipath_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ * (The only exception to global state is freeing the port0 port0_skbs.)
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
+{
+ if (!pd)
+ return;
+
+ if (pd->port_rcvhdrq) {
+ ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+ "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+ (unsigned long) pd->port_rcvhdrq_size);
+ dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+ pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+ pd->port_rcvhdrq = NULL;
+ if (pd->port_rcvhdrtail_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ pd->port_rcvhdrtail_kvaddr,
+ pd->port_rcvhdrqtailaddr_phys);
+ pd->port_rcvhdrtail_kvaddr = NULL;
+ }
+ }
+ if (pd->port_port && pd->port_rcvegrbuf) {
+ unsigned e;
+
+ for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+ void *base = pd->port_rcvegrbuf[e];
+ size_t size = pd->port_rcvegrbuf_size;
+
+ ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+ "chunk %u/%u\n", base,
+ (unsigned long) size,
+ e, pd->port_rcvegrbuf_chunks);
+ dma_free_coherent(&dd->pcidev->dev, size,
+ base, pd->port_rcvegrbuf_phys[e]);
+ }
+ kfree(pd->port_rcvegrbuf);
+ pd->port_rcvegrbuf = NULL;
+ kfree(pd->port_rcvegrbuf_phys);
+ pd->port_rcvegrbuf_phys = NULL;
+ pd->port_rcvegrbuf_chunks = 0;
+ } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
+ unsigned e;
+ struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
+
+ dd->ipath_port0_skbinfo = NULL;
+ ipath_cdbg(VERBOSE, "free closed port %d "
+ "ipath_port0_skbinfo @ %p\n", pd->port_port,
+ skbinfo);
+ for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
+ if (skbinfo[e].skb) {
+ pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+ dd->ipath_ibmaxlen,
+ PCI_DMA_FROMDEVICE);
+ dev_kfree_skb(skbinfo[e].skb);
+ }
+ vfree(skbinfo);
+ }
+ kfree(pd->port_tid_pg_list);
+ vfree(pd->subport_uregbase);
+ vfree(pd->subport_rcvegrbuf);
+ vfree(pd->subport_rcvhdr_base);
+ kfree(pd);
+}
+
+static int __init infinipath_init(void)
+{
+ int ret;
+
+ if (ipath_debug & __IPATH_DBG)
+ printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
+
+ /*
+ * These must be called before the driver is registered with
+ * the PCI subsystem.
+ */
+ idr_init(&unit_table);
+
+ ret = pci_register_driver(&ipath_driver);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Unable to register driver: error %d\n", -ret);
+ goto bail_unit;
+ }
+
+ ret = ipath_init_ipathfs();
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+ "ipathfs: error %d\n", -ret);
+ goto bail_pci;
+ }
+
+ goto bail;
+
+bail_pci:
+ pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+ idr_destroy(&unit_table);
+
+bail:
+ return ret;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+ ipath_exit_ipathfs();
+
+ ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+ pci_unregister_driver(&ipath_driver);
+
+ idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize. For
+ * now, we only allow this if no user ports are open that use chip resources
+ */
+int ipath_reset_device(int unit)
+{
+ int ret, i;
+ struct ipath_devdata *dd = ipath_lookup(unit);
+ unsigned long flags;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (atomic_read(&dd->ipath_led_override_timer_active)) {
+ /* Need to stop LED timer, _then_ shut off LEDs */
+ del_timer_sync(&dd->ipath_led_override_timer);
+ atomic_set(&dd->ipath_led_override_timer_active, 0);
+ }
+
+ /* Shut off LEDs after we are sure timer is not running */
+ dd->ipath_led_override = LED_OVER_BOTH_OFF;
+ dd->ipath_f_setextled(dd, 0, 0);
+
+ dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+ dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+ "not initialized or not present\n", unit);
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ if (dd->ipath_pd)
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+ continue;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ ipath_dbg("unit %u port %d is in use "
+ "(PID %u cmd %s), can't reset\n",
+ unit, i,
+ pid_nr(dd->ipath_pd[i]->port_pid),
+ dd->ipath_pd[i]->port_comm);
+ ret = -EBUSY;
+ goto bail;
+ }
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ teardown_sdma(dd);
+
+ dd->ipath_flags &= ~IPATH_INITTED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+ ret = dd->ipath_f_reset(dd);
+ if (ret == 1) {
+ ipath_dbg("Reinitializing unit %u after reset attempt\n",
+ unit);
+ ret = ipath_init_chip(dd, 1);
+ } else
+ ret = -EAGAIN;
+ if (ret)
+ ipath_dev_err(dd, "Reinitialize unit %u after "
+ "reset failed with %d\n", unit, ret);
+ else
+ dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+ "resetting\n", unit);
+
+bail:
+ return ret;
+}
+
+/*
+ * send a signal to all the processes that have the driver open
+ * through the normal interfaces (i.e., everything other than diags
+ * interface). Returns number of signalled processes.
+ */
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+{
+ int i, sub, any = 0;
+ struct pid *pid;
+ unsigned long flags;
+
+ if (!dd->ipath_pd)
+ return 0;
+
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+ continue;
+ pid = dd->ipath_pd[i]->port_pid;
+ if (!pid)
+ continue;
+
+ dev_info(&dd->pcidev->dev, "context %d in use "
+ "(PID %u), sending signal %d\n",
+ i, pid_nr(pid), sig);
+ kill_pid(pid, sig, 1);
+ any++;
+ for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
+ pid = dd->ipath_pd[i]->port_subpid[sub];
+ if (!pid)
+ continue;
+ dev_info(&dd->pcidev->dev, "sub-context "
+ "%d:%d in use (PID %u), sending "
+ "signal %d\n", i, sub, pid_nr(pid), sig);
+ kill_pid(pid, sig, 1);
+ any++;
+ }
+ }
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ return any;
+}
+
+static void ipath_hol_signal_down(struct ipath_devdata *dd)
+{
+ if (ipath_signal_procs(dd, SIGSTOP))
+ ipath_dbg("Stopped some processes\n");
+ ipath_cancel_sends(dd, 1);
+}
+
+
+static void ipath_hol_signal_up(struct ipath_devdata *dd)
+{
+ if (ipath_signal_procs(dd, SIGCONT))
+ ipath_dbg("Continued some processes\n");
+}
+
+/*
+ * link is down, stop any users processes, and flush pending sends
+ * to prevent HoL blocking, then start the HoL timer that
+ * periodically continues, then stop procs, so they can detect
+ * link down if they want, and do something about it.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void ipath_hol_down(struct ipath_devdata *dd)
+{
+ dd->ipath_hol_state = IPATH_HOL_DOWN;
+ ipath_hol_signal_down(dd);
+ dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+ dd->ipath_hol_timer.expires = jiffies +
+ msecs_to_jiffies(ipath_hol_timeout_ms);
+ mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+}
+
+/*
+ * link is up, continue any user processes, and ensure timer
+ * is a nop, if running. Let timer keep running, if set; it
+ * will nop when it sees the link is up
+ */
+void ipath_hol_up(struct ipath_devdata *dd)
+{
+ ipath_hol_signal_up(dd);
+ dd->ipath_hol_state = IPATH_HOL_UP;
+}
+
+/*
+ * toggle the running/not running state of user proceses
+ * to prevent HoL blocking on chip resources, but still allow
+ * user processes to do link down special case handling.
+ * Should only be called via the timer
+ */
+void ipath_hol_event(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
+ && dd->ipath_hol_state != IPATH_HOL_UP) {
+ dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+ ipath_dbg("Stopping processes\n");
+ ipath_hol_signal_down(dd);
+ } else { /* may do "extra" if also in ipath_hol_up() */
+ dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
+ ipath_dbg("Continuing processes\n");
+ ipath_hol_signal_up(dd);
+ }
+ if (dd->ipath_hol_state == IPATH_HOL_UP)
+ ipath_dbg("link's up, don't resched timer\n");
+ else {
+ dd->ipath_hol_timer.expires = jiffies +
+ msecs_to_jiffies(ipath_hol_timeout_ms);
+ mod_timer(&dd->ipath_hol_timer,
+ dd->ipath_hol_timer.expires);
+ }
+}
+
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
+{
+ u64 val;
+
+ if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
+ return -1;
+ if (dd->ipath_rx_pol_inv != new_pol_inv) {
+ dd->ipath_rx_pol_inv = new_pol_inv;
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+ INFINIPATH_XGXS_RX_POL_SHIFT);
+ val |= ((u64)dd->ipath_rx_pol_inv) <<
+ INFINIPATH_XGXS_RX_POL_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+ }
+ return 0;
+}
+
+/*
+ * Disable and enable the armlaunch error. Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based. Safe for the
+ * driver check, since it's at init. Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code. There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+ dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ INFINIPATH_E_SPIOARMLAUNCH);
+ dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+ /* so don't re-enable if already set */
+ dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/staging/rdma/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c
new file mode 100644
index 000000000000..fc7181985e8e
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_eeprom.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom. This is not a generic
+ * I2C interface. For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device. It looks like one
+ * electrically, but not logically. Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit
+ * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.) The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses. It's designed to be the only device on a given I2C
+ * bus. A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible. For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only. It doesn't
+ * allow writing to an eeprom. It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
+
+/* Added functionality for IBA7220-based cards */
+#define IPATH_EEPROM_DEV_V1 0xA0
+#define IPATH_EEPROM_DEV_V2 0xA2
+#define IPATH_TEMP_DEV 0x98
+#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
+#define IPATH_NO_DEV (0xFF)
+
+/*
+ * The number of I2C chains is proliferating. Table below brings
+ * some order to the madness. The basic principle is that the
+ * table is scanned from the top, and a "probe" is made to the
+ * device probe_dev. If that succeeds, the chain is considered
+ * to be of that type, and dd->i2c_chain_type is set to the index+1
+ * of the entry.
+ * The +1 is so static initialization can mean "unknown, do probe."
+ */
+static struct i2c_chain_desc {
+ u8 probe_dev; /* If seen at probe, chain is this type */
+ u8 eeprom_dev; /* Dev addr (if any) for EEPROM */
+ u8 temp_dev; /* Dev Addr (if any) for Temp-sense */
+} i2c_chains[] = {
+ { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
+ { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
+ { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
+ { IPATH_NO_DEV }
+};
+
+enum i2c_type {
+ i2c_line_scl = 0,
+ i2c_line_sda
+};
+
+enum i2c_state {
+ i2c_line_low = 0,
+ i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+ enum i2c_type line,
+ enum i2c_state new_line_state)
+{
+ u64 out_mask, dir_mask, *gpioval;
+ unsigned long flags = 0;
+
+ gpioval = &dd->ipath_gpio_out;
+
+ if (line == i2c_line_scl) {
+ dir_mask = dd->ipath_gpio_scl;
+ out_mask = (1UL << dd->ipath_gpio_scl_num);
+ } else {
+ dir_mask = dd->ipath_gpio_sda;
+ out_mask = (1UL << dd->ipath_gpio_sda_num);
+ }
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ if (new_line_state == i2c_line_high) {
+ /* tri-state the output rather than force high */
+ dd->ipath_extctrl &= ~dir_mask;
+ } else {
+ /* config line to be an output */
+ dd->ipath_extctrl |= dir_mask;
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+
+ /* set output as well (no real verify) */
+ if (new_line_state == i2c_line_high)
+ *gpioval |= out_mask;
+ else
+ *gpioval &= ~out_mask;
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ return 0;
+}
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error. curr_state is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+ enum i2c_type line,
+ enum i2c_state *curr_statep)
+{
+ u64 read_val, mask;
+ int ret;
+ unsigned long flags = 0;
+
+ /* check args */
+ if (curr_statep == NULL) {
+ ret = 1;
+ goto bail;
+ }
+
+ /* config line to be an input */
+ if (line == i2c_line_scl)
+ mask = dd->ipath_gpio_scl;
+ else
+ mask = dd->ipath_gpio_sda;
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ dd->ipath_extctrl &= ~mask;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+ /*
+ * Below is very unlikely to reflect true input state if Output
+ * Enable actually changed.
+ */
+ read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ if (read_val & mask)
+ *curr_statep = i2c_line_high;
+ else
+ *curr_statep = i2c_line_low;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip. Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+ (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+ rmb();
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
+ udelay(1);
+ i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+ i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+ i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+ i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+ enum i2c_state bit;
+
+ if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+ ipath_dbg("get bit failed!\n");
+
+ if (wait)
+ i2c_wait_for_writes(dd);
+
+ return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+ u8 ack_received;
+
+ /* AT ENTRY SCL = LOW */
+ /* change direction, ignore data */
+ ack_received = sda_in(dd, 1);
+ scl_out(dd, i2c_line_high);
+ ack_received = sda_in(dd, 1) == 0;
+ scl_out(dd, i2c_line_low);
+ return ack_received;
+}
+
+/**
+ * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
+ * @dd: the infinipath device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct ipath_devdata *dd)
+{
+ int bit_cntr, data;
+
+ data = 0;
+
+ for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+ data <<= 1;
+ scl_out(dd, i2c_line_high);
+ data |= sda_in(dd, 0);
+ scl_out(dd, i2c_line_low);
+ }
+ return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+ int bit_cntr;
+ u8 bit;
+
+ for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+ bit = (data >> bit_cntr) & 1;
+ sda_out(dd, bit);
+ scl_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_low);
+ }
+ return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+ int res;
+
+ /* issue start sequence */
+ sda_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_low);
+
+ /* issue length and direction byte */
+ res = wr_byte(dd, offset_dir);
+
+ if (res)
+ ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+ return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_high);
+ udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+ int clock_cycles_left = 9;
+ u64 *gpioval = &dd->ipath_gpio_out;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ /* Make sure shadows are consistent */
+ dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+ *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+ "is %llx\n", (unsigned long long) *gpioval);
+
+ /*
+ * This is to get the i2c into a known state, by first going low,
+ * then tristate sda (and then tristate scl as first thing
+ * in loop)
+ */
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_high);
+
+ /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
+ while (clock_cycles_left--) {
+ scl_out(dd, i2c_line_high);
+
+ /* SDA seen high, issue START by dropping it while SCL high */
+ if (sda_in(dd, 0)) {
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_low);
+ /* ATMEL spec says must be followed by STOP. */
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_high);
+ ret = 0;
+ goto bail;
+ }
+
+ scl_out(dd, i2c_line_low);
+ }
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/*
+ * Probe for I2C device at specified address. Returns 0 for "success"
+ * to match rest of this file.
+ * Leave bus in "reasonable" state for further commands.
+ */
+static int i2c_probe(struct ipath_devdata *dd, int devaddr)
+{
+ int ret = 0;
+
+ ret = eeprom_reset(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
+ devaddr);
+ return ret;
+ }
+ /*
+ * Reset no longer leaves bus in start condition, so normal
+ * i2c_startcmd() will do.
+ */
+ ret = i2c_startcmd(dd, devaddr | READ_CMD);
+ if (ret)
+ ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
+ devaddr);
+ else {
+ /*
+ * Device did respond. Complete a single-byte read, because some
+ * devices apparently cannot handle STOP immediately after they
+ * ACK the start-cmd.
+ */
+ int data;
+ data = rd_byte(dd);
+ stop_cmd(dd);
+ ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
+ }
+ return ret;
+}
+
+/*
+ * Returns the "i2c type". This is a pointer to a struct that describes
+ * the I2C chain on this board. To minimize impact on struct ipath_devdata,
+ * the (small integer) index into the table is actually memoized, rather
+ * then the pointer.
+ * Memoization is because the type is determined on the first call per chip.
+ * An alternative would be to move type determination to early
+ * init code.
+ */
+static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
+{
+ int idx;
+
+ /* Get memoized index, from previous successful probes */
+ idx = dd->ipath_i2c_chain_type - 1;
+ if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
+ goto done;
+
+ idx = 0;
+ while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
+ /* if probe succeeds, this is type */
+ if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
+ break;
+ ++idx;
+ }
+
+ /*
+ * Old EEPROM (first entry) may require a reset after probe,
+ * rather than being able to "start" after "stop"
+ */
+ if (idx == 0)
+ eeprom_reset(dd);
+
+ if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
+ idx = -1;
+ else
+ dd->ipath_i2c_chain_type = idx + 1;
+done:
+ return (idx >= 0) ? i2c_chains + idx : NULL;
+}
+
+static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
+ u8 eeprom_offset, void *buffer, int len)
+{
+ int ret;
+ struct i2c_chain_desc *icd;
+ u8 *bp = buffer;
+
+ ret = 1;
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->eeprom_dev == IPATH_NO_DEV) {
+ /* legacy not-really-I2C */
+ ipath_cdbg(VERBOSE, "Start command only address\n");
+ eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+ ret = i2c_startcmd(dd, eeprom_offset);
+ } else {
+ /* Actual I2C */
+ ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
+ if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+ ipath_dbg("Failed EEPROM startcmd\n");
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+ ret = wr_byte(dd, eeprom_offset);
+ stop_cmd(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed to write EEPROM address\n");
+ ret = 1;
+ goto bail;
+ }
+ ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
+ }
+ if (ret) {
+ ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+
+ /*
+ * eeprom keeps clocking data out as long as we ack, automatically
+ * incrementing the address.
+ */
+ while (len-- > 0) {
+ /* get and store data */
+ *bp++ = rd_byte(dd);
+ /* send ack if not the last byte */
+ if (len)
+ send_ack(dd);
+ }
+
+ stop_cmd(dd);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
+ const void *buffer, int len)
+{
+ int sub_len;
+ const u8 *bp = buffer;
+ int max_wait_time, i;
+ int ret;
+ struct i2c_chain_desc *icd;
+
+ ret = 1;
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ while (len > 0) {
+ if (icd->eeprom_dev == IPATH_NO_DEV) {
+ if (i2c_startcmd(dd,
+ (eeprom_offset << 1) | WRITE_CMD)) {
+ ipath_dbg("Failed to start cmd offset %u\n",
+ eeprom_offset);
+ goto failed_write;
+ }
+ } else {
+ /* Real I2C */
+ if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+ ipath_dbg("Failed EEPROM startcmd\n");
+ goto failed_write;
+ }
+ ret = wr_byte(dd, eeprom_offset);
+ if (ret) {
+ ipath_dev_err(dd, "Failed to write EEPROM "
+ "address\n");
+ goto failed_write;
+ }
+ }
+
+ sub_len = min(len, 4);
+ eeprom_offset += sub_len;
+ len -= sub_len;
+
+ for (i = 0; i < sub_len; i++) {
+ if (wr_byte(dd, *bp++)) {
+ ipath_dbg("no ack after byte %u/%u (%u "
+ "total remain)\n", i, sub_len,
+ len + sub_len - i);
+ goto failed_write;
+ }
+ }
+
+ stop_cmd(dd);
+
+ /*
+ * wait for write complete by waiting for a successful
+ * read (the chip replies with a zero after the write
+ * cmd completes, and before it writes to the eeprom.
+ * The startcmd for the read will fail the ack until
+ * the writes have completed. We do this inline to avoid
+ * the debug prints that are in the real read routine
+ * if the startcmd fails.
+ * We also use the proper device address, so it doesn't matter
+ * whether we have real eeprom_dev. legacy likes any address.
+ */
+ max_wait_time = 100;
+ while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
+ stop_cmd(dd);
+ if (!--max_wait_time) {
+ ipath_dbg("Did not get successful read to "
+ "complete write\n");
+ goto failed_write;
+ }
+ }
+ /* now read (and ignore) the resulting byte */
+ rd_byte(dd);
+ stop_cmd(dd);
+ }
+
+ ret = 0;
+ goto bail;
+
+failed_write:
+ stop_cmd(dd);
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+ void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+ const void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ return ret;
+}
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+ u8 *ip = (u8 *) ifp;
+ u8 csum = 0, len;
+
+ /*
+ * Limit length checksummed to max length of actual data.
+ * Checksum of erased eeprom will still be bad, but we avoid
+ * reading past the end of the buffer we were passed.
+ */
+ len = ifp->if_length;
+ if (len > sizeof(struct ipath_flash))
+ len = sizeof(struct ipath_flash);
+ while (len--)
+ csum += *ip++;
+ csum -= ifp->if_csum;
+ csum = ~csum;
+ if (adjust)
+ ifp->if_csum = csum;
+
+ return csum;
+}
+
+/**
+ * ipath_get_guid - get the GUID from the i2c device
+ * @dd: the infinipath device
+ *
+ * We have the capability to use the ipath_nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void ipath_get_eeprom_info(struct ipath_devdata *dd)
+{
+ void *buf;
+ struct ipath_flash *ifp;
+ __be64 guid;
+ int len, eep_stat;
+ u8 csum, *bguid;
+ int t = dd->ipath_unit;
+ struct ipath_devdata *dd0 = ipath_lookup(0);
+
+ if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+ u8 oguid;
+ dd->ipath_guid = dd0->ipath_guid;
+ bguid = (u8 *) & dd->ipath_guid;
+
+ oguid = bguid[7];
+ bguid[7] += t;
+ if (oguid > bguid[7]) {
+ if (bguid[6] == 0xff) {
+ if (bguid[5] == 0xff) {
+ ipath_dev_err(
+ dd,
+ "Can't set %s GUID from "
+ "base, wraps to OUI!\n",
+ ipath_get_unit_name(t));
+ dd->ipath_guid = 0;
+ goto bail;
+ }
+ bguid[5]++;
+ }
+ bguid[6]++;
+ }
+ dd->ipath_nguid = 1;
+
+ ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+ "for %llx\n",
+ dd0->ipath_nguid, t,
+ (unsigned long long) be64_to_cpu(dd->ipath_guid));
+ goto bail;
+ }
+
+ /*
+ * read full flash, not just currently used part, since it may have
+ * been written with a newer definition
+ * */
+ len = sizeof(struct ipath_flash);
+ buf = vmalloc(len);
+ if (!buf) {
+ ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+ "bytes from eeprom for GUID\n", len);
+ goto bail;
+ }
+
+ mutex_lock(&dd->ipath_eep_lock);
+ eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+
+ if (eep_stat) {
+ ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+ goto done;
+ }
+ ifp = (struct ipath_flash *)buf;
+
+ csum = flash_csum(ifp, 0);
+ if (csum != ifp->if_csum) {
+ dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+ "0x%x, not 0x%x\n", csum, ifp->if_csum);
+ goto done;
+ }
+ if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+ *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+ ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+ "ignoring\n",
+ *(unsigned long long *) ifp->if_guid);
+ /* don't allow GUID if all 0 or all 1's */
+ goto done;
+ }
+
+ /* complain, but allow it */
+ if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+ dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+ "default, probably not correct!\n",
+ *(unsigned long long *) ifp->if_guid);
+
+ bguid = ifp->if_guid;
+ if (!bguid[0] && !bguid[1] && !bguid[2]) {
+ /* original incorrect GUID format in flash; fix in
+ * core copy, by shifting up 2 octets; don't need to
+ * change top octet, since both it and shifted are
+ * 0.. */
+ bguid[1] = bguid[3];
+ bguid[2] = bguid[4];
+ bguid[3] = bguid[4] = 0;
+ guid = *(__be64 *) ifp->if_guid;
+ ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+ "shifting 2 octets\n");
+ } else
+ guid = *(__be64 *) ifp->if_guid;
+ dd->ipath_guid = guid;
+ dd->ipath_nguid = ifp->if_numguid;
+ /*
+ * Things are slightly complicated by the desire to transparently
+ * support both the Pathscale 10-digit serial number and the QLogic
+ * 13-character version.
+ */
+ if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
+ && ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
+ /* This board has a Serial-prefix, which is stored
+ * elsewhere for backward-compatibility.
+ */
+ char *snp = dd->ipath_serial;
+ memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+ snp[sizeof ifp->if_sprefix] = '\0';
+ len = strlen(snp);
+ snp += len;
+ len = (sizeof dd->ipath_serial) - len;
+ if (len > sizeof ifp->if_serial) {
+ len = sizeof ifp->if_serial;
+ }
+ memcpy(snp, ifp->if_serial, len);
+ } else
+ memcpy(dd->ipath_serial, ifp->if_serial,
+ sizeof ifp->if_serial);
+ if (!strstr(ifp->if_comment, "Tested successfully"))
+ ipath_dev_err(dd, "Board SN %s did not pass functional "
+ "test: %s\n", dd->ipath_serial,
+ ifp->if_comment);
+
+ ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+ (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+ memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
+ /*
+ * Power-on (actually "active") hours are kept as little-endian value
+ * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+ * atomic_t while running.
+ */
+ atomic_set(&dd->ipath_active_time, 0);
+ dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+ vfree(buf);
+
+bail:;
+}
+
+/**
+ * ipath_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the infinipath device
+ *
+ * Although the time is kept as seconds in the ipath_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct ipath_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+
+int ipath_update_eeprom_log(struct ipath_devdata *dd)
+{
+ void *buf;
+ struct ipath_flash *ifp;
+ int len, hi_water;
+ uint32_t new_time, new_hrs;
+ u8 csum;
+ int ret, idx;
+ unsigned long flags;
+
+ /* first, check if we actually need to do anything. */
+ ret = 0;
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ if (dd->ipath_eep_st_new_errs[idx]) {
+ ret = 1;
+ break;
+ }
+ }
+ new_time = atomic_read(&dd->ipath_active_time);
+
+ if (ret == 0 && new_time < 3600)
+ return 0;
+
+ /*
+ * The quick-check above determined that there is something worthy
+ * of logging, so get current contents and do a more detailed idea.
+ * read full flash, not just currently used part, since it may have
+ * been written with a newer definition
+ */
+ len = sizeof(struct ipath_flash);
+ buf = vmalloc(len);
+ ret = 1;
+ if (!buf) {
+ ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+ "bytes from eeprom for logging\n", len);
+ goto bail;
+ }
+
+ /* Grab semaphore and read current EEPROM. If we get an
+ * error, let go, but if not, keep it until we finish write.
+ */
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (ret) {
+ ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+ goto free_bail;
+ }
+ ret = ipath_eeprom_internal_read(dd, 0, buf, len);
+ if (ret) {
+ mutex_unlock(&dd->ipath_eep_lock);
+ ipath_dev_err(dd, "Unable read EEPROM for logging\n");
+ goto free_bail;
+ }
+ ifp = (struct ipath_flash *)buf;
+
+ csum = flash_csum(ifp, 0);
+ if (csum != ifp->if_csum) {
+ mutex_unlock(&dd->ipath_eep_lock);
+ ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+ csum, ifp->if_csum);
+ ret = 1;
+ goto free_bail;
+ }
+ hi_water = 0;
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ int new_val = dd->ipath_eep_st_new_errs[idx];
+ if (new_val) {
+ /*
+ * If we have seen any errors, add to EEPROM values
+ * We need to saturate at 0xFF (255) and we also
+ * would need to adjust the checksum if we were
+ * trying to minimize EEPROM traffic
+ * Note that we add to actual current count in EEPROM,
+ * in case it was altered while we were running.
+ */
+ new_val += ifp->if_errcntp[idx];
+ if (new_val > 0xFF)
+ new_val = 0xFF;
+ if (ifp->if_errcntp[idx] != new_val) {
+ ifp->if_errcntp[idx] = new_val;
+ hi_water = offsetof(struct ipath_flash,
+ if_errcntp) + idx;
+ }
+ /*
+ * update our shadow (used to minimize EEPROM
+ * traffic), to match what we are about to write.
+ */
+ dd->ipath_eep_st_errs[idx] = new_val;
+ dd->ipath_eep_st_new_errs[idx] = 0;
+ }
+ }
+ /*
+ * now update active-time. We would like to round to the nearest hour
+ * but unless atomic_t are sure to be proper signed ints we cannot,
+ * because we need to account for what we "transfer" to EEPROM and
+ * if we log an hour at 31 minutes, then we would need to set
+ * active_time to -29 to accurately count the _next_ hour.
+ */
+ if (new_time >= 3600) {
+ new_hrs = new_time / 3600;
+ atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
+ new_hrs += dd->ipath_eep_hrs;
+ if (new_hrs > 0xFFFF)
+ new_hrs = 0xFFFF;
+ dd->ipath_eep_hrs = new_hrs;
+ if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+ ifp->if_powerhour[0] = new_hrs & 0xFF;
+ hi_water = offsetof(struct ipath_flash, if_powerhour);
+ }
+ if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+ ifp->if_powerhour[1] = new_hrs >> 8;
+ hi_water = offsetof(struct ipath_flash, if_powerhour)
+ + 1;
+ }
+ }
+ /*
+ * There is a tiny possibility that we could somehow fail to write
+ * the EEPROM after updating our shadows, but problems from holding
+ * the spinlock too long are a much bigger issue.
+ */
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+ if (hi_water) {
+ /* we made some change to the data, uopdate cksum and write */
+ csum = flash_csum(ifp, 1);
+ ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
+ }
+ mutex_unlock(&dd->ipath_eep_lock);
+ if (ret)
+ ipath_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+ vfree(buf);
+bail:
+ return ret;
+
+}
+
+/**
+ * ipath_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the infinipath device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
+{
+ uint new_val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
+ if (new_val > 255)
+ new_val = 255;
+ dd->ipath_eep_st_new_errs[eidx] = new_val;
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+ return;
+}
+
+static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
+{
+ int ret;
+ struct i2c_chain_desc *icd;
+
+ ret = -ENOENT;
+
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->temp_dev == IPATH_NO_DEV) {
+ /* tempsense only exists on new, real-I2C boards */
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+ ipath_dbg("Failed tempsense startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, regnum);
+ stop_cmd(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
+ regnum);
+ ret = -ENXIO;
+ goto bail;
+ }
+ if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
+ ipath_dbg("Failed tempsense RD startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ /*
+ * We can only clock out one byte per command, sensibly
+ */
+ ret = rd_byte(dd);
+ stop_cmd(dd);
+
+bail:
+ return ret;
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+
+/**
+ * ipath_tempsense_read - read register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
+{
+ int ret;
+
+ if (regnum > 7)
+ return -EINVAL;
+
+ /* return a bogus value for (the one) register we do not have */
+ if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
+ return 0;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_tempsense_internal_read(dd, regnum);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ /*
+ * There are three possibilities here:
+ * ret is actual value (0..255)
+ * ret is -ENXIO or -EINVAL from code in this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+ return ret;
+}
+
+static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
+ u8 regnum, u8 data)
+{
+ int ret = -ENOENT;
+ struct i2c_chain_desc *icd;
+
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->temp_dev == IPATH_NO_DEV) {
+ /* tempsense only exists on new, real-I2C boards */
+ ret = -ENXIO;
+ goto bail;
+ }
+ if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+ ipath_dbg("Failed tempsense startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, regnum);
+ if (ret) {
+ stop_cmd(dd);
+ ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
+ regnum);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, data);
+ stop_cmd(dd);
+ ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
+ if (ret) {
+ ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
+ regnum);
+ ret = -ENXIO;
+ }
+
+bail:
+ return ret;
+}
+
+#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
+
+/**
+ * ipath_tempsense_write - write register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to write
+ * @data: data to write
+ *
+ * returns 0 for success or < 0 for error
+ */
+int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
+{
+ int ret;
+
+ if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_tempsense_internal_write(dd, regnum, data);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ /*
+ * There are three possibilities here:
+ * ret is 0 for success
+ * ret is -ENXIO or -EINVAL from code in this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c
new file mode 100644
index 000000000000..450d15965005
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_file_ops.c
@@ -0,0 +1,2620 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+#include "ipath_user_sdma.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+ loff_t *);
+static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+/*
+ * This is really, really weird shit - write() and writev() here
+ * have completely unrelated semantics. Sucky userland ABI,
+ * film at 11.
+ */
+static const struct file_operations ipath_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_write,
+ .write_iter = ipath_write_iter,
+ .open = ipath_open,
+ .release = ipath_close,
+ .poll = ipath_poll,
+ .mmap = ipath_mmap,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+ struct page *page;
+ u64 paddr = 0;
+
+ page = vmalloc_to_page(p);
+ if (page)
+ paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+static int ipath_get_base_info(struct file *fp,
+ void __user *ubase, size_t ubase_size)
+{
+ struct ipath_portdata *pd = port_fp(fp);
+ int ret = 0;
+ struct ipath_base_info *kinfo = NULL;
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned subport_cnt;
+ int shared, master;
+ size_t sz;
+
+ subport_cnt = pd->port_subport_cnt;
+ if (!subport_cnt) {
+ shared = 0;
+ master = 0;
+ subport_cnt = 1;
+ } else {
+ shared = 1;
+ master = !subport_fp(fp);
+ }
+
+ sz = sizeof(*kinfo);
+ /* If port sharing is not requested, allow the old size structure */
+ if (!shared)
+ sz -= 7 * sizeof(u64);
+ if (ubase_size < sz) {
+ ipath_cdbg(PROC,
+ "Base size %zu, need %zu (version mismatch?)\n",
+ ubase_size, sz);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+ if (kinfo == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ret = dd->ipath_f_get_base_info(pd, kinfo);
+ if (ret < 0)
+ goto bail;
+
+ kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+ kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+ kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+ kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+ /*
+ * have to mmap whole thing
+ */
+ kinfo->spi_rcv_egrbuftotlen =
+ pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+ kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+ kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+ pd->port_rcvegrbuf_chunks;
+ kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+ if (master)
+ kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
+ /*
+ * for this use, may be ipath_cfgports summed over all chips that
+ * are are configured and present
+ */
+ kinfo->spi_nports = dd->ipath_cfgports;
+ /* unit (chip/board) our port is on */
+ kinfo->spi_unit = dd->ipath_unit;
+ /* for now, only a single page */
+ kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+ /*
+ * Doing this per port, and based on the skip value, etc. This has
+ * to be the actual buffer size, since the protocol code treats it
+ * as an array.
+ *
+ * These have to be set to user addresses in the user code via mmap.
+ * These values are used on return to user code for the mmap target
+ * addresses only. For 32 bit, same 44 bit address problem, so use
+ * the physical address, not virtual. Before 2.6.11, using the
+ * page_address() macro worked, but in 2.6.11, even that returns the
+ * full 64 bit address (upper bits all 1's). So far, using the
+ * physical addresses (or chip offsets, for chip mapping) works, but
+ * no doubt some future kernel release will change that, and we'll be
+ * on to yet another method of dealing with this.
+ */
+ kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+ kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
+ kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+ kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
+ kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+ (void *) dd->ipath_statusp -
+ (void *) dd->ipath_pioavailregs_dma;
+ if (!shared) {
+ kinfo->spi_piocnt = pd->port_piocnt;
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+ kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+ dd->ipath_ureg_align * pd->port_port;
+ } else if (master) {
+ kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+ (pd->port_piocnt % subport_cnt);
+ /* Master's PIO buffers are after all the slave's */
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+ dd->ipath_palign *
+ (pd->port_piocnt - kinfo->spi_piocnt);
+ } else {
+ unsigned slave = subport_fp(fp) - 1;
+
+ kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+ dd->ipath_palign * kinfo->spi_piocnt * slave;
+ }
+
+ if (shared) {
+ kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
+ dd->ipath_ureg_align * pd->port_port;
+ kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
+ kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
+ kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
+
+ kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
+ PAGE_SIZE * subport_fp(fp));
+
+ kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport_fp(fp));
+ kinfo->spi_rcvhdr_tailaddr = 0;
+ kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
+ pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
+ subport_fp(fp));
+
+ kinfo->spi_subport_uregbase =
+ cvt_kvaddr(pd->subport_uregbase);
+ kinfo->spi_subport_rcvegrbuf =
+ cvt_kvaddr(pd->subport_rcvegrbuf);
+ kinfo->spi_subport_rcvhdr_base =
+ cvt_kvaddr(pd->subport_rcvhdr_base);
+ ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+ kinfo->spi_port, kinfo->spi_runtime_flags,
+ (unsigned long long) kinfo->spi_subport_uregbase,
+ (unsigned long long) kinfo->spi_subport_rcvegrbuf,
+ (unsigned long long) kinfo->spi_subport_rcvhdr_base);
+ }
+
+ /*
+ * All user buffers are 2KB buffers. If we ever support
+ * giving 4KB buffers to user processes, this will need some
+ * work.
+ */
+ kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+ (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
+ kinfo->spi_pioalign = dd->ipath_palign;
+
+ kinfo->spi_qpair = IPATH_KD_QP;
+ /*
+ * user mode PIO buffers are always 2KB, even when 4KB can
+ * be received, and sent via the kernel; this is ibmaxlen
+ * for 2K MTU.
+ */
+ kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
+ kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */
+ kinfo->spi_port = pd->port_port;
+ kinfo->spi_subport = subport_fp(fp);
+ kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
+ kinfo->spi_hw_version = dd->ipath_revision;
+
+ if (master) {
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+ }
+
+ sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+ if (copy_to_user(ubase, kinfo, sz))
+ ret = -EFAULT;
+
+bail:
+ kfree(kinfo);
+ return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @fp: the ipath device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller. To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
+ const struct ipath_tid_info *ti)
+{
+ int ret = 0, ntids;
+ u32 tid, porttid, cnt, i, tidcnt, tidoff;
+ u16 *tidlist;
+ struct ipath_devdata *dd = pd->port_dd;
+ u64 physaddr;
+ unsigned long vaddr;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+ struct page **pagep = NULL;
+ unsigned subport = subport_fp(fp);
+
+ if (!dd->ipath_pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cnt = ti->tidcnt;
+ if (!cnt) {
+ ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+ (unsigned long long) ti->tidlist);
+ /*
+ * Should we treat as success? likely a bug
+ */
+ ret = -EFAULT;
+ goto done;
+ }
+ porttid = pd->port_port * dd->ipath_rcvtidcnt;
+ if (!pd->port_subport_cnt) {
+ tidcnt = dd->ipath_rcvtidcnt;
+ tid = pd->port_tidcursor;
+ tidoff = 0;
+ } else if (!subport) {
+ tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+ (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+ tidoff = dd->ipath_rcvtidcnt - tidcnt;
+ porttid += tidoff;
+ tid = tidcursor_fp(fp);
+ } else {
+ tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+ tidoff = tidcnt * (subport - 1);
+ porttid += tidoff;
+ tid = tidcursor_fp(fp);
+ }
+ if (cnt > tidcnt) {
+ /* make sure it all fits in port_tid_pg_list */
+ dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+ "TIDs, only trying max (%u)\n", cnt, tidcnt);
+ cnt = tidcnt;
+ }
+ pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+ tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
+
+ memset(tidmap, 0, sizeof(tidmap));
+ /* before decrement; chip actual # */
+ ntids = tidcnt;
+ tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ porttid * sizeof(*tidbase));
+
+ ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+ pd->port_port, cnt, tid, tidbase);
+
+ /* virtual address of first page in transfer */
+ vaddr = ti->tidvaddr;
+ if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+ cnt * PAGE_SIZE)) {
+ ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+ (void *)vaddr, cnt);
+ ret = -EFAULT;
+ goto done;
+ }
+ ret = ipath_get_user_pages(vaddr, cnt, pagep);
+ if (ret) {
+ if (ret == -EBUSY) {
+ ipath_dbg("Failed to lock addr %p, %u pages "
+ "(already locked)\n",
+ (void *) vaddr, cnt);
+ /*
+ * for now, continue, and see what happens but with
+ * the new implementation, this should never happen,
+ * unless perhaps the user has mpin'ed the pages
+ * themselves (something we need to test)
+ */
+ ret = 0;
+ } else {
+ dev_info(&dd->pcidev->dev,
+ "Failed to lock addr %p, %u pages: "
+ "errno %d\n", (void *) vaddr, cnt, -ret);
+ goto done;
+ }
+ }
+ for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+ for (; ntids--; tid++) {
+ if (tid == tidcnt)
+ tid = 0;
+ if (!dd->ipath_pageshadow[porttid + tid])
+ break;
+ }
+ if (ntids < 0) {
+ /*
+ * oops, wrapped all the way through their TIDs,
+ * and didn't have enough free; see comments at
+ * start of routine
+ */
+ ipath_dbg("Not enough free TIDs for %u pages "
+ "(index %d), failing\n", cnt, i);
+ i--; /* last tidlist[i] not filled in */
+ ret = -ENOMEM;
+ break;
+ }
+ tidlist[i] = tid + tidoff;
+ ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+ "vaddr %lx\n", i, tid + tidoff, vaddr);
+ /* we "know" system pages and TID pages are same size */
+ dd->ipath_pageshadow[porttid + tid] = pagep[i];
+ dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+ dd->pcidev, pagep[i], 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ /*
+ * don't need atomic or it's overhead
+ */
+ __set_bit(tid, tidmap);
+ physaddr = dd->ipath_physshadow[porttid + tid];
+ ipath_stats.sps_pagelocks++;
+ ipath_cdbg(VERBOSE,
+ "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+ tid, vaddr, (unsigned long long) physaddr,
+ pagep[i]);
+ dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
+ physaddr);
+ /*
+ * don't check this tid in ipath_portshadow, since we
+ * just filled it in; start with the next one.
+ */
+ tid++;
+ }
+
+ if (ret) {
+ u32 limit;
+ cleanup:
+ /* jump here if copy out of updated info failed... */
+ ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+ -ret, i, cnt);
+ /* same code that's in ipath_free_tid() */
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit((const unsigned long *)tidmap, limit);
+ for (; tid < limit; tid++) {
+ if (!test_bit(tid, tidmap))
+ continue;
+ if (dd->ipath_pageshadow[porttid + tid]) {
+ ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+ tid);
+ dd->ipath_f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+ pci_unmap_page(dd->pcidev,
+ dd->ipath_physshadow[porttid + tid],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ dd->ipath_pageshadow[porttid + tid] = NULL;
+ ipath_stats.sps_pageunlocks++;
+ }
+ }
+ ipath_release_user_pages(pagep, cnt);
+ } else {
+ /*
+ * Copy the updated array, with ipath_tid's filled in, back
+ * to user. Since we did the copy in already, this "should
+ * never fail" If it does, we have to clean up...
+ */
+ if (copy_to_user((void __user *)
+ (unsigned long) ti->tidlist,
+ tidlist, cnt * sizeof(*tidlist))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+ tidmap, sizeof tidmap)) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (tid == tidcnt)
+ tid = 0;
+ if (!pd->port_subport_cnt)
+ pd->port_tidcursor = tid;
+ else
+ tidcursor_fp(fp) = tid;
+ }
+
+done:
+ if (ret)
+ ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+ ti->tidcnt, -ret);
+ return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @subport: the subport
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance. We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted. We do check that the TID was in use, for sanity
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
+ const struct ipath_tid_info *ti)
+{
+ int ret = 0;
+ u32 tid, porttid, cnt, limit, tidcnt;
+ struct ipath_devdata *dd = pd->port_dd;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+
+ if (!dd->ipath_pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+ sizeof tidmap)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ porttid = pd->port_port * dd->ipath_rcvtidcnt;
+ if (!pd->port_subport_cnt)
+ tidcnt = dd->ipath_rcvtidcnt;
+ else if (!subport) {
+ tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+ (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+ porttid += dd->ipath_rcvtidcnt - tidcnt;
+ } else {
+ tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+ porttid += tidcnt * (subport - 1);
+ }
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ porttid * sizeof(*tidbase));
+
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit(tidmap, limit);
+ ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+ "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+ limit, tid, porttid);
+ for (cnt = 0; tid < limit; tid++) {
+ /*
+ * small optimization; if we detect a run of 3 or so without
+ * any set, use find_first_bit again. That's mainly to
+ * accelerate the case where we wrapped, so we have some at
+ * the beginning, and some at the end, and a big gap
+ * in the middle.
+ */
+ if (!test_bit(tid, tidmap))
+ continue;
+ cnt++;
+ if (dd->ipath_pageshadow[porttid + tid]) {
+ struct page *p;
+ p = dd->ipath_pageshadow[porttid + tid];
+ dd->ipath_pageshadow[porttid + tid] = NULL;
+ ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+ pid_nr(pd->port_pid), tid);
+ dd->ipath_f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+ pci_unmap_page(dd->pcidev,
+ dd->ipath_physshadow[porttid + tid],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages(&p, 1);
+ ipath_stats.sps_pageunlocks++;
+ } else
+ ipath_dbg("Unused tid %u, ignoring\n", tid);
+ }
+ if (cnt != ti->tidcnt)
+ ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+ ti->tidcnt, cnt);
+done:
+ if (ret)
+ ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+ ti->tidcnt, -ret);
+ return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed). This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit. All 4
+ * partition keys are packed into a single infinipath register. It's an
+ * error for a process to set the same pkey multiple times. We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that. I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available. This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ int i, any = 0, pidx = -1;
+ u16 lkey = key & 0x7FFF;
+ int ret;
+
+ if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
+ /* nothing to do; this key always valid */
+ ret = 0;
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+ "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+ pd->port_port, key, dd->ipath_pkeys[0],
+ atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+ atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+ atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+ atomic_read(&dd->ipath_pkeyrefs[3]));
+
+ if (!lkey) {
+ ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+ pd->port_port);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Set the full membership bit, because it has to be
+ * set in the register or the packet, and it seems
+ * cleaner to set in the register than to force all
+ * callers to set it. (see bug 4331)
+ */
+ key |= 0x8000;
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ if (!pd->port_pkeys[i] && pidx == -1)
+ pidx = i;
+ if (pd->port_pkeys[i] == key) {
+ ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+ "(%x) more than once\n",
+ pd->port_port, key);
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (pidx == -1) {
+ ipath_dbg("All pkeys for port %u already in use, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i]) {
+ any++;
+ continue;
+ }
+ if (dd->ipath_pkeys[i] == key) {
+ atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+ if (atomic_inc_return(pkrefs) > 1) {
+ pd->port_pkeys[pidx] = key;
+ ipath_cdbg(VERBOSE, "p%u set key %x "
+ "matches #%d, count now %d\n",
+ pd->port_port, key, i,
+ atomic_read(pkrefs));
+ ret = 0;
+ goto bail;
+ } else {
+ /*
+ * lost race, decrement count, catch below
+ */
+ atomic_dec(pkrefs);
+ ipath_cdbg(VERBOSE, "Lost race, count was "
+ "0, after dec, it's %d\n",
+ atomic_read(pkrefs));
+ any++;
+ }
+ }
+ if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+ /*
+ * It makes no sense to have both the limited and
+ * full membership PKEY set at the same time since
+ * the unlimited one will disable the limited one.
+ */
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ipath_dbg("port %u, all pkeys already in use, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i] &&
+ atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+ u64 pkey;
+
+ /* for ipathstats, etc. */
+ ipath_stats.sps_pkeys[i] = lkey;
+ pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+ pkey =
+ (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(PROC, "p%u set key %x in #%d, "
+ "portidx %d, new pkey reg %llx\n",
+ pd->port_port, key, i, pidx,
+ (unsigned long long) pkey);
+ ipath_write_kreg(
+ dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+ ret = 0;
+ goto bail;
+ }
+ }
+ ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @subport: the subport
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions. start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+ int start_stop)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+
+ ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
+ start_stop ? "en" : "dis", dd->ipath_unit,
+ pd->port_port, subport);
+ if (subport)
+ goto bail;
+ /* atomically clear receive enable port. */
+ if (start_stop) {
+ /*
+ * On enable, force in-memory copy of the tail register to
+ * 0, so that protocol code doesn't have to worry about
+ * whether or not the chip has yet updated the in-memory
+ * copy or not on return from the system call. The chip
+ * always resets it's tail register back to 0 on a
+ * transition from disabled to enabled. This could cause a
+ * problem if software was broken, and did the enable w/o
+ * the disable, but eventually the in-memory copy will be
+ * updated and correct itself, even in the face of software
+ * bugs.
+ */
+ if (pd->port_rcvhdrtail_kvaddr)
+ ipath_clear_rcvhdrtail(pd);
+ set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ } else
+ clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* now be sure chip saw it before we return */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ if (start_stop) {
+ /*
+ * And try to be sure that tail reg update has happened too.
+ * This should in theory interlock with the RXE changes to
+ * the tail register. Don't assign it to the tail register
+ * in memory copy, since we could overwrite an update by the
+ * chip if we did.
+ */
+ ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+ }
+ /* always; new head should be equal to new tail; see above */
+bail:
+ return 0;
+}
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+ struct ipath_devdata *dd)
+{
+ int i, j, pchanged = 0;
+ u64 oldpkey;
+
+ /* for debugging only */
+ oldpkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ if (!pd->port_pkeys[i])
+ continue;
+ ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+ pd->port_pkeys[i]);
+ for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+ /* check for match independent of the global bit */
+ if ((dd->ipath_pkeys[j] & 0x7fff) !=
+ (pd->port_pkeys[i] & 0x7fff))
+ continue;
+ if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+ ipath_cdbg(VERBOSE, "p%u clear key "
+ "%x matches #%d\n",
+ pd->port_port,
+ pd->port_pkeys[i], j);
+ ipath_stats.sps_pkeys[j] =
+ dd->ipath_pkeys[j] = 0;
+ pchanged++;
+ }
+ else ipath_cdbg(
+ VERBOSE, "p%u key %x matches #%d, "
+ "but ref still %d\n", pd->port_port,
+ pd->port_pkeys[i], j,
+ atomic_read(&dd->ipath_pkeyrefs[j]));
+ break;
+ }
+ pd->port_pkeys[i] = 0;
+ }
+ if (pchanged) {
+ u64 pkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+ "new pkey reg %llx\n", pd->port_port,
+ (unsigned long long) oldpkey,
+ (unsigned long long) pkey);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+ pkey);
+ }
+}
+
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned egrperchunk, egrcnt, size;
+
+ /*
+ * to avoid wasting a lot of memory, we allocate 32KB chunks of
+ * physically contiguous memory, advance through it until used up
+ * and then allocate more. Of course, we need memory to store those
+ * extra pointers, now. Started out with 256KB, but under heavy
+ * memory pressure (creating large files and then copying them over
+ * NFS while doing lots of MPI jobs), we hit some allocation
+ * failures, even though we can sleep... (2.6.10) Still get
+ * failures at 64K. 32K is the lowest we can go without wasting
+ * additional memory.
+ */
+ size = 0x8000;
+ egrperchunk = size / dd->ipath_rcvegrbufsize;
+ egrcnt = dd->ipath_rcvegrcnt;
+ pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+ pd->port_rcvegrbufs_perchunk = egrperchunk;
+ pd->port_rcvegrbuf_size = size;
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the user port version
+ *
+ * Allocate the eager TID buffers and program them into infinipath
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+ size_t size;
+ int ret;
+ gfp_t gfp_flags;
+
+ /*
+ * GFP_USER, but without GFP_FS, so buffer cache can be
+ * coalesced (we hope); otherwise, even at order 4,
+ * heavy filesystem activity makes these fail, and we can
+ * use compound pages.
+ */
+ gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+ egrcnt = dd->ipath_rcvegrcnt;
+ /* TID number offset for this port */
+ egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
+ egrsize = dd->ipath_rcvegrbufsize;
+ ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+ "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+ chunk = pd->port_rcvegrbuf_chunks;
+ egrperchunk = pd->port_rcvegrbufs_perchunk;
+ size = pd->port_rcvegrbuf_size;
+ pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+ GFP_KERNEL);
+ if (!pd->port_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ pd->port_rcvegrbuf_phys =
+ kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+ GFP_KERNEL);
+ if (!pd->port_rcvegrbuf_phys) {
+ ret = -ENOMEM;
+ goto bail_rcvegrbuf;
+ }
+ for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+
+ pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+ &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+ gfp_flags);
+
+ if (!pd->port_rcvegrbuf[e]) {
+ ret = -ENOMEM;
+ goto bail_rcvegrbuf_phys;
+ }
+ }
+
+ pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
+ for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+ dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+ unsigned i;
+
+ for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+ dd->ipath_f_put_tid(dd, e + egroff +
+ (u64 __iomem *)
+ ((char __iomem *)
+ dd->ipath_kregbase +
+ dd->ipath_rcvegrbase),
+ RCVHQ_RCV_TYPE_EAGER, pa);
+ pa += egrsize;
+ }
+ cond_resched(); /* don't hog the cpu */
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_rcvegrbuf_phys:
+ for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+ pd->port_rcvegrbuf[e]; e++) {
+ dma_free_coherent(&dd->pcidev->dev, size,
+ pd->port_rcvegrbuf[e],
+ pd->port_rcvegrbuf_phys[e]);
+
+ }
+ kfree(pd->port_rcvegrbuf_phys);
+ pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+ kfree(pd->port_rcvegrbuf);
+ pd->port_rcvegrbuf = NULL;
+bail:
+ return ret;
+}
+
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int ipath_mmap_mem(struct vm_area_struct *vma,
+ struct ipath_portdata *pd, unsigned len, int write_ok,
+ void *kvaddr, char *what)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned long pfn;
+ int ret;
+
+ if ((vma->vm_end - vma->vm_start) > len) {
+ dev_info(&dd->pcidev->dev,
+ "FAIL on %s: len %lx > %x\n", what,
+ vma->vm_end - vma->vm_start, len);
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ if (!write_ok) {
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev,
+ "%s must be mapped readonly\n", what);
+ ret = -EPERM;
+ goto bail;
+ }
+
+ /* don't allow them to later change with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+
+ pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, vma->vm_start, pfn,
+ len, vma->vm_page_prot);
+ if (ret)
+ dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+ "bytes r%c failed: %d\n", what, pd->port_port,
+ pfn, len, write_ok?'w':'o', ret);
+ else
+ ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+ "r%c\n", what, pd->port_port, pfn, len,
+ write_ok?'w':'o');
+bail:
+ return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+ u64 ureg)
+{
+ unsigned long phys;
+ int ret;
+
+ /*
+ * This is real hardware, so use io_remap. This is the mechanism
+ * for the user process to update the head registers for their port
+ * in the chip.
+ */
+ if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+ dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+ "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+ ret = -EFAULT;
+ } else {
+ phys = dd->ipath_physaddr + ureg;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+ }
+ return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+ struct ipath_devdata *dd,
+ struct ipath_portdata *pd,
+ unsigned piobufs, unsigned piocnt)
+{
+ unsigned long phys;
+ int ret;
+
+ /*
+ * When we map the PIO buffers in the chip, we want to map them as
+ * writeonly, no read possible. This prevents access to previous
+ * process data, and catches users who might try to read the i/o
+ * space due to a bug.
+ */
+ if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
+ dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+ "reqlen %lx > PAGE\n",
+ vma->vm_end - vma->vm_start);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ phys = dd->ipath_physaddr + piobufs;
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+ pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+ pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+ /*
+ * don't allow them to later change to readable with mprotect (for when
+ * not initially mapped readable, as is normally the case)
+ */
+ vma->vm_flags &= ~VM_MAYREAD;
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+ ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+bail:
+ return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+ struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned long start, size;
+ size_t total_size, i;
+ unsigned long pfn;
+ int ret;
+
+ size = pd->port_rcvegrbuf_size;
+ total_size = pd->port_rcvegrbuf_chunks * size;
+ if ((vma->vm_end - vma->vm_start) > total_size) {
+ dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+ "reqlen %lx > actual %lx\n",
+ vma->vm_end - vma->vm_start,
+ (unsigned long) total_size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+ "writable (flags=%lx)\n", vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /* don't allow them to later change to writeable with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+
+ start = vma->vm_start;
+
+ for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+ pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, start, pfn, size,
+ vma->vm_page_prot);
+ if (ret < 0)
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/*
+ * ipath_file_vma_fault - handle a VMA page fault.
+ */
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct page *page;
+
+ page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct ipath_file_vm_ops = {
+ .fault = ipath_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+ struct ipath_portdata *pd, unsigned subport)
+{
+ unsigned long len;
+ struct ipath_devdata *dd;
+ void *addr;
+ size_t size;
+ int ret = 0;
+
+ /* If the port is not shared, all addresses should be physical */
+ if (!pd->port_subport_cnt)
+ goto bail;
+
+ dd = pd->port_dd;
+ size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+ /*
+ * Each process has all the subport uregbase, rcvhdrq, and
+ * rcvegrbufs mmapped - as an array for all the processes,
+ * and also separately for this process.
+ */
+ if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
+ addr = pd->subport_uregbase;
+ size = PAGE_SIZE * pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
+ addr = pd->subport_rcvhdr_base;
+ size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
+ addr = pd->subport_rcvegrbuf;
+ size *= pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
+ PAGE_SIZE * subport)) {
+ addr = pd->subport_uregbase + PAGE_SIZE * subport;
+ size = PAGE_SIZE;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport)) {
+ addr = pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport;
+ size = pd->port_rcvhdrq_size;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
+ size * subport)) {
+ addr = pd->subport_rcvegrbuf + size * subport;
+ /* rcvegrbufs are read-only on the slave */
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev,
+ "Can't map eager buffers as "
+ "writable (flags=%lx)\n", vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /*
+ * Don't allow permission to later change to writeable
+ * with mprotect.
+ */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ } else {
+ goto bail;
+ }
+ len = vma->vm_end - vma->vm_start;
+ if (len > size) {
+ ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+ vma->vm_ops = &ipath_file_vm_ops;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip. We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct ipath_portdata *pd;
+ struct ipath_devdata *dd;
+ u64 pgaddr, ureg;
+ unsigned piobufs, piocnt;
+ int ret;
+
+ pd = port_fp(fp);
+ if (!pd) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd = pd->port_dd;
+
+ /*
+ * This is the ipath_do_user_init() code, mapping the shared buffers
+ * into the user process. The address referred to by vm_pgoff is the
+ * file offset passed via mmap(). For shared ports, this is the
+ * kernel vmalloc() address of the pages to share with the master.
+ * For non-shared or master ports, this is a physical address.
+ * We only do one mmap for each space mapped.
+ */
+ pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+ /*
+ * Check for 0 in case one of the allocations failed, but user
+ * called mmap anyway.
+ */
+ if (!pgaddr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
+ (unsigned long long) pgaddr, vma->vm_start,
+ vma->vm_end - vma->vm_start, dd->ipath_unit,
+ pd->port_port, subport_fp(fp));
+
+ /*
+ * Physical addresses must fit in 40 bits for our hardware.
+ * Check for kernel virtual addresses first, anything else must
+ * match a HW or memory address.
+ */
+ ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto bail;
+ }
+
+ ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
+ if (!pd->port_subport_cnt) {
+ /* port is not shared */
+ piocnt = pd->port_piocnt;
+ piobufs = pd->port_piobufs;
+ } else if (!subport_fp(fp)) {
+ /* caller is the master */
+ piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+ (pd->port_piocnt % pd->port_subport_cnt);
+ piobufs = pd->port_piobufs +
+ dd->ipath_palign * (pd->port_piocnt - piocnt);
+ } else {
+ unsigned slave = subport_fp(fp) - 1;
+
+ /* caller is a slave */
+ piocnt = pd->port_piocnt / pd->port_subport_cnt;
+ piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+ }
+
+ if (pgaddr == ureg)
+ ret = mmap_ureg(vma, dd, ureg);
+ else if (pgaddr == piobufs)
+ ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+ else if (pgaddr == dd->ipath_pioavailregs_phys)
+ /* in-memory copy of pioavail registers */
+ ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+ (void *) dd->ipath_pioavailregs_dma,
+ "pioavail registers");
+ else if (pgaddr == pd->port_rcvegr_phys)
+ ret = mmap_rcvegrbufs(vma, pd);
+ else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+ /*
+ * The rcvhdrq itself; readonly except on HT (so have
+ * to allow writable mapping), multiple pages, contiguous
+ * from an i/o perspective.
+ */
+ ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+ pd->port_rcvhdrq,
+ "rcvhdrq");
+ else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
+ /* in-memory copy of rcvhdrq tail register */
+ ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+ pd->port_rcvhdrtail_kvaddr,
+ "rcvhdrq tail");
+ else
+ ret = -EINVAL;
+
+ vma->vm_private_data = NULL;
+
+ if (ret < 0)
+ dev_info(&dd->pcidev->dev,
+ "Failure %d on off %llx len %lx\n",
+ -ret, (unsigned long long)pgaddr,
+ vma->vm_end - vma->vm_start);
+bail:
+ return ret;
+}
+
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+ unsigned pollflag = 0;
+
+ if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
+ pd->port_hdrqfull != pd->port_hdrqfull_poll) {
+ pollflag |= POLLIN | POLLRDNORM;
+ pd->port_hdrqfull_poll = pd->port_hdrqfull;
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ unsigned pollflag = 0;
+ struct ipath_devdata *dd;
+
+ dd = pd->port_dd;
+
+ /* variable access in ipath_poll_hdrqfull() needs this */
+ rmb();
+ pollflag = ipath_poll_hdrqfull(pd);
+
+ if (pd->port_urgent != pd->port_urgent_poll) {
+ pollflag |= POLLIN | POLLRDNORM;
+ pd->port_urgent_poll = pd->port_urgent;
+ }
+
+ if (!pollflag) {
+ /* this saves a spin_lock/unlock in interrupt handler... */
+ set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
+ /* flush waiting flag so don't miss an event... */
+ wmb();
+ poll_wait(fp, &pd->port_wait, pt);
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll_next(struct ipath_portdata *pd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ u32 head;
+ u32 tail;
+ unsigned pollflag = 0;
+ struct ipath_devdata *dd;
+
+ dd = pd->port_dd;
+
+ /* variable access in ipath_poll_hdrqfull() needs this */
+ rmb();
+ pollflag = ipath_poll_hdrqfull(pd);
+
+ head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+ if (pd->port_rcvhdrtail_kvaddr)
+ tail = ipath_get_rcvhdrtail(pd);
+ else
+ tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+ if (head != tail)
+ pollflag |= POLLIN | POLLRDNORM;
+ else {
+ /* this saves a spin_lock/unlock in interrupt handler */
+ set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+ /* flush waiting flag so we don't miss an event */
+ wmb();
+
+ set_bit(pd->port_port + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+ ipath_write_ureg(dd, ur_rcvhdrhead,
+ dd->ipath_rhdrhead_intr_off | head,
+ pd->port_port);
+
+ poll_wait(fp, &pd->port_wait, pt);
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct ipath_portdata *pd;
+ unsigned pollflag;
+
+ pd = port_fp(fp);
+ if (!pd)
+ pollflag = 0;
+ else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
+ pollflag = ipath_poll_urgent(pd, fp, pt);
+ else
+ pollflag = ipath_poll_next(pd, fp, pt);
+
+ return pollflag;
+}
+
+static int ipath_supports_subports(int user_swmajor, int user_swminor)
+{
+ /* no subport implementation prior to software version 1.3 */
+ return (user_swmajor > 1) || (user_swminor >= 3);
+}
+
+static int ipath_compatible_subports(int user_swmajor, int user_swminor)
+{
+ /* this code is written long-hand for clarity */
+ if (IPATH_USER_SWMAJOR != user_swmajor) {
+ /* no promise of compatibility if major mismatch */
+ return 0;
+ }
+ if (IPATH_USER_SWMAJOR == 1) {
+ switch (IPATH_USER_SWMINOR) {
+ case 0:
+ case 1:
+ case 2:
+ /* no subport implementation so cannot be compatible */
+ return 0;
+ case 3:
+ /* 3 is only compatible with itself */
+ return user_swminor == 3;
+ default:
+ /* >= 4 are compatible (or are expected to be) */
+ return user_swminor >= 4;
+ }
+ }
+ /* make no promises yet for future major versions */
+ return 0;
+}
+
+static int init_subports(struct ipath_devdata *dd,
+ struct ipath_portdata *pd,
+ const struct ipath_user_info *uinfo)
+{
+ int ret = 0;
+ unsigned num_subports;
+ size_t size;
+
+ /*
+ * If the user is requesting zero subports,
+ * skip the subport allocation.
+ */
+ if (uinfo->spu_subport_cnt <= 0)
+ goto bail;
+
+ /* Self-consistency check for ipath_compatible_subports() */
+ if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
+ !ipath_compatible_subports(IPATH_USER_SWMAJOR,
+ IPATH_USER_SWMINOR)) {
+ dev_info(&dd->pcidev->dev,
+ "Inconsistent ipath_compatible_subports()\n");
+ goto bail;
+ }
+
+ /* Check for subport compatibility */
+ if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
+ uinfo->spu_userversion & 0xffff)) {
+ dev_info(&dd->pcidev->dev,
+ "Mismatched user version (%d.%d) and driver "
+ "version (%d.%d) while port sharing. Ensure "
+ "that driver and library are from the same "
+ "release.\n",
+ (int) (uinfo->spu_userversion >> 16),
+ (int) (uinfo->spu_userversion & 0xffff),
+ IPATH_USER_SWMAJOR,
+ IPATH_USER_SWMINOR);
+ goto bail;
+ }
+ if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ num_subports = uinfo->spu_subport_cnt;
+ pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
+ if (!pd->subport_uregbase) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+ size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE) * num_subports;
+ pd->subport_rcvhdr_base = vzalloc(size);
+ if (!pd->subport_rcvhdr_base) {
+ ret = -ENOMEM;
+ goto bail_ureg;
+ }
+
+ pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
+ pd->port_rcvegrbuf_size *
+ num_subports);
+ if (!pd->subport_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail_rhdr;
+ }
+
+ pd->port_subport_cnt = uinfo->spu_subport_cnt;
+ pd->port_subport_id = uinfo->spu_subport_id;
+ pd->active_slaves = 1;
+ set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+ goto bail;
+
+bail_rhdr:
+ vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+ vfree(pd->subport_uregbase);
+ pd->subport_uregbase = NULL;
+bail:
+ return ret;
+}
+
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+ struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ struct ipath_portdata *pd;
+ int ret;
+
+ if (!(pd = dd->ipath_pd[port])) {
+ void *ptmp;
+
+ pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+ /*
+ * Allocate memory for use in ipath_tid_update() just once
+ * at open, not per call. Reduces cost of expected send
+ * setup.
+ */
+ ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+ dd->ipath_rcvtidcnt * sizeof(struct page **),
+ GFP_KERNEL);
+ if (!pd || !ptmp) {
+ ipath_dev_err(dd, "Unable to allocate portdata "
+ "memory, failing open\n");
+ ret = -ENOMEM;
+ kfree(pd);
+ kfree(ptmp);
+ goto bail;
+ }
+ dd->ipath_pd[port] = pd;
+ dd->ipath_pd[port]->port_port = port;
+ dd->ipath_pd[port]->port_dd = dd;
+ dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+ init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+ }
+ if (!pd->port_cnt) {
+ pd->userversion = uinfo->spu_userversion;
+ init_user_egr_sizes(pd);
+ if ((ret = init_subports(dd, pd, uinfo)) != 0)
+ goto bail;
+ ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+ current->comm, current->pid, dd->ipath_unit,
+ port);
+ pd->port_cnt = 1;
+ port_fp(fp) = pd;
+ pd->port_pid = get_pid(task_pid(current));
+ strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
+ ipath_stats.sps_ports++;
+ ret = 0;
+ } else
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+ return dd &&
+ (dd->ipath_flags & IPATH_PRESENT) &&
+ dd->ipath_kregbase &&
+ dd->ipath_lid &&
+ !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+ | IPATH_LINKUNK));
+}
+
+static int find_free_port(int unit, struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ struct ipath_devdata *dd = ipath_lookup(unit);
+ int ret, i;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (!usable(dd)) {
+ ret = -ENETDOWN;
+ goto bail;
+ }
+
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ ret = try_alloc_port(dd, i, fp, uinfo);
+ if (ret != -EBUSY)
+ goto bail;
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+static int find_best_unit(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret = 0, i, prefunit = -1, devmax;
+ int maxofallports, npresent, nup;
+ int ndev;
+
+ devmax = ipath_count_units(&npresent, &nup, &maxofallports);
+
+ /*
+ * This code is present to allow a knowledgeable person to
+ * specify the layout of processes to processors before opening
+ * this driver, and then we'll assign the process to the "closest"
+ * InfiniPath chip to that processor (we assume reasonable connectivity,
+ * for now). This code assumes that if affinity has been set
+ * before this point, that at most one cpu is set; for now this
+ * is reasonable. I check for both cpumask_empty() and cpumask_full(),
+ * in case some kernel variant sets none of the bits when no
+ * affinity is set. 2.6.11 and 12 kernels have all present
+ * cpus set. Some day we'll have to fix it up further to handle
+ * a cpu subset. This algorithm fails for two HT chips connected
+ * in tunnel fashion. Eventually this needs real topology
+ * information. There may be some issues with dual core numbering
+ * as well. This needs more work prior to release.
+ */
+ if (!cpumask_empty(tsk_cpus_allowed(current)) &&
+ !cpumask_full(tsk_cpus_allowed(current))) {
+ int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
+ get_online_cpus();
+ for_each_online_cpu(i)
+ if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
+ ipath_cdbg(PROC, "%s[%u] affinity set for "
+ "cpu %d/%d\n", current->comm,
+ current->pid, i, ncpus);
+ curcpu = i;
+ nset++;
+ }
+ put_online_cpus();
+ if (curcpu != -1 && nset != ncpus) {
+ if (npresent) {
+ prefunit = curcpu / (ncpus / npresent);
+ ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
+ "%d cpus/chip, select unit %d\n",
+ current->comm, current->pid,
+ npresent, ncpus, ncpus / npresent,
+ prefunit);
+ }
+ }
+ }
+
+ /*
+ * user ports start at 1, kernel port is 0
+ * For now, we do round-robin access across all chips
+ */
+
+ if (prefunit != -1)
+ devmax = prefunit + 1;
+recheck:
+ for (i = 1; i < maxofallports; i++) {
+ for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+ ndev++) {
+ struct ipath_devdata *dd = ipath_lookup(ndev);
+
+ if (!usable(dd))
+ continue; /* can't use this unit */
+ if (i >= dd->ipath_cfgports)
+ /*
+ * Maxed out on users of this unit. Try
+ * next.
+ */
+ continue;
+ ret = try_alloc_port(dd, i, fp, uinfo);
+ if (!ret)
+ goto done;
+ }
+ }
+
+ if (npresent) {
+ if (nup == 0) {
+ ret = -ENETDOWN;
+ ipath_dbg("No ports available (none initialized "
+ "and ready)\n");
+ } else {
+ if (prefunit > 0) {
+ /* if started above 0, retry from 0 */
+ ipath_cdbg(PROC,
+ "%s[%u] no ports on prefunit "
+ "%d, clear and re-check\n",
+ current->comm, current->pid,
+ prefunit);
+ devmax = ipath_count_units(NULL, NULL,
+ NULL);
+ prefunit = -1;
+ goto recheck;
+ }
+ ret = -EBUSY;
+ ipath_dbg("No ports available\n");
+ }
+ } else {
+ ret = -ENXIO;
+ ipath_dbg("No boards found\n");
+ }
+
+done:
+ return ret;
+}
+
+static int find_shared_port(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int devmax, ndev, i;
+ int ret = 0;
+
+ devmax = ipath_count_units(NULL, NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct ipath_devdata *dd = ipath_lookup(ndev);
+
+ if (!usable(dd))
+ continue;
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ /* Skip ports which are not yet open */
+ if (!pd || !pd->port_cnt)
+ continue;
+ /* Skip port if it doesn't match the requested one */
+ if (pd->port_subport_id != uinfo->spu_subport_id)
+ continue;
+ /* Verify the sharing process matches the master */
+ if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+ pd->userversion != uinfo->spu_userversion ||
+ pd->port_cnt >= pd->port_subport_cnt) {
+ ret = -EINVAL;
+ goto done;
+ }
+ port_fp(fp) = pd;
+ subport_fp(fp) = pd->port_cnt++;
+ pd->port_subpid[subport_fp(fp)] =
+ get_pid(task_pid(current));
+ tidcursor_fp(fp) = 0;
+ pd->active_slaves |= 1 << subport_fp(fp);
+ ipath_cdbg(PROC,
+ "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+ current->comm, current->pid,
+ subport_fp(fp),
+ pd->port_comm, pid_nr(pd->port_pid),
+ dd->ipath_unit, pd->port_port);
+ ret = 1;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+ /* The real work is performed later in ipath_assign_port() */
+ fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+ return fp->private_data ? 0 : -ENOMEM;
+}
+
+/* Get port early, so can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret;
+ int i_minor;
+ unsigned swmajor, swminor;
+
+ /* Check to be sure we haven't already initialized this file */
+ if (port_fp(fp)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /* for now, if major version is different, bail */
+ swmajor = uinfo->spu_userversion >> 16;
+ if (swmajor != IPATH_USER_SWMAJOR) {
+ ipath_dbg("User major version %d not same as driver "
+ "major %d\n", uinfo->spu_userversion >> 16,
+ IPATH_USER_SWMAJOR);
+ ret = -ENODEV;
+ goto done;
+ }
+
+ swminor = uinfo->spu_userversion & 0xffff;
+ if (swminor != IPATH_USER_SWMINOR)
+ ipath_dbg("User minor version %d not same as driver "
+ "minor %d\n", swminor, IPATH_USER_SWMINOR);
+
+ mutex_lock(&ipath_mutex);
+
+ if (ipath_compatible_subports(swmajor, swminor) &&
+ uinfo->spu_subport_cnt &&
+ (ret = find_shared_port(fp, uinfo))) {
+ if (ret > 0)
+ ret = 0;
+ goto done_chk_sdma;
+ }
+
+ i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
+ ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+ (long)file_inode(fp)->i_rdev, i_minor);
+
+ if (i_minor)
+ ret = find_free_port(i_minor - 1, fp, uinfo);
+ else
+ ret = find_best_unit(fp, uinfo);
+
+done_chk_sdma:
+ if (!ret) {
+ struct ipath_filedata *fd = fp->private_data;
+ const struct ipath_portdata *pd = fd->pd;
+ const struct ipath_devdata *dd = pd->port_dd;
+
+ fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
+ dd->ipath_unit,
+ pd->port_port,
+ fd->subport);
+
+ if (!fd->pq)
+ ret = -ENOMEM;
+ }
+
+ mutex_unlock(&ipath_mutex);
+
+done:
+ return ret;
+}
+
+
+static int ipath_do_user_init(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret;
+ struct ipath_portdata *pd = port_fp(fp);
+ struct ipath_devdata *dd;
+ u32 head32;
+
+ /* Subports don't need to initialize anything since master did it. */
+ if (subport_fp(fp)) {
+ ret = wait_event_interruptible(pd->port_wait,
+ !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
+ goto done;
+ }
+
+ dd = pd->port_dd;
+
+ if (uinfo->spu_rcvhdrsize) {
+ ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+ if (ret)
+ goto done;
+ }
+
+ /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+ /* some ports may get extra buffers, calculate that here */
+ if (pd->port_port <= dd->ipath_ports_extrabuf)
+ pd->port_piocnt = dd->ipath_pbufsport + 1;
+ else
+ pd->port_piocnt = dd->ipath_pbufsport;
+
+ /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+ if (pd->port_port <= dd->ipath_ports_extrabuf)
+ pd->port_pio_base = (dd->ipath_pbufsport + 1)
+ * (pd->port_port - 1);
+ else
+ pd->port_pio_base = dd->ipath_ports_extrabuf +
+ dd->ipath_pbufsport * (pd->port_port - 1);
+ pd->port_piobufs = dd->ipath_piobufbase +
+ pd->port_pio_base * dd->ipath_palign;
+ ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+ " first pio %u\n", pd->port_port, pd->port_piobufs,
+ pd->port_piocnt, pd->port_pio_base);
+ ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
+
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+ * array for time being. If pd->port_port > chip-supported,
+ * we need to do extra stuff here to handle by handling overflow
+ * through port 0, someday
+ */
+ ret = ipath_create_rcvhdrq(dd, pd);
+ if (!ret)
+ ret = ipath_create_user_egr(pd);
+ if (ret)
+ goto done;
+
+ /*
+ * set the eager head register for this port to the current values
+ * of the tail pointers, since we don't know if they were
+ * updated on last use of the port.
+ */
+ head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+ ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+ pd->port_lastrcvhdrqtail = -1;
+ ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+ pd->port_port, head32);
+ pd->port_tidcursor = 0; /* start at beginning after open */
+
+ /* initialize poll variables... */
+ pd->port_urgent = 0;
+ pd->port_urgent_poll = 0;
+ pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
+ /*
+ * Now enable the port for receive.
+ * For chips that are set to DMA the tail register to memory
+ * when they change (and when the update bit transitions from
+ * 0 to 1. So for those chips, we turn it off and then back on.
+ * This will (very briefly) affect any other open ports, but the
+ * duration is very short, and therefore isn't an issue. We
+ * explicitly set the in-memory tail copy to 0 beforehand, so we
+ * don't have to wait to be sure the DMA update has happened
+ * (chip resets head/tail to 0 on transition to enable).
+ */
+ set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ if (pd->port_rcvhdrtail_kvaddr)
+ ipath_clear_rcvhdrtail(pd);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl &
+ ~(1ULL << dd->ipath_r_tailupd_shift));
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* Notify any waiting slaves */
+ if (pd->port_subport_cnt) {
+ clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+ wake_up(&pd->port_wait);
+ }
+done:
+ return ret;
+}
+
+/**
+ * unlock_exptid - unlock any expected TID entries port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+ int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+ ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+ pd->port_port);
+ for (i = port_tidbase; i < maxtid; i++) {
+ struct page *ps = dd->ipath_pageshadow[i];
+
+ if (!ps)
+ continue;
+
+ dd->ipath_pageshadow[i] = NULL;
+ pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages_on_close(&ps, 1);
+ cnt++;
+ ipath_stats.sps_pageunlocks++;
+ }
+ if (cnt)
+ ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
+ pd->port_port, cnt);
+
+ if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+ ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+ (unsigned long long) ipath_stats.sps_pagelocks,
+ (unsigned long long)
+ ipath_stats.sps_pageunlocks);
+}
+
+static int ipath_close(struct inode *in, struct file *fp)
+{
+ int ret = 0;
+ struct ipath_filedata *fd;
+ struct ipath_portdata *pd;
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ unsigned port;
+ struct pid *pid;
+
+ ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+ (long)in->i_rdev, fp->private_data);
+
+ mutex_lock(&ipath_mutex);
+
+ fd = fp->private_data;
+ fp->private_data = NULL;
+ pd = fd->pd;
+ if (!pd) {
+ mutex_unlock(&ipath_mutex);
+ goto bail;
+ }
+
+ dd = pd->port_dd;
+
+ /* drain user sdma queue */
+ ipath_user_sdma_queue_drain(dd, fd->pq);
+ ipath_user_sdma_queue_destroy(fd->pq);
+
+ if (--pd->port_cnt) {
+ /*
+ * XXX If the master closes the port before the slave(s),
+ * revoke the mmap for the eager receive queue so
+ * the slave(s) don't wait for receive data forever.
+ */
+ pd->active_slaves &= ~(1 << fd->subport);
+ put_pid(pd->port_subpid[fd->subport]);
+ pd->port_subpid[fd->subport] = NULL;
+ mutex_unlock(&ipath_mutex);
+ goto bail;
+ }
+ /* early; no interrupt users after this */
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ port = pd->port_port;
+ dd->ipath_pd[port] = NULL;
+ pid = pd->port_pid;
+ pd->port_pid = NULL;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+ if (pd->port_rcvwait_to || pd->port_piowait_to
+ || pd->port_rcvnowait || pd->port_pionowait) {
+ ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
+ "%u rcv %u, pio already\n",
+ pd->port_port, pd->port_rcvwait_to,
+ pd->port_piowait_to, pd->port_rcvnowait,
+ pd->port_pionowait);
+ pd->port_rcvwait_to = pd->port_piowait_to =
+ pd->port_rcvnowait = pd->port_pionowait = 0;
+ }
+ if (pd->port_flag) {
+ ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
+ pd->port_port, pd->port_flag);
+ pd->port_flag = 0;
+ }
+
+ if (dd->ipath_kregbase) {
+ /* atomically clear receive enable port and intr avail. */
+ clear_bit(dd->ipath_r_portenable_shift + port,
+ &dd->ipath_rcvctrl);
+ clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+ ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* and read back from chip to be sure that nothing
+ * else is in flight when we do the rest */
+ (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ /* clean up the pkeys for this port user */
+ ipath_clean_part_key(pd, dd);
+ /*
+ * be paranoid, and never write 0's to these, just use an
+ * unused part of the port 0 tail page. Of course,
+ * rcvhdraddr points to a large chunk of memory, so this
+ * could still trash things, but at least it won't trash
+ * page 0, and by disabling the port, it should stop "soon",
+ * even if a packet or two is in already in flight after we
+ * disabled the port.
+ */
+ ipath_write_kreg_port(dd,
+ dd->ipath_kregs->kr_rcvhdrtailaddr, port,
+ dd->ipath_dummy_hdrq_phys);
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+ pd->port_port, dd->ipath_dummy_hdrq_phys);
+
+ ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+ ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+ pd->port_piocnt, 1);
+
+ dd->ipath_f_clear_tids(dd, pd->port_port);
+
+ if (dd->ipath_pageshadow)
+ unlock_expected_tids(pd);
+ ipath_stats.sps_ports--;
+ ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+ pd->port_comm, pid_nr(pid),
+ dd->ipath_unit, port);
+ }
+
+ put_pid(pid);
+ mutex_unlock(&ipath_mutex);
+ ipath_free_pddata(dd, pd); /* after releasing the mutex */
+
+bail:
+ kfree(fd);
+ return ret;
+}
+
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
+ struct ipath_port_info __user *uinfo)
+{
+ struct ipath_port_info info;
+ int nup;
+ int ret;
+ size_t sz;
+
+ (void) ipath_count_units(NULL, &nup, NULL);
+ info.num_active = nup;
+ info.unit = pd->port_dd->ipath_unit;
+ info.port = pd->port_port;
+ info.subport = subport;
+ /* Don't return new fields if old library opened the port. */
+ if (ipath_supports_subports(pd->userversion >> 16,
+ pd->userversion & 0xffff)) {
+ /* Number of user ports available for this device. */
+ info.num_ports = pd->port_dd->ipath_cfgports - 1;
+ info.num_subports = pd->port_subport_cnt;
+ sz = sizeof(info);
+ } else
+ sz = sizeof(info) - 2 * sizeof(u16);
+
+ if (copy_to_user(uinfo, &info, sz)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+ void __user *slave_mask_addr)
+{
+ int ret = 0;
+
+ if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+ ret = -EFAULT;
+ return ret;
+}
+
+static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
+ u32 __user *inflightp)
+{
+ const u32 val = ipath_user_sdma_inflight_counter(pq);
+
+ if (put_user(val, inflightp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ipath_sdma_get_complete(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ u32 __user *completep)
+{
+ u32 val;
+ int err;
+
+ err = ipath_user_sdma_make_progress(dd, pq);
+ if (err < 0)
+ return err;
+
+ val = ipath_user_sdma_complete_counter(pq);
+ if (put_user(val, completep))
+ return -EFAULT;
+
+ return 0;
+}
+
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ const struct ipath_cmd __user *ucmd;
+ struct ipath_portdata *pd;
+ const void __user *src;
+ size_t consumed, copy;
+ struct ipath_cmd cmd;
+ ssize_t ret = 0;
+ void *dest;
+
+ if (count < sizeof(cmd.type)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ucmd = (const struct ipath_cmd __user *) data;
+
+ if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed = sizeof(cmd.type);
+
+ switch (cmd.type) {
+ case IPATH_CMD_ASSIGN_PORT:
+ case __IPATH_CMD_USER_INIT:
+ case IPATH_CMD_USER_INIT:
+ copy = sizeof(cmd.cmd.user_info);
+ dest = &cmd.cmd.user_info;
+ src = &ucmd->cmd.user_info;
+ break;
+ case IPATH_CMD_RECV_CTRL:
+ copy = sizeof(cmd.cmd.recv_ctrl);
+ dest = &cmd.cmd.recv_ctrl;
+ src = &ucmd->cmd.recv_ctrl;
+ break;
+ case IPATH_CMD_PORT_INFO:
+ copy = sizeof(cmd.cmd.port_info);
+ dest = &cmd.cmd.port_info;
+ src = &ucmd->cmd.port_info;
+ break;
+ case IPATH_CMD_TID_UPDATE:
+ case IPATH_CMD_TID_FREE:
+ copy = sizeof(cmd.cmd.tid_info);
+ dest = &cmd.cmd.tid_info;
+ src = &ucmd->cmd.tid_info;
+ break;
+ case IPATH_CMD_SET_PART_KEY:
+ copy = sizeof(cmd.cmd.part_key);
+ dest = &cmd.cmd.part_key;
+ src = &ucmd->cmd.part_key;
+ break;
+ case __IPATH_CMD_SLAVE_INFO:
+ copy = sizeof(cmd.cmd.slave_mask_addr);
+ dest = &cmd.cmd.slave_mask_addr;
+ src = &ucmd->cmd.slave_mask_addr;
+ break;
+ case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg
+ copy = 0;
+ src = NULL;
+ dest = NULL;
+ break;
+ case IPATH_CMD_POLL_TYPE:
+ copy = sizeof(cmd.cmd.poll_type);
+ dest = &cmd.cmd.poll_type;
+ src = &ucmd->cmd.poll_type;
+ break;
+ case IPATH_CMD_ARMLAUNCH_CTRL:
+ copy = sizeof(cmd.cmd.armlaunch_ctrl);
+ dest = &cmd.cmd.armlaunch_ctrl;
+ src = &ucmd->cmd.armlaunch_ctrl;
+ break;
+ case IPATH_CMD_SDMA_INFLIGHT:
+ copy = sizeof(cmd.cmd.sdma_inflight);
+ dest = &cmd.cmd.sdma_inflight;
+ src = &ucmd->cmd.sdma_inflight;
+ break;
+ case IPATH_CMD_SDMA_COMPLETE:
+ copy = sizeof(cmd.cmd.sdma_complete);
+ dest = &cmd.cmd.sdma_complete;
+ src = &ucmd->cmd.sdma_complete;
+ break;
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (copy) {
+ if ((count - consumed) < copy) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (copy_from_user(dest, src, copy)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed += copy;
+ }
+
+ pd = port_fp(fp);
+ if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+ cmd.type != IPATH_CMD_ASSIGN_PORT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ switch (cmd.type) {
+ case IPATH_CMD_ASSIGN_PORT:
+ ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ break;
+ case __IPATH_CMD_USER_INIT:
+ /* backwards compatibility, get port first */
+ ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ /* and fall through to current version. */
+ case IPATH_CMD_USER_INIT:
+ ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ ret = ipath_get_base_info(
+ fp, (void __user *) (unsigned long)
+ cmd.cmd.user_info.spu_base_info,
+ cmd.cmd.user_info.spu_base_info_size);
+ break;
+ case IPATH_CMD_RECV_CTRL:
+ ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
+ break;
+ case IPATH_CMD_PORT_INFO:
+ ret = ipath_port_info(pd, subport_fp(fp),
+ (struct ipath_port_info __user *)
+ (unsigned long) cmd.cmd.port_info);
+ break;
+ case IPATH_CMD_TID_UPDATE:
+ ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
+ break;
+ case IPATH_CMD_TID_FREE:
+ ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
+ break;
+ case IPATH_CMD_SET_PART_KEY:
+ ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+ break;
+ case __IPATH_CMD_SLAVE_INFO:
+ ret = ipath_get_slave_info(pd,
+ (void __user *) (unsigned long)
+ cmd.cmd.slave_mask_addr);
+ break;
+ case IPATH_CMD_PIOAVAILUPD:
+ ipath_force_pio_avail_update(pd->port_dd);
+ break;
+ case IPATH_CMD_POLL_TYPE:
+ pd->poll_type = cmd.cmd.poll_type;
+ break;
+ case IPATH_CMD_ARMLAUNCH_CTRL:
+ if (cmd.cmd.armlaunch_ctrl)
+ ipath_enable_armlaunch(pd->port_dd);
+ else
+ ipath_disable_armlaunch(pd->port_dd);
+ break;
+ case IPATH_CMD_SDMA_INFLIGHT:
+ ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_inflight);
+ break;
+ case IPATH_CMD_SDMA_COMPLETE:
+ ret = ipath_sdma_get_complete(pd->port_dd,
+ user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_complete);
+ break;
+ }
+
+ if (ret >= 0)
+ ret = consumed;
+
+bail:
+ return ret;
+}
+
+static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *filp = iocb->ki_filp;
+ struct ipath_filedata *fp = filp->private_data;
+ struct ipath_portdata *pd = port_fp(filp);
+ struct ipath_user_sdma_queue *pq = fp->pq;
+
+ if (!iter_is_iovec(from) || !from->nr_segs)
+ return -EINVAL;
+
+ return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
+}
+
+static struct class *ipath_class;
+
+static int init_cdev(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp)
+{
+ const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+ struct cdev *cdev = NULL;
+ struct device *device = NULL;
+ int ret;
+
+ cdev = cdev_alloc();
+ if (!cdev) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate cdev for minor %d, %s\n",
+ minor, name);
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = fops;
+ kobject_set_name(&cdev->kobj, name);
+
+ ret = cdev_add(cdev, dev, 1);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not add cdev for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto err_cdev;
+ }
+
+ device = device_create(ipath_class, NULL, dev, NULL, name);
+
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+ "device for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto err_cdev;
+ }
+
+ goto done;
+
+err_cdev:
+ cdev_del(cdev);
+ cdev = NULL;
+
+done:
+ if (ret >= 0) {
+ *cdevp = cdev;
+ *devp = device;
+ } else {
+ *cdevp = NULL;
+ *devp = NULL;
+ }
+
+ return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp)
+{
+ return init_cdev(minor, name, fops, cdevp, devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+ struct device **devp)
+{
+ struct device *dev = *devp;
+
+ if (dev) {
+ device_unregister(dev);
+ *devp = NULL;
+ }
+
+ if (*cdevp) {
+ cdev_del(*cdevp);
+ *cdevp = NULL;
+ }
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+ struct device **devp)
+{
+ cleanup_cdev(cdevp, devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
+static int user_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+ "chrdev region (err %d)\n", -ret);
+ goto done;
+ }
+
+ ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+ if (IS_ERR(ipath_class)) {
+ ret = PTR_ERR(ipath_class);
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+ "device class (err %d)\n", -ret);
+ goto bail;
+ }
+
+ goto done;
+bail:
+ unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+ return ret;
+}
+
+static void user_cleanup(void)
+{
+ if (ipath_class) {
+ class_destroy(ipath_class);
+ ipath_class = NULL;
+ }
+
+ unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+ char name[10];
+ int ret;
+
+ if (atomic_inc_return(&user_count) == 1) {
+ ret = user_init();
+ if (ret < 0) {
+ ipath_dev_err(dd, "Unable to set up user support: "
+ "error %d\n", -ret);
+ goto bail;
+ }
+ ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+ &wildcard_dev);
+ if (ret < 0) {
+ ipath_dev_err(dd, "Could not create wildcard "
+ "minor: error %d\n", -ret);
+ goto bail_user;
+ }
+
+ atomic_set(&user_setup, 1);
+ }
+
+ snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+ ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+ &dd->user_cdev, &dd->user_dev);
+ if (ret < 0)
+ ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+ dd->ipath_unit + 1, name);
+
+ goto bail;
+
+bail_user:
+ user_cleanup();
+bail:
+ return ret;
+}
+
+void ipath_user_remove(struct ipath_devdata *dd)
+{
+ cleanup_cdev(&dd->user_cdev, &dd->user_dev);
+
+ if (atomic_dec_return(&user_count) == 0) {
+ if (atomic_read(&user_setup) == 0)
+ goto bail;
+
+ cleanup_cdev(&wildcard_cdev, &wildcard_dev);
+ user_cleanup();
+
+ atomic_set(&user_setup, 0);
+ }
+bail:
+ return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c
new file mode 100644
index 000000000000..25422a3a7238
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_fs.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct file_operations *fops,
+ void *data)
+{
+ int error;
+ struct inode *inode = new_inode(dir->i_sb);
+
+ if (!inode) {
+ error = -EPERM;
+ goto bail;
+ }
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_private = data;
+ if (S_ISDIR(mode)) {
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
+ inc_nlink(dir);
+ }
+
+ inode->i_fop = fops;
+
+ d_instantiate(dentry, inode);
+ error = 0;
+
+bail:
+ return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+ struct dentry *parent, struct dentry **dentry,
+ const struct file_operations *fops, void *data)
+{
+ int error;
+
+ mutex_lock(&d_inode(parent)->i_mutex);
+ *dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(*dentry))
+ error = ipathfs_mknod(d_inode(parent), *dentry,
+ mode, fops, data);
+ else
+ error = PTR_ERR(*dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+ sizeof ipath_stats);
+}
+
+static const struct file_operations atomic_stats_ops = {
+ .read = atomic_stats_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct infinipath_counters counters;
+ struct ipath_devdata *dd;
+
+ dd = file_inode(file)->i_private;
+ dd->ipath_f_read_counters(dd, &counters);
+
+ return simple_read_from_buffer(buf, count, ppos, &counters,
+ sizeof counters);
+}
+
+static const struct file_operations atomic_counters_ops = {
+ .read = atomic_counters_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct ipath_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+ if ( pos < 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (pos >= sizeof(struct ipath_flash)) {
+ ret = 0;
+ goto bail;
+ }
+
+ if (count > sizeof(struct ipath_flash) - pos)
+ count = sizeof(struct ipath_flash) - pos;
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dd = file_inode(file)->i_private;
+ if (ipath_eeprom_read(dd, pos, tmp, count)) {
+ ipath_dev_err(dd, "failed to read from flash\n");
+ ret = -ENXIO;
+ goto bail_tmp;
+ }
+
+ if (copy_to_user(buf, tmp, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct ipath_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+ if (pos != 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (count != sizeof(struct ipath_flash)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmp, buf, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ dd = file_inode(file)->i_private;
+ if (ipath_eeprom_write(dd, pos, tmp, count)) {
+ ret = -ENXIO;
+ ipath_dev_err(dd, "failed to write to flash\n");
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
+static const struct file_operations flash_ops = {
+ .read = flash_read,
+ .write = flash_write,
+ .llseek = default_llseek,
+};
+
+static int create_device_files(struct super_block *sb,
+ struct ipath_devdata *dd)
+{
+ struct dentry *dir, *tmp;
+ char unit[10];
+ int ret;
+
+ snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+ ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+ &simple_dir_operations, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+ ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+ &atomic_counters_ops, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s/atomic_counters) "
+ "failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+ ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+ &flash_ops, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s/flash) "
+ "failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+ struct dentry *tmp;
+ int ret;
+
+ tmp = lookup_one_len(name, parent, strlen(name));
+
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto bail;
+ }
+
+ spin_lock(&tmp->d_lock);
+ if (simple_positive(tmp)) {
+ dget_dlock(tmp);
+ __d_drop(tmp);
+ spin_unlock(&tmp->d_lock);
+ simple_unlink(d_inode(parent), tmp);
+ } else
+ spin_unlock(&tmp->d_lock);
+
+ ret = 0;
+bail:
+ /*
+ * We don't expect clients to care about the return value, but
+ * it's there if they need it.
+ */
+ return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+ struct ipath_devdata *dd)
+{
+ struct dentry *dir, *root;
+ char unit[10];
+ int ret;
+
+ root = dget(sb->s_root);
+ mutex_lock(&d_inode(root)->i_mutex);
+ snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+ dir = lookup_one_len(unit, root, strlen(unit));
+
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ printk(KERN_ERR "Lookup of %s failed\n", unit);
+ goto bail;
+ }
+
+ remove_file(dir, "flash");
+ remove_file(dir, "atomic_counters");
+ d_delete(dir);
+ ret = simple_rmdir(d_inode(root), dir);
+
+bail:
+ mutex_unlock(&d_inode(root)->i_mutex);
+ dput(root);
+ return ret;
+}
+
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+ int silent)
+{
+ struct ipath_devdata *dd, *tmp;
+ unsigned long flags;
+ int ret;
+
+ static struct tree_descr files[] = {
+ [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+ {""},
+ };
+
+ ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+ if (ret) {
+ printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ ret = create_device_files(sb, dd);
+ if (ret)
+ goto bail;
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+ return ret;
+}
+
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct dentry *ret;
+ ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+ if (!IS_ERR(ret))
+ ipath_super = ret->d_sb;
+ return ret;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+ kill_litter_super(s);
+ ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (ipath_super == NULL) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = create_device_files(ipath_super, dd);
+
+bail:
+ return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (ipath_super == NULL) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = remove_device_files(ipath_super, dd);
+
+bail:
+ return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ipathfs",
+ .mount = ipathfs_mount,
+ .kill_sb = ipathfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init ipath_init_ipathfs(void)
+{
+ return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+ unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c
new file mode 100644
index 000000000000..7cc305488a3d
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_iba6110.c
@@ -0,0 +1,1940 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT chip.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/htirq.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
+
+
+/*
+ * This lists the InfiniPath registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification. Since they're only used in this file, they
+ * don't pollute the rest of the source.
+*/
+
+struct _infinipath_do_not_use_kernel_regs {
+ unsigned long long Revision;
+ unsigned long long Control;
+ unsigned long long PageAlign;
+ unsigned long long PortCnt;
+ unsigned long long DebugPortSelect;
+ unsigned long long DebugPort;
+ unsigned long long SendRegBase;
+ unsigned long long UserRegBase;
+ unsigned long long CounterRegBase;
+ unsigned long long Scratch;
+ unsigned long long ReservedMisc1;
+ unsigned long long InterruptConfig;
+ unsigned long long IntBlocked;
+ unsigned long long IntMask;
+ unsigned long long IntStatus;
+ unsigned long long IntClear;
+ unsigned long long ErrorMask;
+ unsigned long long ErrorStatus;
+ unsigned long long ErrorClear;
+ unsigned long long HwErrMask;
+ unsigned long long HwErrStatus;
+ unsigned long long HwErrClear;
+ unsigned long long HwDiagCtrl;
+ unsigned long long MDIO;
+ unsigned long long IBCStatus;
+ unsigned long long IBCCtrl;
+ unsigned long long ExtStatus;
+ unsigned long long ExtCtrl;
+ unsigned long long GPIOOut;
+ unsigned long long GPIOMask;
+ unsigned long long GPIOStatus;
+ unsigned long long GPIOClear;
+ unsigned long long RcvCtrl;
+ unsigned long long RcvBTHQP;
+ unsigned long long RcvHdrSize;
+ unsigned long long RcvHdrCnt;
+ unsigned long long RcvHdrEntSize;
+ unsigned long long RcvTIDBase;
+ unsigned long long RcvTIDCnt;
+ unsigned long long RcvEgrBase;
+ unsigned long long RcvEgrCnt;
+ unsigned long long RcvBufBase;
+ unsigned long long RcvBufSize;
+ unsigned long long RxIntMemBase;
+ unsigned long long RxIntMemSize;
+ unsigned long long RcvPartitionKey;
+ unsigned long long ReservedRcv[10];
+ unsigned long long SendCtrl;
+ unsigned long long SendPIOBufBase;
+ unsigned long long SendPIOSize;
+ unsigned long long SendPIOBufCnt;
+ unsigned long long SendPIOAvailAddr;
+ unsigned long long TxIntMemBase;
+ unsigned long long TxIntMemSize;
+ unsigned long long ReservedSend[9];
+ unsigned long long SendBufferError;
+ unsigned long long SendBufferErrorCONT1;
+ unsigned long long SendBufferErrorCONT2;
+ unsigned long long SendBufferErrorCONT3;
+ unsigned long long ReservedSBE[4];
+ unsigned long long RcvHdrAddr0;
+ unsigned long long RcvHdrAddr1;
+ unsigned long long RcvHdrAddr2;
+ unsigned long long RcvHdrAddr3;
+ unsigned long long RcvHdrAddr4;
+ unsigned long long RcvHdrAddr5;
+ unsigned long long RcvHdrAddr6;
+ unsigned long long RcvHdrAddr7;
+ unsigned long long RcvHdrAddr8;
+ unsigned long long ReservedRHA[7];
+ unsigned long long RcvHdrTailAddr0;
+ unsigned long long RcvHdrTailAddr1;
+ unsigned long long RcvHdrTailAddr2;
+ unsigned long long RcvHdrTailAddr3;
+ unsigned long long RcvHdrTailAddr4;
+ unsigned long long RcvHdrTailAddr5;
+ unsigned long long RcvHdrTailAddr6;
+ unsigned long long RcvHdrTailAddr7;
+ unsigned long long RcvHdrTailAddr8;
+ unsigned long long ReservedRHTA[7];
+ unsigned long long Sync; /* Software only */
+ unsigned long long Dump; /* Software only */
+ unsigned long long SimVer; /* Software only */
+ unsigned long long ReservedSW[5];
+ unsigned long long SerdesConfig0;
+ unsigned long long SerdesConfig1;
+ unsigned long long SerdesStatus;
+ unsigned long long XGXSConfig;
+ unsigned long long ReservedSW2[4];
+};
+
+struct _infinipath_do_not_use_counters {
+ __u64 LBIntCnt;
+ __u64 LBFlowStallCnt;
+ __u64 Reserved1;
+ __u64 TxUnsupVLErrCnt;
+ __u64 TxDataPktCnt;
+ __u64 TxFlowPktCnt;
+ __u64 TxDwordCnt;
+ __u64 TxLenErrCnt;
+ __u64 TxMaxMinLenErrCnt;
+ __u64 TxUnderrunCnt;
+ __u64 TxFlowStallCnt;
+ __u64 TxDroppedPktCnt;
+ __u64 RxDroppedPktCnt;
+ __u64 RxDataPktCnt;
+ __u64 RxFlowPktCnt;
+ __u64 RxDwordCnt;
+ __u64 RxLenErrCnt;
+ __u64 RxMaxMinLenErrCnt;
+ __u64 RxICRCErrCnt;
+ __u64 RxVCRCErrCnt;
+ __u64 RxFlowCtrlErrCnt;
+ __u64 RxBadFormatCnt;
+ __u64 RxLinkProblemCnt;
+ __u64 RxEBPCnt;
+ __u64 RxLPCRCErrCnt;
+ __u64 RxBufOvflCnt;
+ __u64 RxTIDFullErrCnt;
+ __u64 RxTIDValidErrCnt;
+ __u64 RxPKeyMismatchCnt;
+ __u64 RxP0HdrEgrOvflCnt;
+ __u64 RxP1HdrEgrOvflCnt;
+ __u64 RxP2HdrEgrOvflCnt;
+ __u64 RxP3HdrEgrOvflCnt;
+ __u64 RxP4HdrEgrOvflCnt;
+ __u64 RxP5HdrEgrOvflCnt;
+ __u64 RxP6HdrEgrOvflCnt;
+ __u64 RxP7HdrEgrOvflCnt;
+ __u64 RxP8HdrEgrOvflCnt;
+ __u64 Reserved6;
+ __u64 Reserved7;
+ __u64 IBStatusChangeCnt;
+ __u64 IBLinkErrRecoveryCnt;
+ __u64 IBLinkDownedCnt;
+ __u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+ struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+ struct _infinipath_do_not_use_counters, field) / sizeof(u64))
+
+static const struct ipath_kregs ipath_ht_kregs = {
+ .kr_control = IPATH_KREG_OFFSET(Control),
+ .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+ .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+ .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+ .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+ .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+ .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+ .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+ .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+ .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+ .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+ .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+ .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+ .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+ .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+ .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+ .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+ .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+ .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+ .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+ .kr_intclear = IPATH_KREG_OFFSET(IntClear),
+ .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+ .kr_intmask = IPATH_KREG_OFFSET(IntMask),
+ .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+ .kr_mdio = IPATH_KREG_OFFSET(MDIO),
+ .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+ .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+ .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+ .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+ .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+ .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+ .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+ .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+ .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+ .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+ .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+ .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+ .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+ .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+ .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+ .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+ .kr_revision = IPATH_KREG_OFFSET(Revision),
+ .kr_scratch = IPATH_KREG_OFFSET(Scratch),
+ .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+ .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+ .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+ .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+ .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+ .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+ .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+ .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+ .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+ .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+ .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+ .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+ .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+ .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+ /*
+ * These should not be used directly via ipath_write_kreg64(),
+ * use them with ipath_write_kreg64_port(),
+ */
+ .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+ .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+ .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+ .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+ .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+ .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+ .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+ .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+ .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+ .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+ .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+ .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+ .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+ .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+ /* calc from Reg_CounterRegBase + offset */
+ .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+ .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+ .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+ .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+ .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+ .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+ .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+ .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+ .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+ .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+ .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+ .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+ .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+ .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+ .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+ .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+ .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+ .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+ .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+ .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+ .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL
+
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000
+
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET 0x7ULL
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+ INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+ INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+ INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+ INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+ INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+ (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+ (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \
+ infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \
+ infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \
+ infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \
+ infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+ char *msg, size_t msgl)
+{
+ char bitsmsg[64];
+ ipath_err_t crcbits = hwerrs &
+ (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+ /* don't check if 8bit HT */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+ crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+ /* don't check if 8bit HT */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+ crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+ /*
+ * we'll want to ignore link errors on link that is
+ * not in use, if any. For now, complain about both
+ */
+ if (crcbits) {
+ u16 ctrl0, ctrl1;
+ snprintf(bitsmsg, sizeof bitsmsg,
+ "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
+ !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+ "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+ ? "1 (B)" : "0+1 (A+B)"),
+ !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+ : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+ "0+1"), (unsigned long long) crcbits);
+ strlcat(msg, bitsmsg, msgl);
+
+ /*
+ * print extra info for debugging. slave/primary
+ * config word 4, 8 (link control 0, 1)
+ */
+
+ if (pci_read_config_word(dd->pcidev,
+ dd->ipath_ht_slave_off + 0x4,
+ &ctrl0))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkctrl0 of slave/primary "
+ "config block\n");
+ else if (!(ctrl0 & 1 << 6))
+ /* not if EOC bit set */
+ ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+ ((ctrl0 >> 8) & 7) ? " CRC" : "",
+ ((ctrl0 >> 4) & 1) ? "linkfail" :
+ "");
+ if (pci_read_config_word(dd->pcidev,
+ dd->ipath_ht_slave_off + 0x8,
+ &ctrl1))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkctrl1 of slave/primary "
+ "config block\n");
+ else if (!(ctrl1 & 1 << 6))
+ /* not if EOC bit set */
+ ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+ ((ctrl1 >> 8) & 7) ? " CRC" : "",
+ ((ctrl1 >> 4) & 1) ? "linkfail" :
+ "");
+
+ /* disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~crcbits;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ ipath_dbg("HT crc errs: %s\n", msg);
+ } else
+ ipath_dbg("ignoring HT crc errors 0x%llx, "
+ "not in use\n", (unsigned long long)
+ (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+ _IPATH_HTLINK1_CRCBITS)));
+}
+
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+ INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+ INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+ INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+ INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+ INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+ INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+ INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+ INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+ INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+ << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+ << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static void ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+ ++ipath_stats.sps_txeparity;
+ dev_info(&dd->pcidev->dev,
+ "Recovering from TXE PIO parity error\n");
+}
+
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use. Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue. We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+ size_t msgl)
+{
+ ipath_err_t hwerrs;
+ u32 bits, ctrl;
+ int isfatal = 0;
+ char bitsmsg[64];
+ int log_idx;
+
+ hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+ if (!hwerrs) {
+ ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+ /*
+ * better than printing cofusing messages
+ * This seems to be related to clearing the crc error, or
+ * the pll error during init.
+ */
+ goto bail;
+ } else if (hwerrs == -1LL) {
+ ipath_dev_err(dd, "Read of hardware error status failed "
+ "(all bits set); ignoring\n");
+ goto bail;
+ }
+ ipath_stats.sps_hwerrs++;
+
+ /* Always clear the error status register, except MEMBISTFAIL,
+ * regardless of whether we continue or stop using the chip.
+ * We want that set so we know it failed, even across driver reload.
+ * We'll still ignore it in the hwerrmask. We do this partly for
+ * diagnostics, but also for support */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+ hwerrs &= dd->ipath_hwerrmask;
+
+ /* We log some errors to EEPROM, check if we have any of those. */
+ for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
+ if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
+ ipath_inc_eeprom_err(dd, log_idx, 1);
+
+ /*
+ * make sure we get this much out, unless told to be quiet,
+ * it's a parity error we may recover from,
+ * or it's occurred within the last 5 seconds
+ */
+ if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+ RXE_EAGER_PARITY)) ||
+ (ipath_debug & __IPATH_VERBDBG))
+ dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+ "(cleared)\n", (unsigned long long) hwerrs);
+ dd->ipath_lasthwerror |= hwerrs;
+
+ if (hwerrs & ~dd->ipath_hwe_bitsextant)
+ ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+ "%llx set\n", (unsigned long long)
+ (hwerrs & ~dd->ipath_hwe_bitsextant));
+
+ ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+ if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
+ /*
+ * parity errors in send memory are recoverable,
+ * just cancel the send (if indicated in * sendbuffererror),
+ * count the occurrence, unfreeze (if no other handled
+ * hardware error bits are set), and continue. They can
+ * occur if a processor speculative read is done to the PIO
+ * buffer while we are sending a packet, for example.
+ */
+ if (hwerrs & TXE_PIO_PARITY) {
+ ipath_ht_txe_recover(dd);
+ hwerrs &= ~TXE_PIO_PARITY;
+ }
+
+ if (!hwerrs) {
+ ipath_dbg("Clearing freezemode on ignored or "
+ "recovered hardware error\n");
+ ipath_clear_freeze(dd);
+ }
+ }
+
+ *msg = '\0';
+
+ /*
+ * may someday want to decode into which bits are which
+ * functional area for parity errors, etc.
+ */
+ if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+ << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+ bits = (u32) ((hwerrs >>
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+ INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+ snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+ bits);
+ strlcat(msg, bitsmsg, msgl);
+ }
+
+ ipath_format_hwerrors(hwerrs,
+ ipath_6110_hwerror_msgs,
+ ARRAY_SIZE(ipath_6110_hwerror_msgs),
+ msg, msgl);
+
+ if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+ hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+ if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+ strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \
+ INFINIPATH_HWE_COREPLL_RFSLIP | \
+ INFINIPATH_HWE_HTBPLL_FBSLIP | \
+ INFINIPATH_HWE_HTBPLL_RFSLIP | \
+ INFINIPATH_HWE_HTAPLL_FBSLIP | \
+ INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+ if (hwerrs & _IPATH_PLL_FAIL) {
+ snprintf(bitsmsg, sizeof bitsmsg,
+ "[PLL failed (%llx), InfiniPath hardware unusable]",
+ (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+ strlcat(msg, bitsmsg, msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+
+ if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+ /*
+ * If it occurs, it is left masked since the eternal
+ * interface is unused
+ */
+ dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+
+ if (hwerrs) {
+ /*
+ * if any set that we aren't ignoring; only
+ * make the complaint once, in case it's stuck
+ * or recurring, and we get here multiple
+ * times.
+ * force link down, so switch knows, and
+ * LEDs are turned off
+ */
+ if (dd->ipath_flags & IPATH_INITTED) {
+ ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+ ipath_setup_ht_setextled(dd,
+ INFINIPATH_IBCS_L_STATE_DOWN,
+ INFINIPATH_IBCS_LT_STATE_DISABLED);
+ ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+ "mode), no longer usable, SN %.16s\n",
+ dd->ipath_serial);
+ isfatal = 1;
+ }
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ /* mark as having had error */
+ *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+ /*
+ * mark as not usable, at a minimum until driver
+ * is reloaded, probably until reboot, since no
+ * other reset is possible.
+ */
+ dd->ipath_flags &= ~IPATH_INITTED;
+ }
+ else
+ *msg = 0; /* recovered from all of them */
+ if (*msg)
+ ipath_dev_err(dd, "%s hardware error\n", msg);
+ if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+ /*
+ * for status file; if no trailing brace is copied,
+ * we'll know it was truncated.
+ */
+ snprintf(dd->ipath_freezemsg,
+ dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+ size_t namelen)
+{
+ char *n = NULL;
+ u8 boardrev = dd->ipath_boardrev;
+ int ret = 0;
+
+ switch (boardrev) {
+ case 5:
+ /*
+ * original production board; two production levels, with
+ * different serial number ranges. See ipath_ht_early_init() for
+ * case where we enable IPATH_GPIO_INTR for later serial # range.
+ * Original 112* serial number is no longer supported.
+ */
+ n = "InfiniPath_QHT7040";
+ break;
+ case 7:
+ /* small form factor production board */
+ n = "InfiniPath_QHT7140";
+ break;
+ default: /* don't know, just print the number */
+ ipath_dev_err(dd, "Don't yet know about board "
+ "with ID %u\n", boardrev);
+ snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
+ boardrev);
+ break;
+ }
+ if (n)
+ snprintf(name, namelen, "%s", n);
+
+ if (ret) {
+ ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
+ goto bail;
+ }
+ if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+ dd->ipath_minrev > 4)) {
+ /*
+ * This version of the driver only supports Rev 3.2 - 3.4
+ */
+ ipath_dev_err(dd,
+ "Unsupported InfiniPath hardware revision %u.%u!\n",
+ dd->ipath_majrev, dd->ipath_minrev);
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * pkt/word counters are 32 bit, and therefore wrap fast enough
+ * that we snapshot them from a timer, and maintain 64 bit shadow
+ * copies
+ */
+ dd->ipath_flags |= IPATH_32BITCOUNTERS;
+ dd->ipath_flags |= IPATH_GPIO_INTR;
+ if (dd->ipath_lbus_speed != 800)
+ ipath_dev_err(dd,
+ "Incorrectly configured for HT @ %uMHz\n",
+ dd->ipath_lbus_speed);
+
+ /*
+ * set here, not in ipath_init_*_funcs because we have to do
+ * it after we can read chip registers.
+ */
+ dd->ipath_ureg_align =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
+bail:
+ return ret;
+}
+
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+ u8 linkerr, link_off, i;
+
+ for (i = 0; i < 2; i++) {
+ link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+ if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkerror%d of HT slave/primary block\n",
+ i);
+ else if (linkerr & 0xf0) {
+ ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+ "clearing\n", linkerr >> 4, i);
+ /*
+ * writing the linkerr bits that are set should
+ * clear them
+ */
+ if (pci_write_config_byte(dd->pcidev, link_off,
+ linkerr))
+ ipath_dbg("Failed write to clear HT "
+ "linkerror%d\n", i);
+ if (pci_read_config_byte(dd->pcidev, link_off,
+ &linkerr))
+ dev_info(&dd->pcidev->dev,
+ "Couldn't reread linkerror%d of "
+ "HT slave/primary block\n", i);
+ else if (linkerr & 0xf0)
+ dev_info(&dd->pcidev->dev,
+ "HT linkerror%d bits 0x%x "
+ "couldn't be cleared\n",
+ i, linkerr >> 4);
+ }
+ }
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+ ipath_dbg("No reset possible for this InfiniPath hardware\n");
+ return 0;
+}
+
+#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 is slave/primary block. Clear any HT CRC
+ * errors. We only bother to do this at load time, because it's OK if
+ * it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway. Also need to not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width. For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+ int pos, u8 cap_type)
+{
+ u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+ u16 linkctrl = 0;
+ int i;
+
+ dd->ipath_ht_slave_off = pos;
+ /* command word, master_host bit */
+ /* master host || slave */
+ if ((cap_type >> 2) & 1)
+ link_a_b_off = 4;
+ else
+ link_a_b_off = 0;
+ ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+ link_a_b_off ? 1 : 0,
+ link_a_b_off ? 'B' : 'A');
+
+ link_a_b_off += pos;
+
+ /*
+ * check both link control registers; clear both HT CRC sets if
+ * necessary.
+ */
+ for (i = 0; i < 2; i++) {
+ link_off = pos + i * 4 + 0x4;
+ if (pci_read_config_word(pdev, link_off, &linkctrl))
+ ipath_dev_err(dd, "Couldn't read HT link control%d "
+ "register\n", i);
+ else if (linkctrl & (0xf << 8)) {
+ ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+ "bits %x\n", i, linkctrl & (0xf << 8));
+ /*
+ * now write them back to clear the error.
+ */
+ pci_write_config_word(pdev, link_off,
+ linkctrl & (0xf << 8));
+ }
+ }
+
+ /*
+ * As with HT CRC bits, same for protocol errors that might occur
+ * during boot.
+ */
+ for (i = 0; i < 2; i++) {
+ link_off = pos + i * 4 + 0xd;
+ if (pci_read_config_byte(pdev, link_off, &linkerr))
+ dev_info(&pdev->dev, "Couldn't read linkerror%d "
+ "of HT slave/primary block\n", i);
+ else if (linkerr & 0xf0) {
+ ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+ "clearing\n", linkerr >> 4, i);
+ /*
+ * writing the linkerr bits that are set will clear
+ * them
+ */
+ if (pci_write_config_byte
+ (pdev, link_off, linkerr))
+ ipath_dbg("Failed write to clear HT "
+ "linkerror%d\n", i);
+ if (pci_read_config_byte(pdev, link_off, &linkerr))
+ dev_info(&pdev->dev, "Couldn't reread "
+ "linkerror%d of HT slave/primary "
+ "block\n", i);
+ else if (linkerr & 0xf0)
+ dev_info(&pdev->dev, "HT linkerror%d bits "
+ "0x%x couldn't be cleared\n",
+ i, linkerr >> 4);
+ }
+ }
+
+ /*
+ * this is just for our link to the host, not devices connected
+ * through tunnel.
+ */
+
+ if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+ ipath_dev_err(dd, "Couldn't read HT link width "
+ "config register\n");
+ else {
+ u32 width;
+ switch (linkwidth & 7) {
+ case 5:
+ width = 4;
+ break;
+ case 4:
+ width = 2;
+ break;
+ case 3:
+ width = 32;
+ break;
+ case 1:
+ width = 16;
+ break;
+ case 0:
+ default: /* if wrong, assume 8 bit */
+ width = 8;
+ break;
+ }
+
+ dd->ipath_lbus_width = width;
+
+ if (linkwidth != 0x11) {
+ ipath_dev_err(dd, "Not configured for 16 bit HT "
+ "(%x)\n", linkwidth);
+ if (!(linkwidth & 0xf)) {
+ ipath_dbg("Will ignore HT lane1 errors\n");
+ dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+ }
+ }
+ }
+
+ /*
+ * this is just for our link to the host, not devices connected
+ * through tunnel.
+ */
+ if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+ ipath_dev_err(dd, "Couldn't read HT link frequency "
+ "config register\n");
+ else {
+ u32 speed;
+ switch (linkwidth & 0xf) {
+ case 6:
+ speed = 1000;
+ break;
+ case 5:
+ speed = 800;
+ break;
+ case 4:
+ speed = 600;
+ break;
+ case 3:
+ speed = 500;
+ break;
+ case 2:
+ speed = 400;
+ break;
+ case 1:
+ speed = 300;
+ break;
+ default:
+ /*
+ * assume reserved and vendor-specific are 200...
+ */
+ case 0:
+ speed = 200;
+ break;
+ }
+ dd->ipath_lbus_speed = speed;
+ }
+
+ snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
+ "HyperTransport,%uMHz,x%u\n",
+ dd->ipath_lbus_speed,
+ dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (dd->ipath_intconfig) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+ dd->ipath_intconfig); /* interrupt address */
+ ret = 0;
+ } else {
+ ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+ "interrupt address\n");
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
+ struct ht_irq_msg *msg)
+{
+ struct ipath_devdata *dd = pci_get_drvdata(dev);
+ u64 prev_intconfig = dd->ipath_intconfig;
+
+ dd->ipath_intconfig = msg->address_lo;
+ dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
+
+ /*
+ * If the previous value of dd->ipath_intconfig is zero, we're
+ * getting configured for the first time, and must not program the
+ * intconfig register here (it will be programmed later, when the
+ * hardware is ready). Otherwise, we should.
+ */
+ if (prev_intconfig)
+ ipath_ht_intconfig(dd);
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware. It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+ struct pci_dev *pdev)
+{
+ int pos, ret;
+
+ ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
+ if (ret < 0) {
+ ipath_dev_err(dd, "Couldn't create interrupt handler: "
+ "err %d\n", ret);
+ goto bail;
+ }
+ dd->ipath_irq = ret;
+ ret = 0;
+
+ /*
+ * Handle clearing CRC errors in linkctrl register if necessary. We
+ * do this early, before we ever enable errors or hardware errors,
+ * mostly to avoid causing the chip to enter freeze mode.
+ */
+ pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
+ if (!pos) {
+ ipath_dev_err(dd, "Couldn't find HyperTransport "
+ "capability; no interrupts\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+ do {
+ u8 cap_type;
+
+ /*
+ * The HT capability type byte is 3 bytes after the
+ * capability byte.
+ */
+ if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+ dev_info(&pdev->dev, "Couldn't read config "
+ "command @ %d\n", pos);
+ continue;
+ }
+ if (!(cap_type & 0xE0))
+ slave_or_pri_blk(dd, pdev, pos, cap_type);
+ } while ((pos = pci_find_next_capability(pdev, pos,
+ PCI_CAP_ID_HT)));
+
+ dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT chip, not for all chips
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link. For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note: We try to match the Mellanox HCA LED behavior as best
+ * we can. Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate. That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+ u64 lst, u64 ltst)
+{
+ u64 extctl;
+ unsigned long flags = 0;
+
+ /* the diags use the LED to indicate diag info, so we leave
+ * the external LED alone when the diags are running */
+ if (ipath_diag_inuse)
+ return;
+
+ /* Allow override of LED display for, e.g. Locating system in rack */
+ if (dd->ipath_led_override) {
+ ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
+ ? INFINIPATH_IBCS_LT_STATE_LINKUP
+ : INFINIPATH_IBCS_LT_STATE_DISABLED;
+ lst = (dd->ipath_led_override & IPATH_LED_LOG)
+ ? INFINIPATH_IBCS_L_STATE_ACTIVE
+ : INFINIPATH_IBCS_L_STATE_DOWN;
+ }
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ /*
+ * start by setting both LED control bits to off, then turn
+ * on the appropriate bit(s).
+ */
+ if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+ /*
+ * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+ * is inverted, because it is normally used to indicate
+ * a hardware fault at reset, if there were errors
+ */
+ extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+ | INFINIPATH_EXTC_LEDGBLERR_OFF;
+ if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+ extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+ if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+ extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+ }
+ else {
+ extctl = dd->ipath_extctrl &
+ ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+ INFINIPATH_EXTC_LED2PRIPORT_ON);
+ if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+ extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+ if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+ extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+ }
+ dd->ipath_extctrl = extctl;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+}
+
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
+{
+ /*
+ * setup the register offsets, since they are different for each
+ * chip
+ */
+ dd->ipath_kregs = &ipath_ht_kregs;
+ dd->ipath_cregs = &ipath_ht_cregs;
+
+ dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+ dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+ dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+ dd->ipath_gpio_scl = IPATH_GPIO_SCL;
+
+ /*
+ * Fill in data for field-values that change in newer chips.
+ * We dynamically specify only the mask for LINKTRAININGSTATE
+ * and only the shift for LINKSTATE, as they are the only ones
+ * that change. Also precalculate the 3 link states of interest
+ * and the combined mask.
+ */
+ dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+ dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+ dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+ dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+ dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+ dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+ dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+ /*
+ * Fill in data for ibcc field-values that change in newer chips.
+ * We dynamically specify only the mask for LINKINITCMD
+ * and only the shift for LINKCMD and MAXPKTLEN, as they are
+ * the only ones that change.
+ */
+ dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+ dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+ dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+ /* Fill in shifts for RcvCtrl. */
+ dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+ dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+ dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+ dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
+ dd->ipath_i_bitsextant =
+ (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+ (INFINIPATH_I_RCVAVAIL_MASK <<
+ INFINIPATH_I_RCVAVAIL_SHIFT) |
+ INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+ INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+ dd->ipath_e_bitsextant =
+ INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+ INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+ INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+ INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+ INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+ INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+ INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+ INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+ INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+ INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+ INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+ INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+ INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+ INFINIPATH_E_HARDWARE;
+
+ dd->ipath_hwe_bitsextant =
+ (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+ (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+ (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+ INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+ INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+ INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+ INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+ INFINIPATH_HWE_HTCMISCERR4 |
+ INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+ INFINIPATH_HWE_HTCMISCERR7 |
+ INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+ INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+ INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+ INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+ INFINIPATH_HWE_MEMBISTFAILED |
+ INFINIPATH_HWE_COREPLL_FBSLIP |
+ INFINIPATH_HWE_COREPLL_RFSLIP |
+ INFINIPATH_HWE_HTBPLL_FBSLIP |
+ INFINIPATH_HWE_HTBPLL_RFSLIP |
+ INFINIPATH_HWE_HTAPLL_FBSLIP |
+ INFINIPATH_HWE_HTAPLL_RFSLIP |
+ INFINIPATH_HWE_SERDESPLLFAILED |
+ INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+ INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+ dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+ dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+ dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+ dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
+
+ /*
+ * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+ * 2 is Some Misc, 3 is reserved for future.
+ */
+ dd->ipath_eep_st_masks[0].hwerrs_to_log =
+ INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
+
+ dd->ipath_eep_st_masks[1].hwerrs_to_log =
+ INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
+
+ dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
+
+ dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+ dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+ dd->ipath_link_speed_supported = IPATH_IB_SDR;
+ dd->ipath_link_width_enabled = IB_WIDTH_4X;
+ dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+ /* these can't change for this chip, so set once */
+ dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+ dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
+}
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+ ipath_err_t val;
+ u64 extsval;
+
+ extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+ if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+ ipath_dev_err(dd, "MemBIST did not complete!\n");
+ if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+ ipath_dbg("MemBIST corrected\n");
+
+ ipath_check_htlink(dd);
+
+ /* barring bugs, all hwerrors become interrupts, which can */
+ val = -1LL;
+ /* don't look at crc lane1 if 8 bit */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+ val &= ~infinipath_hwe_htclnkabyte1crcerr;
+ /* don't look at crc lane1 if 8 bit */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+ val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+ /*
+ * disable RXDSYNCMEMPARITY because external serdes is unused,
+ * and therefore the logic will never be used or initialized,
+ * and uninitialized state will normally result in this error
+ * being asserted. Similarly for the external serdess pll
+ * lock signal.
+ */
+ val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+ INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+ /*
+ * Disable MISCERR4 because of an inversion in the HT core
+ * logic checking for errors that cause this bit to be set.
+ * The errata can also cause the protocol error bit to be set
+ * in the HT config space linkerror register(s).
+ */
+ val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+ /*
+ * PLL ignored because unused MDIO interface has a logic problem
+ */
+ if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+ val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+ dd->ipath_hwerrmask = val;
+}
+
+
+
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+ u64 val, config1;
+ int ret = 0, change = 0;
+
+ ipath_dbg("Trying to bringup serdes\n");
+
+ if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+ INFINIPATH_HWE_SERDESPLLFAILED)
+ {
+ ipath_dbg("At start, serdes PLL failed bit set in "
+ "hwerrstatus, clearing and continuing\n");
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ INFINIPATH_HWE_SERDESPLLFAILED);
+ }
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+ config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+ ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+ "config1=%llx, sstatus=%llx xgxs %llx\n",
+ (unsigned long long) val, (unsigned long long) config1,
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+ /* force reset on */
+ val |= INFINIPATH_SERDC0_RESET_PLL
+ /* | INFINIPATH_SERDC0_RESET_MASK */
+ ;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+ udelay(15); /* need pll reset set at least for a bit */
+
+ if (val & INFINIPATH_SERDC0_RESET_PLL) {
+ u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+ /* set lane resets, and tx idle, during pll reset */
+ val2 |= INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE;
+ ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+ "%llx)\n", (unsigned long long) val2);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+ val2);
+ /*
+ * be sure chip saw it
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ /*
+ * need pll reset clear at least 11 usec before lane
+ * resets cleared; give it a few more
+ */
+ udelay(15);
+ val = val2; /* for check below */
+ }
+
+ if (val & (INFINIPATH_SERDC0_RESET_PLL |
+ INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE)) {
+ val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+ INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE);
+ /* clear them */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+ val);
+ }
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ if (val & INFINIPATH_XGXS_RESET) {
+ /* normally true after boot */
+ val &= ~INFINIPATH_XGXS_RESET;
+ change = 1;
+ }
+ if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
+ INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
+ /* need to compensate for Tx inversion in partner */
+ val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+ INFINIPATH_XGXS_RX_POL_SHIFT);
+ val |= dd->ipath_rx_pol_inv <<
+ INFINIPATH_XGXS_RX_POL_SHIFT;
+ change = 1;
+ }
+ if (change)
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+ /* clear current and de-emphasis bits */
+ config1 &= ~0x0ffffffff00ULL;
+ /* set current to 20ma */
+ config1 |= 0x00000000000ULL;
+ /* set de-emphasis to -5.68dB */
+ config1 |= 0x0cccc000000ULL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+ ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+ "config1=%llx, sstatus=%llx xgxs %llx\n",
+ (unsigned long long) val, (unsigned long long) config1,
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+ return ret; /* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ * driver is being unloaded
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+ u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+ val |= INFINIPATH_SERDC0_TXIDLE;
+ ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+ (unsigned long long) val);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/**
+ * ipath_pe_put_tid - write a TID in chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+ u64 __iomem *tidptr, u32 type,
+ unsigned long pa)
+{
+ if (!dd->ipath_kregbase)
+ return;
+
+ if (pa != dd->ipath_tidinvalid) {
+ if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+ dev_info(&dd->pcidev->dev,
+ "physaddr %lx has more than "
+ "40 bits, using only 40!!!\n", pa);
+ pa &= INFINIPATH_RT_ADDR_MASK;
+ }
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ pa |= dd->ipath_tidtemplate;
+ else {
+ /* in words (fixed, full page). */
+ u64 lenvalid = PAGE_SIZE >> 2;
+ lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ pa |= lenvalid | INFINIPATH_RT_VALID;
+ }
+ }
+
+ writeq(pa, tidptr);
+}
+
+
+/**
+ * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+ u64 __iomem *tidbase;
+ int i;
+
+ if (!dd->ipath_kregbase)
+ return;
+
+ ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+ /*
+ * need to invalidate all of the expected TID entries for this
+ * port, so we don't have valid entries that might somehow get
+ * used (early in next use of this port, or through some bug)
+ */
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ port * dd->ipath_rcvtidcnt *
+ sizeof(*tidbase));
+ for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+ ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvegrbase +
+ port * dd->ipath_rcvegrcnt *
+ sizeof(*tidbase));
+
+ for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+ ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+ dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
+ dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+ dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+ /*
+ * work around chip errata bug 7358, by marking invalid tids
+ * as having max length
+ */
+ dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+ INFINIPATH_RT_BUFSIZE_SHIFT;
+}
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+ u32 __iomem *piobuf;
+ u32 pioincr, val32;
+ int i;
+
+ /*
+ * one cache line; long IB headers will spill over into received
+ * buffer
+ */
+ dd->ipath_rcvhdrentsize = 16;
+ dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+ /*
+ * For HT, we allocate a somewhat overly large eager buffer,
+ * such that we can guarantee that we can receive the largest
+ * packet that we can send out. To truly support a 4KB MTU,
+ * we need to bump this to a large value. To date, other than
+ * testing, we have never encountered an HCA that can really
+ * send 4KB MTU packets, so we do not handle that (we'll get
+ * errors interrupts if we ever see one).
+ */
+ dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+
+ /*
+ * the min() check here is currently a nop, but it may not
+ * always be, depending on just how we do ipath_rcvegrbufsize
+ */
+ dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+ dd->ipath_rcvegrbufsize);
+ dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+ ipath_ht_tidtemplate(dd);
+
+ /*
+ * zero all the TID entries at startup. We do this for sanity,
+ * in case of a previous driver crash of some kind, and also
+ * because the chip powers up with these memories in an unknown
+ * state. Use portcnt, not cfgports, since this is for the
+ * full chip, not for current (possibly different) configuration
+ * value.
+ * Chip Errata bug 6447
+ */
+ for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+ ipath_ht_clear_tids(dd, val32);
+
+ /*
+ * write the pbc of each buffer, to be sure it's initialized, then
+ * cancel all the buffers, and also abort any packets that might
+ * have been in flight for some reason (the latter is for driver
+ * unload/reload, but isn't a bad idea at first init). PIO send
+ * isn't enabled at this point, so there is no danger of sending
+ * these out on the wire.
+ * Chip Errata bug 6610
+ */
+ piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+ dd->ipath_piobufbase);
+ pioincr = dd->ipath_palign / sizeof(*piobuf);
+ for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+ /*
+ * reasonable word count, just to init pbc
+ */
+ writel(16, piobuf);
+ piobuf += pioincr;
+ }
+
+ ipath_get_eeprom_info(dd);
+ if (dd->ipath_boardrev == 5) {
+ /*
+ * Later production QHT7040 has same changes as QHT7140, so
+ * can use GPIO interrupts. They have serial #'s starting
+ * with 128, rather than 112.
+ */
+ if (dd->ipath_serial[0] == '1' &&
+ dd->ipath_serial[1] == '2' &&
+ dd->ipath_serial[2] == '8')
+ dd->ipath_flags |= IPATH_GPIO_INTR;
+ else {
+ ipath_dev_err(dd, "Unsupported InfiniPath board "
+ "(serial number %.16s)!\n",
+ dd->ipath_serial);
+ return 1;
+ }
+ }
+
+ if (dd->ipath_minrev >= 4) {
+ /* Rev4+ reports extra errors via internal GPIO pins */
+ dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+ dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+
+ return 0;
+}
+
+
+/**
+ * ipath_init_ht_get_base_info - set chip-specific flags for user code
+ * @dd: the infinipath device
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+ struct ipath_base_info *kinfo = kbase;
+
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+ IPATH_RUNTIME_PIO_REGSWAPPED;
+
+ if (pd->port_dd->ipath_minrev < 4)
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
+
+ return 0;
+}
+
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+ free_irq(dd->ipath_irq, dd);
+ ht_destroy_irq(dd->ipath_irq);
+ dd->ipath_irq = 0;
+ dd->ipath_intconfig = 0;
+}
+
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+ return (struct ipath_message_header *)
+ &rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+ dd->ipath_portcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+ dd->ipath_p0_rcvegrcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+ struct infinipath_counters *cntrs)
+{
+ cntrs->LBIntCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+ cntrs->LBFlowStallCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+ cntrs->TxSDmaDescCnt = 0;
+ cntrs->TxUnsupVLErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+ cntrs->TxDataPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+ cntrs->TxFlowPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+ cntrs->TxDwordCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+ cntrs->TxLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+ cntrs->TxMaxMinLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+ cntrs->TxUnderrunCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+ cntrs->TxFlowStallCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+ cntrs->TxDroppedPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+ cntrs->RxDroppedPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+ cntrs->RxDataPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+ cntrs->RxFlowPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+ cntrs->RxDwordCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+ cntrs->RxLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+ cntrs->RxMaxMinLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+ cntrs->RxICRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+ cntrs->RxVCRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+ cntrs->RxFlowCtrlErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+ cntrs->RxBadFormatCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+ cntrs->RxLinkProblemCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+ cntrs->RxEBPCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+ cntrs->RxLPCRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+ cntrs->RxBufOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+ cntrs->RxTIDFullErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+ cntrs->RxTIDValidErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+ cntrs->RxPKeyMismatchCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+ cntrs->RxP0HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+ cntrs->RxP1HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+ cntrs->RxP2HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+ cntrs->RxP3HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+ cntrs->RxP4HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+ cntrs->RxP5HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+ cntrs->RxP6HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+ cntrs->RxP7HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+ cntrs->RxP8HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+ cntrs->RxP9HdrEgrOvflCnt = 0;
+ cntrs->RxP10HdrEgrOvflCnt = 0;
+ cntrs->RxP11HdrEgrOvflCnt = 0;
+ cntrs->RxP12HdrEgrOvflCnt = 0;
+ cntrs->RxP13HdrEgrOvflCnt = 0;
+ cntrs->RxP14HdrEgrOvflCnt = 0;
+ cntrs->RxP15HdrEgrOvflCnt = 0;
+ cntrs->RxP16HdrEgrOvflCnt = 0;
+ cntrs->IBStatusChangeCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+ cntrs->IBLinkErrRecoveryCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+ cntrs->IBLinkDownedCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+ cntrs->IBSymbolErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+ cntrs->RxVL15DroppedPktCnt = 0;
+ cntrs->RxOtherLocalPhyErrCnt = 0;
+ cntrs->PcieRetryBufDiagQwordCnt = 0;
+ cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+ cntrs->LocalLinkIntegrityErrCnt =
+ (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+ dd->ipath_lli_errs : dd->ipath_lli_errors;
+ cntrs->RxVlErrCnt = 0;
+ cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+ return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC). Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining. To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+ u64 val, prev_val;
+
+ prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ val = prev_val | INFINIPATH_XGXS_RESET;
+ prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+}
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+ int ret;
+
+ switch (which) {
+ case IPATH_IB_CFG_LWID:
+ ret = dd->ipath_link_width_active;
+ break;
+ case IPATH_IB_CFG_SPD:
+ ret = dd->ipath_link_speed_active;
+ break;
+ case IPATH_IB_CFG_LWID_ENB:
+ ret = dd->ipath_link_width_enabled;
+ break;
+ case IPATH_IB_CFG_SPD_ENB:
+ ret = dd->ipath_link_speed_enabled;
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+ int ret = 0;
+
+ if (which == IPATH_IB_CFG_LWID_ENB)
+ dd->ipath_link_width_enabled = val;
+ else if (which == IPATH_IB_CFG_SPD_ENB)
+ dd->ipath_link_speed_enabled = val;
+ else
+ ret = -ENOTSUPP;
+ return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+ ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+ ipath_ib_linktrstate(dd, ibcs));
+ return 0;
+}
+
+
+/**
+ * ipath_init_iba6110_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
+{
+ dd->ipath_f_intrsetup = ipath_ht_intconfig;
+ dd->ipath_f_bus = ipath_setup_ht_config;
+ dd->ipath_f_reset = ipath_setup_ht_reset;
+ dd->ipath_f_get_boardname = ipath_ht_boardname;
+ dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+ dd->ipath_f_early_init = ipath_ht_early_init;
+ dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+ dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+ dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+ dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+ dd->ipath_f_put_tid = ipath_ht_put_tid;
+ dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+ dd->ipath_f_setextled = ipath_setup_ht_setextled;
+ dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+ dd->ipath_f_free_irq = ipath_ht_free_irq;
+ dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+ dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+ dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+ dd->ipath_f_config_ports = ipath_ht_config_ports;
+ dd->ipath_f_read_counters = ipath_ht_read_counters;
+ dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+ dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+ dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+ dd->ipath_f_config_jint = ipath_ht_config_jint;
+ dd->ipath_f_ib_updown = ipath_ht_ib_updown;
+
+ /*
+ * initialize chip-specific variables
+ */
+ ipath_init_ht_variables(dd);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c
new file mode 100644
index 000000000000..be2a60e142b0
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_init_chip.c
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+/*
+ * min buffers we want to have per port, after driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 7
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.) Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for driver (verbs and layered drivers.)
+ * Initialized based on number of PIO buffers if not set via module interface.
+ * The problem with this is that it's global, but we'll use different
+ * numbers for different chip types.
+ */
+static ushort ipath_kpiobufs;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
+ &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like verbs
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+ unsigned e, egrcnt;
+ struct ipath_skbinfo *skbinfo;
+ int ret;
+
+ egrcnt = dd->ipath_p0_rcvegrcnt;
+
+ skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+ if (skbinfo == NULL) {
+ ipath_dev_err(dd, "allocation error for eager TID "
+ "skb array\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+ for (e = 0; e < egrcnt; e++) {
+ /*
+ * This is a bit tricky in that we allocate extra
+ * space for 2 bytes of the 14 byte ethernet header.
+ * These two bytes are passed in the ipath header so
+ * the rest of the data is word aligned. We allocate
+ * 4 bytes so that the data buffer stays word aligned.
+ * See ipath_kreceive() for more details.
+ */
+ skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+ if (!skbinfo[e].skb) {
+ ipath_dev_err(dd, "SKB allocation error for "
+ "eager TID %u\n", e);
+ while (e != 0)
+ dev_kfree_skb(skbinfo[--e].skb);
+ vfree(skbinfo);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ }
+ /*
+ * After loop above, so we can test non-NULL to see if ready
+ * to use at receive, etc.
+ */
+ dd->ipath_port0_skbinfo = skbinfo;
+
+ for (e = 0; e < egrcnt; e++) {
+ dd->ipath_port0_skbinfo[e].phys =
+ ipath_map_single(dd->pcidev,
+ dd->ipath_port0_skbinfo[e].skb->data,
+ dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+ dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+ ((char __iomem *) dd->ipath_kregbase +
+ dd->ipath_rcvegrbase),
+ RCVHQ_RCV_TYPE_EAGER,
+ dd->ipath_port0_skbinfo[e].phys);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+ u64 val, ibc;
+ int ret = 0;
+
+ /* hold IBC in reset */
+ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+
+ /*
+ * set initial max size pkt IBC will send, including ICRC; it's the
+ * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
+ */
+ val = (dd->ipath_ibmaxlen >> 2) + 1;
+ ibc = val << dd->ibcc_mpl_shift;
+
+ /* flowcontrolwatermark is in units of KBytes */
+ ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+ /*
+ * How often flowctrl sent. More or less in usecs; balance against
+ * watermark value, so that in theory senders always get a flow
+ * control update in time to not let the IB link go idle.
+ */
+ ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+ /* max error tolerance */
+ ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+ /* use "real" buffer space for */
+ ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+ /* IB credit flow control. */
+ ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+ /* initially come up waiting for TS1, without sending anything. */
+ dd->ipath_ibcctrl = ibc;
+ /*
+ * Want to start out with both LINKCMD and LINKINITCMD in NOP
+ * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that
+ * to stay a NOP. Flag that we are disabled, for the (unlikely)
+ * case that some recovery path is trying to bring the link up
+ * before we are ready.
+ */
+ ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+ INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+ dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+ ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+ (unsigned long long) ibc);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+ // be sure chip saw it
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ ret = dd->ipath_f_bringup_serdes(dd);
+
+ if (ret)
+ dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+ "not usable\n");
+ else {
+ /* enable IBC */
+ dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+ }
+
+ return ret;
+}
+
+static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
+{
+ struct ipath_portdata *pd = NULL;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (pd) {
+ pd->port_dd = dd;
+ pd->port_cnt = 1;
+ /* The port 0 pkey table is used by the layer interface. */
+ pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
+ pd->port_seq_cnt = 1;
+ }
+ return pd;
+}
+
+static int init_chip_first(struct ipath_devdata *dd)
+{
+ struct ipath_portdata *pd;
+ int ret = 0;
+ u64 val;
+
+ spin_lock_init(&dd->ipath_kernel_tid_lock);
+ spin_lock_init(&dd->ipath_user_tid_lock);
+ spin_lock_init(&dd->ipath_sendctrl_lock);
+ spin_lock_init(&dd->ipath_uctxt_lock);
+ spin_lock_init(&dd->ipath_sdma_lock);
+ spin_lock_init(&dd->ipath_gpio_lock);
+ spin_lock_init(&dd->ipath_eep_st_lock);
+ spin_lock_init(&dd->ipath_sdepb_lock);
+ mutex_init(&dd->ipath_eep_lock);
+
+ /*
+ * skip cfgports stuff because we are not allocating memory,
+ * and we don't want problems if the portcnt changed due to
+ * cfgports. We do still check and report a difference, if
+ * not same (should be impossible).
+ */
+ dd->ipath_f_config_ports(dd, ipath_cfgports);
+ if (!ipath_cfgports)
+ dd->ipath_cfgports = dd->ipath_portcnt;
+ else if (ipath_cfgports <= dd->ipath_portcnt) {
+ dd->ipath_cfgports = ipath_cfgports;
+ ipath_dbg("Configured to use %u ports out of %u in chip\n",
+ dd->ipath_cfgports, ipath_read_kreg32(dd,
+ dd->ipath_kregs->kr_portcnt));
+ } else {
+ dd->ipath_cfgports = dd->ipath_portcnt;
+ ipath_dbg("Tried to configured to use %u ports; chip "
+ "only supports %u\n", ipath_cfgports,
+ ipath_read_kreg32(dd,
+ dd->ipath_kregs->kr_portcnt));
+ }
+ /*
+ * Allocate full portcnt array, rather than just cfgports, because
+ * cleanup iterates across all possible ports.
+ */
+ dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
+ GFP_KERNEL);
+
+ if (!dd->ipath_pd) {
+ ipath_dev_err(dd, "Unable to allocate portdata array, "
+ "failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ pd = create_portdata0(dd);
+ if (!pd) {
+ ipath_dev_err(dd, "Unable to allocate portdata for port "
+ "0, failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ dd->ipath_pd[0] = pd;
+
+ dd->ipath_rcvtidcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+ dd->ipath_rcvtidbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+ dd->ipath_rcvegrcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+ dd->ipath_rcvegrbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+ dd->ipath_palign =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+ dd->ipath_piobufbase =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
+ dd->ipath_piosize2k = val & ~0U;
+ dd->ipath_piosize4k = val >> 32;
+ if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
+ ipath_mtu4096 = 0; /* 4KB not supported by this chip */
+ dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+ dd->ipath_piobcnt2k = val & ~0U;
+ dd->ipath_piobcnt4k = val >> 32;
+ dd->ipath_pio2kbase =
+ (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+ (dd->ipath_piobufbase & 0xffffffff));
+ if (dd->ipath_piobcnt4k) {
+ dd->ipath_pio4kbase = (u32 __iomem *)
+ (((char __iomem *) dd->ipath_kregbase) +
+ (dd->ipath_piobufbase >> 32));
+ /*
+ * 4K buffers take 2 pages; we use roundup just to be
+ * paranoid; we calculate it once here, rather than on
+ * ever buf allocate
+ */
+ dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+ dd->ipath_palign);
+ ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+ "(%x aligned)\n",
+ dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+ dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+ dd->ipath_piosize4k, dd->ipath_pio4kbase,
+ dd->ipath_4kalign);
+ }
+ else ipath_dbg("%u 2k piobufs @ %p\n",
+ dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+done:
+ return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_chip_reset(struct ipath_devdata *dd)
+{
+ u32 rtmp;
+ int i;
+ unsigned long flags;
+
+ /*
+ * ensure chip does no sends or receives, tail updates, or
+ * pioavail updates while we re-initialize
+ */
+ dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
+ for (i = 0; i < dd->ipath_portcnt; i++) {
+ clear_bit(dd->ipath_r_portenable_shift + i,
+ &dd->ipath_rcvctrl);
+ clear_bit(dd->ipath_r_intravail_shift + i,
+ &dd->ipath_rcvctrl);
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl = 0U; /* no sdma, etc */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+ if (rtmp != dd->ipath_rcvtidcnt)
+ dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvtidcnt, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+ if (rtmp != dd->ipath_rcvtidbase)
+ dev_info(&dd->pcidev->dev, "tidbase was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvtidbase, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+ if (rtmp != dd->ipath_rcvegrcnt)
+ dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvegrcnt, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+ if (rtmp != dd->ipath_rcvegrbase)
+ dev_info(&dd->pcidev->dev, "egrbase was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvegrbase, rtmp);
+
+ return 0;
+}
+
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+ int ret;
+
+ dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+ GFP_KERNEL);
+ if (!dd->ipath_pioavailregs_dma) {
+ ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+ "in memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ /*
+ * we really want L2 cache aligned, but for current CPUs of
+ * interest, they are the same.
+ */
+ dd->ipath_statusp = (u64 *)
+ ((char *)dd->ipath_pioavailregs_dma +
+ ((2 * L1_CACHE_BYTES +
+ dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+ /* copy the current value now that it's really allocated */
+ *dd->ipath_statusp = dd->_ipath_status;
+ /*
+ * setup buffer to hold freeze msg, accessible to apps,
+ * following statusp
+ */
+ dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+ /* and its length */
+ dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries. It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we at 8k per port, now.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+ struct page **pages;
+ dma_addr_t *addrs;
+
+ pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+ sizeof(struct page *));
+ if (!pages) {
+ ipath_dev_err(dd, "failed to allocate shadow page * "
+ "array, no expected sends!\n");
+ dd->ipath_pageshadow = NULL;
+ return;
+ }
+
+ addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+ sizeof(dma_addr_t));
+ if (!addrs) {
+ ipath_dev_err(dd, "failed to allocate shadow dma handle "
+ "array, no expected sends!\n");
+ vfree(pages);
+ dd->ipath_pageshadow = NULL;
+ return;
+ }
+
+ dd->ipath_pageshadow = pages;
+ dd->ipath_physshadow = addrs;
+}
+
+static void enable_chip(struct ipath_devdata *dd, int reinit)
+{
+ u32 val;
+ u64 rcvmask;
+ unsigned long flags;
+ int i;
+
+ if (!reinit)
+ init_waitqueue_head(&ipath_state_wait);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /* Enable PIO send, and update of PIOavail regs to memory. */
+ dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+ INFINIPATH_S_PIOBUFAVAILUPD;
+
+ /*
+ * Set the PIO avail update threshold to host memory
+ * on chips that support it.
+ */
+ if (dd->ipath_pioupd_thresh)
+ dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+ << INFINIPATH_S_UPDTHRESH_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /*
+ * Enable kernel ports' receive and receive interrupt.
+ * Other ports done as user opens and inits them.
+ */
+ rcvmask = 1ULL;
+ dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
+ (rcvmask << dd->ipath_r_intravail_shift);
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
+ dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ /*
+ * now ready for use. this should be cleared whenever we
+ * detect a reset, or initiate one.
+ */
+ dd->ipath_flags |= IPATH_INITTED;
+
+ /*
+ * Init our shadow copies of head from tail values,
+ * and write head values to match.
+ */
+ val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+ ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+
+ /* Initialize so we interrupt on next packet received */
+ ipath_write_ureg(dd, ur_rcvhdrhead,
+ dd->ipath_rhdrhead_intr_off |
+ dd->ipath_pd[0]->port_head, 0);
+
+ /*
+ * by now pioavail updates to memory should have occurred, so
+ * copy them into our working/shadow registers; this is in
+ * case something went wrong with abort, but mostly to get the
+ * initial values of the generation bit correct.
+ */
+ for (i = 0; i < dd->ipath_pioavregs; i++) {
+ __le64 pioavail;
+
+ /*
+ * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+ */
+ if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+ pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
+ else
+ pioavail = dd->ipath_pioavailregs_dma[i];
+ /*
+ * don't need to worry about ipath_pioavailkernel here
+ * because we will call ipath_chg_pioavailkernel() later
+ * in initialization, to busy out buffers as needed
+ */
+ dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
+ }
+ /* can get counters, stats, etc. */
+ dd->ipath_flags |= IPATH_PRESENT;
+}
+
+static int init_housekeeping(struct ipath_devdata *dd, int reinit)
+{
+ char boardn[40];
+ int ret = 0;
+
+ /*
+ * have to clear shadow copies of registers at init that are
+ * not otherwise set here, or all kinds of bizarre things
+ * happen with driver on chip reset
+ */
+ dd->ipath_rcvhdrsize = 0;
+
+ /*
+ * Don't clear ipath_flags as 8bit mode was set before
+ * entering this func. However, we do set the linkstate to
+ * unknown, so we can watch for a transition.
+ * PRESENT is set because we want register reads to work,
+ * and the kernel infrastructure saw it in config space;
+ * We clear it if we have failures.
+ */
+ dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
+ dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+ IPATH_LINKDOWN | IPATH_LINKINIT);
+
+ ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+ dd->ipath_revision =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+ /*
+ * set up fundamental info we need to use the chip; we assume
+ * if the revision reg and these regs are OK, we don't need to
+ * special case the rest
+ */
+ dd->ipath_sregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+ dd->ipath_cregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+ dd->ipath_uregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+ ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+ "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+ dd->ipath_uregbase, dd->ipath_cregbase);
+ if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+ || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+ || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+ || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+ ipath_dev_err(dd, "Register read failures from chip, "
+ "giving up initialization\n");
+ dd->ipath_flags &= ~IPATH_PRESENT;
+ ret = -ENODEV;
+ goto done;
+ }
+
+
+ /* clear diagctrl register, in case diags were running and crashed */
+ ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
+ /* clear the initial reset flag, in case first driver load */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ INFINIPATH_E_RESET);
+
+ ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
+ (unsigned long long) dd->ipath_revision,
+ dd->ipath_pcirev);
+
+ if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+ ipath_dev_err(dd, "Driver only handles version %d, "
+ "chip swversion is %d (%llx), failng\n",
+ IPATH_CHIP_SWVERSION,
+ (int)(dd->ipath_revision >>
+ INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK,
+ (unsigned long long) dd->ipath_revision);
+ ret = -ENOSYS;
+ goto done;
+ }
+ dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+ INFINIPATH_R_CHIPREVMAJOR_MASK);
+ dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+ INFINIPATH_R_CHIPREVMINOR_MASK);
+ dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_BOARDID_SHIFT) &
+ INFINIPATH_R_BOARDID_MASK);
+
+ ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+ snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+ "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+ "SW Compat %u\n",
+ IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+ (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+ INFINIPATH_R_ARCH_MASK,
+ dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+ (unsigned)(dd->ipath_revision >>
+ INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK);
+
+ ipath_dbg("%s", dd->ipath_boardversion);
+
+ if (ret)
+ goto done;
+
+ if (reinit)
+ ret = init_chip_reset(dd);
+ else
+ ret = init_chip_first(dd);
+
+done:
+ return ret;
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+
+ if (!dd)
+ return; /* being torn down */
+
+ /*
+ * If we don't have any interrupts, let the user know and
+ * don't bother checking again.
+ */
+ if (dd->ipath_int_counter == 0) {
+ if (!dd->ipath_f_intr_fallback(dd))
+ dev_err(&dd->pcidev->dev, "No interrupts detected, "
+ "not usable.\n");
+ else /* re-arm the timer to see if fallback works */
+ mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
+ } else
+ ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
+ dd->ipath_int_counter);
+}
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip. This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0). We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+ int ret = 0;
+ u32 kpiobufs, defkbufs;
+ u32 piobufs, uports;
+ u64 val;
+ struct ipath_portdata *pd;
+ gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+ ret = init_housekeeping(dd, reinit);
+ if (ret)
+ goto done;
+
+ /*
+ * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+ * but then it no longer nicely fits power of two, and since
+ * we now use routines that backend onto __get_free_pages, the
+ * rest would be wasted.
+ */
+ dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+ dd->ipath_rcvhdrcnt);
+
+ /*
+ * Set up the shadow copies of the piobufavail registers,
+ * which we compare against the chip registers for now, and
+ * the in memory DMA'ed copies of the registers. This has to
+ * be done early, before we calculate lastport, etc.
+ */
+ piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ /*
+ * calc number of pioavail registers, and save it; we have 2
+ * bits per buffer.
+ */
+ dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
+ / (sizeof(u64) * BITS_PER_BYTE / 2);
+ uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
+ if (piobufs > 144)
+ defkbufs = 32 + dd->ipath_pioreserved;
+ else
+ defkbufs = 16 + dd->ipath_pioreserved;
+
+ if (ipath_kpiobufs && (ipath_kpiobufs +
+ (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
+ int i = (int) piobufs -
+ (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
+ if (i < 1)
+ i = 1;
+ dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
+ "%d for kernel leaves too few for %d user ports "
+ "(%d each); using %u\n", ipath_kpiobufs,
+ piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
+ /*
+ * shouldn't change ipath_kpiobufs, because could be
+ * different for different devices...
+ */
+ kpiobufs = i;
+ } else if (ipath_kpiobufs)
+ kpiobufs = ipath_kpiobufs;
+ else
+ kpiobufs = defkbufs;
+ dd->ipath_lastport_piobuf = piobufs - kpiobufs;
+ dd->ipath_pbufsport =
+ uports ? dd->ipath_lastport_piobuf / uports : 0;
+ /* if not an even divisor, some user ports get extra buffers */
+ dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+ (dd->ipath_pbufsport * uports);
+ if (dd->ipath_ports_extrabuf)
+ ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
+ "ports <= %u\n", dd->ipath_pbufsport,
+ dd->ipath_ports_extrabuf);
+ dd->ipath_lastpioindex = 0;
+ dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
+ /* ipath_pioavailshadow initialized earlier */
+ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
+ "each for %u user ports\n", kpiobufs,
+ piobufs, dd->ipath_pbufsport, uports);
+ ret = dd->ipath_f_early_init(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Early initialization failure\n");
+ goto done;
+ }
+
+ /*
+ * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+ * done after early_init.
+ */
+ dd->ipath_hdrqlast =
+ dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+ dd->ipath_rcvhdrentsize);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+
+ if (!reinit) {
+ ret = init_pioavailregs(dd);
+ init_shadow_tids(dd);
+ if (ret)
+ goto done;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+ dd->ipath_pioavailregs_phys);
+
+ /*
+ * this is to detect s/w errors, which the h/w works around by
+ * ignoring the low 6 bits of address, if it wasn't aligned.
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+ if (val != dd->ipath_pioavailregs_phys) {
+ ipath_dev_err(dd, "Catastrophic software error, "
+ "SendPIOAvailAddr written as %lx, "
+ "read back as %llx\n",
+ (unsigned long) dd->ipath_pioavailregs_phys,
+ (unsigned long long) val);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+ /*
+ * make sure we are not in freeze, and PIO send enabled, so
+ * writes to pbc happen
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+ /*
+ * before error clears, since we expect serdes pll errors during
+ * this, the first time after reset
+ */
+ if (bringup_link(dd)) {
+ dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
+ ret = -ENETDOWN;
+ goto done;
+ }
+
+ /*
+ * clear any "expected" hwerrs from reset and/or initialization
+ * clear any that aren't enabled (at least this once), and then
+ * set the enable mask
+ */
+ dd->ipath_f_init_hwerrors(dd);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+
+ /* clear all */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+ /* enable errors that are masked, at least this first time. */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ ~dd->ipath_maskederrs);
+ dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
+ dd->ipath_errormask =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+ /* clear any interrupts up to this point (ints still not enabled) */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+ dd->ipath_f_tidtemplate(dd);
+
+ /*
+ * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing
+ * re-init, the simplest way to handle this is to free
+ * existing, and re-allocate.
+ * Need to re-create rest of port 0 portdata as well.
+ */
+ pd = dd->ipath_pd[0];
+ if (reinit) {
+ struct ipath_portdata *npd;
+
+ /*
+ * Alloc and init new ipath_portdata for port0,
+ * Then free old pd. Could lead to fragmentation, but also
+ * makes later support for hot-swap easier.
+ */
+ npd = create_portdata0(dd);
+ if (npd) {
+ ipath_free_pddata(dd, pd);
+ dd->ipath_pd[0] = npd;
+ pd = npd;
+ } else {
+ ipath_dev_err(dd, "Unable to allocate portdata"
+ " for port 0, failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ }
+ ret = ipath_create_rcvhdrq(dd, pd);
+ if (!ret)
+ ret = create_port0_egr(dd);
+ if (ret) {
+ ipath_dev_err(dd, "failed to allocate kernel port's "
+ "rcvhdrq and/or egr bufs\n");
+ goto done;
+ }
+ else
+ enable_chip(dd, reinit);
+
+ /* after enable_chip, so pioavailshadow setup */
+ ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+ /*
+ * Cancel any possible active sends from early driver load.
+ * Follows early_init because some chips have to initialize
+ * PIO buffers in early_init to avoid false parity errors.
+ * After enable and ipath_chg_pioavailkernel so we can safely
+ * enable pioavail updates and PIOENABLE; packets are now
+ * ready to go out.
+ */
+ ipath_cancel_sends(dd, 1);
+
+ if (!reinit) {
+ /*
+ * Used when we close a port, for DMA already in flight
+ * at close.
+ */
+ dd->ipath_dummy_hdrq = dma_alloc_coherent(
+ &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
+ &dd->ipath_dummy_hdrq_phys,
+ gfp_flags);
+ if (!dd->ipath_dummy_hdrq) {
+ dev_info(&dd->pcidev->dev,
+ "Couldn't allocate 0x%lx bytes for dummy hdrq\n",
+ dd->ipath_pd[0]->port_rcvhdrq_size);
+ /* fallback to just 0'ing */
+ dd->ipath_dummy_hdrq_phys = 0UL;
+ }
+ }
+
+ /*
+ * cause retrigger of pending interrupts ignored during init,
+ * even if we had errors
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+ if (!dd->ipath_stats_timer_active) {
+ /*
+ * first init, or after an admin disable/enable
+ * set up stats retrieval timer, even if we had errors
+ * in last portion of setup
+ */
+ init_timer(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer.function = ipath_get_faststats;
+ dd->ipath_stats_timer.data = (unsigned long) dd;
+ /* every 5 seconds; */
+ dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+ /* takes ~16 seconds to overflow at full IB 4x bandwdith */
+ add_timer(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer_active = 1;
+ }
+
+ /* Set up SendDMA if chip supports it */
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ ret = setup_sdma(dd);
+
+ /* Set up HoL state */
+ init_timer(&dd->ipath_hol_timer);
+ dd->ipath_hol_timer.function = ipath_hol_event;
+ dd->ipath_hol_timer.data = (unsigned long)dd;
+ dd->ipath_hol_state = IPATH_HOL_UP;
+
+done:
+ if (!ret) {
+ *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+ if (!dd->ipath_f_intrsetup(dd)) {
+ /* now we can enable all interrupts from the chip */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+ -1LL);
+ /* force re-interrupt of any pending interrupts. */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+ 0ULL);
+ /* chip is usable; mark it as initialized */
+ *dd->ipath_statusp |= IPATH_STATUS_INITTED;
+
+ /*
+ * setup to verify we get an interrupt, and fallback
+ * to an alternate if necessary and possible
+ */
+ if (!reinit) {
+ init_timer(&dd->ipath_intrchk_timer);
+ dd->ipath_intrchk_timer.function =
+ verify_interrupt;
+ dd->ipath_intrchk_timer.data =
+ (unsigned long) dd;
+ }
+ dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
+ add_timer(&dd->ipath_intrchk_timer);
+ } else
+ ipath_dev_err(dd, "No interrupts enabled, couldn't "
+ "setup interrupt address\n");
+
+ if (dd->ipath_cfgports > ipath_stats.sps_nports)
+ /*
+ * sps_nports is a global, so, we set it to
+ * the highest number of ports of any of the
+ * chips we find; we never decrement it, at
+ * least for now. Since this might have changed
+ * over disable/enable or prior to reset, always
+ * do the check and potentially adjust.
+ */
+ ipath_stats.sps_nports = dd->ipath_cfgports;
+ } else
+ ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+ /* if ret is non-zero, we probably should do some cleanup
+ here... */
+ return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ unsigned short val;
+ int ret;
+
+ ret = ipath_parse_ushort(str, &val);
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ if (ret < 0)
+ goto bail;
+
+ if (val == 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+ if (dd->ipath_kregbase)
+ continue;
+ if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+ (dd->ipath_cfgports *
+ IPATH_MIN_USER_PORT_BUFCNT)))
+ {
+ ipath_dev_err(
+ dd,
+ "Allocating %d PIO bufs for kernel leaves "
+ "too few for %d user ports (%d each)\n",
+ val, dd->ipath_cfgports - 1,
+ IPATH_MIN_USER_PORT_BUFCNT);
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd->ipath_lastport_piobuf =
+ dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+ }
+
+ ipath_kpiobufs = val;
+ ret = 0;
+bail:
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c
new file mode 100644
index 000000000000..01ba792791a0
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_intr.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+ u32 piobcnt;
+ unsigned long sbuf[4];
+ /*
+ * it's possible that sendbuffererror could have bits set; might
+ * have already done this as a result of hardware error handling
+ */
+ piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ /* read these before writing errorclear */
+ sbuf[0] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror);
+ sbuf[1] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+ if (piobcnt > 128)
+ sbuf[2] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+ if (piobcnt > 192)
+ sbuf[3] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+ else
+ sbuf[3] = 0;
+
+ if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+ int i;
+ if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
+ time_after(dd->ipath_lastcancel, jiffies)) {
+ __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+ "SendbufErrs %lx %lx", sbuf[0],
+ sbuf[1]);
+ if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+ printk(" %lx %lx ", sbuf[2], sbuf[3]);
+ printk("\n");
+ }
+
+ for (i = 0; i < piobcnt; i++)
+ if (test_bit(i, sbuf))
+ ipath_disarm_piobufs(dd, i, 1);
+ /* ignore armlaunch errs for a bit */
+ dd->ipath_lastcancel = jiffies+3;
+ }
+}
+
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+ (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+ INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+ INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+ INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+ INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+ INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+ (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+ INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+ INFINIPATH_E_INVALIDADDR)
+
+/*
+ * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
+ * errors not related to freeze and cancelling buffers. Can't ignore
+ * armlaunch because could get more while still cleaning up, and need
+ * to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+ (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
+ INFINIPATH_E_SPKTLEN)
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received. This doesn't cover things
+ * like EBP or VCRC that can be the result of a sending having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+ (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+ INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+ INFINIPATH_E_RUNEXPCHAR)
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ u64 ignore_this_time = 0;
+
+ ipath_disarm_senderrbufs(dd);
+ if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ipath_dbg("Ignoring packet errors %llx, because link not "
+ "ACTIVE\n", (unsigned long long) errs);
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ return ignore_this_time;
+}
+
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+ { \
+ .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \
+ .msg = "TXE " #a " Memory Parity" \
+ }
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+ { \
+ .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \
+ .msg = "RXE " #a " Memory Parity" \
+ }
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+ INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+ INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+ strlcat(msg, "[", msgl);
+ strlcat(msg, hwmsg, msgl);
+ strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+ const struct ipath_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs,
+ char *msg, size_t msgl)
+{
+ int i;
+ const int glen =
+ ARRAY_SIZE(ipath_generic_hwerror_msgs);
+
+ for (i=0; i<glen; i++) {
+ if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+ ipath_format_hwmsg(msg, msgl,
+ ipath_generic_hwerror_msgs[i].msg);
+ }
+ }
+
+ for (i=0; i<nhwerrmsgs; i++) {
+ if (hwerrs & hwerrmsgs[i].mask) {
+ ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+ }
+ }
+}
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ char *ret;
+ u32 state;
+
+ state = ipath_ib_state(dd, ibcs);
+ if (state == dd->ib_init)
+ ret = "Init";
+ else if (state == dd->ib_arm)
+ ret = "Arm";
+ else if (state == dd->ib_active)
+ ret = "Active";
+ else
+ ret = "Down";
+ return ret;
+}
+
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+ struct ib_event event;
+
+ event.device = &dd->verbs_dev->ibdev;
+ event.element.port_num = 1;
+ event.event = ev;
+ ib_dispatch_event(&event);
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+ ipath_err_t errs)
+{
+ u32 ltstate, lstate, ibstate, lastlstate;
+ u32 init = dd->ib_init;
+ u32 arm = dd->ib_arm;
+ u32 active = dd->ib_active;
+ const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+
+ lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
+ ibstate = ipath_ib_state(dd, ibcs);
+ /* linkstate at last interrupt */
+ lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+ ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
+
+ /*
+ * Since going into a recovery state causes the link state to go
+ * down and since recovery is transitory, it is better if we "miss"
+ * ever seeing the link training state go into recovery (i.e.,
+ * ignore this transition for link state special handling purposes)
+ * without even updating ipath_lastibcstat.
+ */
+ if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
+ (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
+ (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
+ goto done;
+
+ /*
+ * if linkstate transitions into INIT from any of the various down
+ * states, or if it transitions from any of the up (INIT or better)
+ * states into any of the down states (except link recovery), then
+ * call the chip-specific code to take appropriate actions.
+ */
+ if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
+ lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
+ /* transitioned to UP */
+ if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
+ /* link came up, so we must no longer be disabled */
+ dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+ ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
+ goto skip_ibchange; /* chip-code handled */
+ }
+ } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
+ (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
+ ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
+ ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+ int handled;
+ handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
+ dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
+ if (handled) {
+ ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
+ goto skip_ibchange; /* chip-code handled */
+ }
+ }
+
+ /*
+ * Significant enough to always print and get into logs, if it was
+ * unexpected. If it was a requested state change, we'll have
+ * already cleared the flags, so we won't print this warning
+ */
+ if ((ibstate != arm && ibstate != active) &&
+ (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
+ dev_info(&dd->pcidev->dev, "Link state changed from %s "
+ "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
+ "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
+ }
+
+ if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+ ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+ u32 lastlts;
+ lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+ /*
+ * Ignore cycling back and forth from Polling.Active to
+ * Polling.Quiet while waiting for the other end of the link
+ * to come up, except to try and decide if we are connected
+ * to a live IB device or not. We will cycle back and
+ * forth between them if no cable is plugged in, the other
+ * device is powered off or disabled, etc.
+ */
+ if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+ lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+ if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
+ (++dd->ipath_ibpollcnt == 40)) {
+ dd->ipath_flags |= IPATH_NOCABLE;
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_NOCABLE;
+ ipath_cdbg(LINKVERB, "Set NOCABLE\n");
+ }
+ ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
+ ipath_ibcstatus_str[ltstate], ibstate);
+ goto skip_ibchange;
+ }
+ }
+
+ dd->ipath_ibpollcnt = 0; /* not poll*, now */
+ ipath_stats.sps_iblink++;
+
+ if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+ u64 linkrecov;
+ linkrecov = ipath_snap_cntr(dd,
+ dd->ipath_cregs->cr_iblinkerrrecovcnt);
+ if (linkrecov != dd->ipath_lastlinkrecov) {
+ ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+ (unsigned long long) ibcs,
+ ib_linkstate(dd, ibcs),
+ ipath_ibcstatus_str[ltstate],
+ (unsigned long long) linkrecov);
+ /* and no more until active again */
+ dd->ipath_lastlinkrecov = 0;
+ ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+ goto skip_ibchange;
+ }
+ }
+
+ if (ibstate == init || ibstate == arm || ibstate == active) {
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+ if (ibstate == init || ibstate == arm) {
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+ }
+ if (ibstate == arm) {
+ dd->ipath_flags |= IPATH_LINKARMED;
+ dd->ipath_flags &= ~(IPATH_LINKUNK |
+ IPATH_LINKINIT | IPATH_LINKDOWN |
+ IPATH_LINKACTIVE | IPATH_NOCABLE);
+ ipath_hol_down(dd);
+ } else if (ibstate == init) {
+ /*
+ * set INIT and DOWN. Down is checked by
+ * most of the other code, but INIT is
+ * useful to know in a few places.
+ */
+ dd->ipath_flags |= IPATH_LINKINIT |
+ IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK |
+ IPATH_LINKARMED | IPATH_LINKACTIVE |
+ IPATH_NOCABLE);
+ ipath_hol_down(dd);
+ } else { /* active */
+ dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+ dd->ipath_cregs->cr_iblinkerrrecovcnt);
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+ dd->ipath_flags |= IPATH_LINKACTIVE;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKDOWN | IPATH_LINKARMED |
+ IPATH_NOCABLE);
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ ipath_restart_sdma(dd);
+ signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
+ /* LED active not handled in chip _f_updown */
+ dd->ipath_f_setextled(dd, lstate, ltstate);
+ ipath_hol_up(dd);
+ }
+
+ /*
+ * print after we've already done the work, so as not to
+ * delay the state changes and notifications, for debugging
+ */
+ if (lstate == lastlstate)
+ ipath_cdbg(LINKVERB, "Unchanged from last: %s "
+ "(%x)\n", ib_linkstate(dd, ibcs), ibstate);
+ else
+ ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
+ dd->ipath_unit, ib_linkstate(dd, ibcs),
+ ipath_ibcstatus_str[ltstate], ibstate);
+ } else { /* down */
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKACTIVE |
+ IPATH_LINKARMED);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ dd->ipath_lli_counter = 0;
+
+ if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
+ ipath_cdbg(VERBOSE, "Unit %u link state down "
+ "(state 0x%x), from %s\n",
+ dd->ipath_unit, lstate,
+ ib_linkstate(dd, dd->ipath_lastibcstat));
+ else
+ ipath_cdbg(LINKVERB, "Unit %u link state changed "
+ "to %s (0x%x) from down (%x)\n",
+ dd->ipath_unit,
+ ipath_ibcstatus_str[ltstate],
+ ibstate, lastlstate);
+ }
+
+skip_ibchange:
+ dd->ipath_lastibcstat = ibcs;
+done:
+ return;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+ unsigned supp_msgs, char *msg, u32 msgsz)
+{
+ /*
+ * Print the message unless it's ibc status change only, which
+ * happens so often we never want to count it.
+ */
+ if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+ int iserr;
+ ipath_err_t mask;
+ iserr = ipath_decode_err(dd, msg, msgsz,
+ dd->ipath_lasterror &
+ ~INFINIPATH_E_IBSTATUSCHANGED);
+
+ mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
+
+ /* if we're in debug, then don't mask SDMADISABLED msgs */
+ if (ipath_debug & __IPATH_DBG)
+ mask &= ~INFINIPATH_E_SDMADISABLED;
+
+ if (dd->ipath_lasterror & ~mask)
+ ipath_dev_err(dd, "Suppressed %u messages for "
+ "fast-repeating errors (%s) (%llx)\n",
+ supp_msgs, msg,
+ (unsigned long long)
+ dd->ipath_lasterror);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal", for some
+ * types of processes (mostly benchmarks) that send
+ * huge numbers of messages, while not processing
+ * them. So only complain about these at debug
+ * level.
+ */
+ if (iserr)
+ ipath_dbg("Suppressed %u messages for %s\n",
+ supp_msgs, msg);
+ else
+ ipath_cdbg(ERRPKT,
+ "Suppressed %u messages for %s\n",
+ supp_msgs, msg);
+ }
+ }
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+ ipath_err_t errs, char *msg,
+ u32 msgsz, int *noprint)
+{
+ unsigned long nc;
+ static unsigned long nextmsg_time;
+ static unsigned nmsgs, supp_msgs;
+
+ /*
+ * Throttle back "fast" messages to no more than 10 per 5 seconds.
+ * This isn't perfect, but it's a reasonable heuristic. If we get
+ * more than 10, give a 6x longer delay.
+ */
+ nc = jiffies;
+ if (nmsgs > 10) {
+ if (time_before(nc, nextmsg_time)) {
+ *noprint = 1;
+ if (!supp_msgs++)
+ nextmsg_time = nc + HZ * 3;
+ }
+ else if (supp_msgs) {
+ handle_supp_msgs(dd, supp_msgs, msg, msgsz);
+ supp_msgs = 0;
+ nmsgs = 0;
+ }
+ }
+ else if (!nmsgs++ || time_after(nc, nextmsg_time))
+ nextmsg_time = nc + HZ / 2;
+
+ return supp_msgs;
+}
+
+static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ unsigned long flags;
+ int expected;
+
+ if (ipath_debug & __IPATH_DBG) {
+ char msg[128];
+ ipath_decode_err(dd, msg, sizeof msg, errs &
+ INFINIPATH_E_SDMAERRS);
+ ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
+ }
+ if (ipath_debug & __IPATH_VERBDBG) {
+ unsigned long tl, hd, status, lengen;
+ tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+ hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+ status = ipath_read_kreg64(dd
+ , dd->ipath_kregs->kr_senddmastatus);
+ lengen = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_senddmalengen);
+ ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
+ "lengen 0x%lx\n", tl, hd, status, lengen);
+ }
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!expected)
+ ipath_cancel_sends(dd, 1);
+}
+
+static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
+{
+ unsigned long flags;
+ int expected;
+
+ if ((istat & INFINIPATH_I_SDMAINT) &&
+ !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ ipath_sdma_intr(dd);
+
+ if (istat & INFINIPATH_I_SDMADISABLED) {
+ expected = test_bit(IPATH_SDMA_ABORTING,
+ &dd->ipath_sdma_status);
+ ipath_dbg("%s SDmaDisabled intr\n",
+ expected ? "expected" : "unexpected");
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!expected)
+ ipath_cancel_sends(dd, 1);
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ }
+}
+
+static int handle_hdrq_full(struct ipath_devdata *dd)
+{
+ int chkerrpkts = 0;
+ u32 hd, tl;
+ u32 i;
+
+ ipath_stats.sps_hdrqfull++;
+ for (i = 0; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ if (i == 0) {
+ /*
+ * For kernel receive queues, we just want to know
+ * if there are packets in the queue that we can
+ * process.
+ */
+ if (pd->port_head != ipath_get_hdrqtail(pd))
+ chkerrpkts |= 1 << i;
+ continue;
+ }
+
+ /* Skip if user context is not open */
+ if (!pd || !pd->port_cnt)
+ continue;
+
+ /* Don't report the same point multiple times. */
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL)
+ tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
+ else
+ tl = ipath_get_rcvhdrtail(pd);
+ if (tl == pd->port_lastrcvhdrqtail)
+ continue;
+
+ hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
+ if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+ pd->port_lastrcvhdrqtail = tl;
+ pd->port_hdrqfull++;
+ /* flush hdrqfull so that poll() sees it */
+ wmb();
+ wake_up_interruptible(&pd->port_wait);
+ }
+ }
+
+ return chkerrpkts;
+}
+
+static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ char msg[128];
+ u64 ignore_this_time = 0;
+ u64 iserr = 0;
+ int chkerrpkts = 0, noprint = 0;
+ unsigned supp_msgs;
+ int log_idx;
+
+ /*
+ * don't report errors that are masked, either at init
+ * (not set in ipath_errormask), or temporarily (set in
+ * ipath_maskederrs)
+ */
+ errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
+
+ supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
+ &noprint);
+
+ /* do these first, they are most important */
+ if (errs & INFINIPATH_E_HARDWARE) {
+ /* reuse same msg buf */
+ dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+ } else {
+ u64 mask;
+ for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
+ mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
+ if (errs & mask)
+ ipath_inc_eeprom_err(dd, log_idx, 1);
+ }
+ }
+
+ if (errs & INFINIPATH_E_SDMAERRS)
+ handle_sdma_errors(dd, errs);
+
+ if (!noprint && (errs & ~dd->ipath_e_bitsextant))
+ ipath_dev_err(dd, "error interrupt with unknown errors "
+ "%llx set\n", (unsigned long long)
+ (errs & ~dd->ipath_e_bitsextant));
+
+ if (errs & E_SUM_ERRS)
+ ignore_this_time = handle_e_sum_errs(dd, errs);
+ else if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ipath_dbg("Ignoring packet errors %llx, because link not "
+ "ACTIVE\n", (unsigned long long) errs);
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ if (supp_msgs == 250000) {
+ int s_iserr;
+ /*
+ * It's not entirely reasonable assuming that the errors set
+ * in the last clear period are all responsible for the
+ * problem, but the alternative is to assume it's the only
+ * ones on this particular interrupt, which also isn't great
+ */
+ dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+
+ dd->ipath_errormask &= ~dd->ipath_maskederrs;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ s_iserr = ipath_decode_err(dd, msg, sizeof msg,
+ dd->ipath_maskederrs);
+
+ if (dd->ipath_maskederrs &
+ ~(INFINIPATH_E_RRCVEGRFULL |
+ INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
+ ipath_dev_err(dd, "Temporarily disabling "
+ "error(s) %llx reporting; too frequent (%s)\n",
+ (unsigned long long) dd->ipath_maskederrs,
+ msg);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal",
+ * for some types of processes (mostly benchmarks)
+ * that send huge numbers of messages, while not
+ * processing them. So only complain about
+ * these at debug level.
+ */
+ if (s_iserr)
+ ipath_dbg("Temporarily disabling reporting "
+ "too frequent queue full errors (%s)\n",
+ msg);
+ else
+ ipath_cdbg(ERRPKT,
+ "Temporarily disabling reporting too"
+ " frequent packet errors (%s)\n",
+ msg);
+ }
+
+ /*
+ * Re-enable the masked errors after around 3 minutes. in
+ * ipath_get_faststats(). If we have a series of fast
+ * repeating but different errors, the interval will keep
+ * stretching out, but that's OK, as that's pretty
+ * catastrophic.
+ */
+ dd->ipath_unmasktime = jiffies + HZ * 180;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+ if (ignore_this_time)
+ errs &= ~ignore_this_time;
+ if (errs & ~dd->ipath_lasterror) {
+ errs &= ~dd->ipath_lasterror;
+ /* never suppress duplicate hwerrors or ibstatuschange */
+ dd->ipath_lasterror |= errs &
+ ~(INFINIPATH_E_HARDWARE |
+ INFINIPATH_E_IBSTATUSCHANGED);
+ }
+
+ if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
+ dd->ipath_spectriggerhit++;
+ ipath_dbg("%lu special trigger hits\n",
+ dd->ipath_spectriggerhit);
+ }
+
+ /* likely due to cancel; so suppress message unless verbose */
+ if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+ time_after(dd->ipath_lastcancel, jiffies)) {
+ /* armlaunch takes precedence; it often causes both. */
+ ipath_cdbg(VERBOSE,
+ "Suppressed %s error (%llx) after sendbuf cancel\n",
+ (errs & INFINIPATH_E_SPIOARMLAUNCH) ?
+ "armlaunch" : "sendpktlen", (unsigned long long)errs);
+ errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+ }
+
+ if (!errs)
+ return 0;
+
+ if (!noprint) {
+ ipath_err_t mask;
+ /*
+ * The ones we mask off are handled specially below
+ * or above. Also mask SDMADISABLED by default as it
+ * is too chatty.
+ */
+ mask = INFINIPATH_E_IBSTATUSCHANGED |
+ INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
+
+ /* if we're in debug, then don't mask SDMADISABLED msgs */
+ if (ipath_debug & __IPATH_DBG)
+ mask &= ~INFINIPATH_E_SDMADISABLED;
+
+ ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
+ } else
+ /* so we don't need if (!noprint) at strlcat's below */
+ *msg = 0;
+
+ if (errs & E_SUM_PKTERRS) {
+ ipath_stats.sps_pkterrs++;
+ chkerrpkts = 1;
+ }
+ if (errs & E_SUM_ERRS)
+ ipath_stats.sps_errs++;
+
+ if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+ ipath_stats.sps_crcerrs++;
+ chkerrpkts = 1;
+ }
+ iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
+
+
+ /*
+ * We don't want to print these two as they happen, or we can make
+ * the situation even worse, because it takes so long to print
+ * messages to serial consoles. Kernel ports get printed from
+ * fast_stats, no more than every 5 seconds, user ports get printed
+ * on close
+ */
+ if (errs & INFINIPATH_E_RRCVHDRFULL)
+ chkerrpkts |= handle_hdrq_full(dd);
+ if (errs & INFINIPATH_E_RRCVEGRFULL) {
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+
+ /*
+ * since this is of less importance and not likely to
+ * happen without also getting hdrfull, only count
+ * occurrences; don't check each port (or even the kernel
+ * vs user)
+ */
+ ipath_stats.sps_etidfull++;
+ if (pd->port_head != ipath_get_hdrqtail(pd))
+ chkerrpkts |= 1;
+ }
+
+ /*
+ * do this before IBSTATUSCHANGED, in case both bits set in a single
+ * interrupt; we want the STATUSCHANGE to "win", so we do our
+ * internal copy of state machine correctly
+ */
+ if (errs & INFINIPATH_E_RIBLOSTLINK) {
+ /*
+ * force through block below
+ */
+ errs |= INFINIPATH_E_IBSTATUSCHANGED;
+ ipath_stats.sps_iblink++;
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKARMED | IPATH_LINKACTIVE);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+
+ ipath_dbg("Lost link, link now down (%s)\n",
+ ipath_ibcstatus_str[ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_ibcstatus) & 0xf]);
+ }
+ if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+ handle_e_ibstatuschanged(dd, errs);
+
+ if (errs & INFINIPATH_E_RESET) {
+ if (!noprint)
+ ipath_dev_err(dd, "Got reset, requires re-init "
+ "(unload and reload driver)\n");
+ dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+ }
+
+ if (!noprint && *msg) {
+ if (iserr)
+ ipath_dev_err(dd, "%s error\n", msg);
+ }
+ if (dd->ipath_state_wanted & dd->ipath_flags) {
+ ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
+ "waking\n", dd->ipath_state_wanted,
+ dd->ipath_flags);
+ wake_up_interruptible(&ipath_state_wait);
+ }
+
+ return chkerrpkts;
+}
+
+/*
+ * try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ */
+void ipath_clear_freeze(struct ipath_devdata *dd)
+{
+ /* disable error interrupts, to avoid confusion */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+
+ /* also disable interrupts; errormask is sometimes overwriten */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+ ipath_cancel_sends(dd, 1);
+
+ /* clear the freeze, and be sure chip saw it */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ /* force in-memory update now we are out of freeze */
+ ipath_force_pio_avail_update(dd);
+
+ /*
+ * force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ E_SPKT_ERRS_IGNORE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+}
+
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
+{
+ /*
+ * sometimes happen during driver init and unload, don't want
+ * to process any interrupts at that point
+ */
+
+ /* this is just a bandaid, not a fix, if something goes badly
+ * wrong */
+ if (++*unexpectp > 100) {
+ if (++*unexpectp > 105) {
+ /*
+ * ok, we must be taking somebody else's interrupts,
+ * due to a messed up mptable and/or PIRQ table, so
+ * unregister the interrupt. We've seen this during
+ * linuxbios development work, and it may happen in
+ * the future again.
+ */
+ if (dd->pcidev && dd->ipath_irq) {
+ ipath_dev_err(dd, "Now %u unexpected "
+ "interrupts, unregistering "
+ "interrupt handler\n",
+ *unexpectp);
+ ipath_dbg("free_irq of irq %d\n",
+ dd->ipath_irq);
+ dd->ipath_f_free_irq(dd);
+ }
+ }
+ if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
+ ipath_dev_err(dd, "%u unexpected interrupts, "
+ "disabling interrupts completely\n",
+ *unexpectp);
+ /*
+ * disable all interrupts, something is very wrong
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+ 0ULL);
+ }
+ } else if (*unexpectp > 1)
+ ipath_dbg("Interrupt when not ready, should not happen, "
+ "ignoring\n");
+}
+
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
+{
+ static int allbits;
+
+ /* separate routine, for better optimization of ipath_intr() */
+
+ /*
+ * We print the message and disable interrupts, in hope of
+ * having a better chance of debugging the problem.
+ */
+ ipath_dev_err(dd,
+ "Read of interrupt status failed (all bits set)\n");
+ if (allbits++) {
+ /* disable all interrupts, something is very wrong */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+ if (allbits == 2) {
+ ipath_dev_err(dd, "Still bad interrupt status, "
+ "unregistering interrupt\n");
+ dd->ipath_f_free_irq(dd);
+ } else if (allbits > 2) {
+ if ((allbits % 10000) == 0)
+ printk(".");
+ } else
+ ipath_dev_err(dd, "Disabling interrupts, "
+ "multiple errors\n");
+ }
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = ipath_ib_piobufavail(dd->verbs_dev);
+ if (ret > 0)
+ goto set;
+
+ return;
+set:
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+/*
+ * Handle receive interrupts for user ports; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll
+ */
+static void handle_urcv(struct ipath_devdata *dd, u64 istat)
+{
+ u64 portr;
+ int i;
+ int rcvdint = 0;
+
+ /*
+ * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+ * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+ * would both like timely updates of the bits so that
+ * we don't pass them by unnecessarily. the rmb()
+ * here ensures that we see them promptly -- the
+ * corresponding wmb()'s are in ipath_poll_urgent()
+ * and ipath_poll_next()...
+ */
+ rmb();
+ portr = ((istat >> dd->ipath_i_rcvavail_shift) &
+ dd->ipath_i_rcvavail_mask) |
+ ((istat >> dd->ipath_i_rcvurg_shift) &
+ dd->ipath_i_rcvurg_mask);
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ if (portr & (1 << i) && pd && pd->port_cnt) {
+ if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+ &pd->port_flag)) {
+ clear_bit(i + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+ wake_up_interruptible(&pd->port_wait);
+ rcvdint = 1;
+ } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+ &pd->port_flag)) {
+ pd->port_urgent++;
+ wake_up_interruptible(&pd->port_wait);
+ }
+ }
+ }
+ if (rcvdint) {
+ /* only want to take one interrupt, so turn off the rcv
+ * interrupt for all the ports that we set the rcv_waiting
+ * (but never for kernel port)
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+}
+
+irqreturn_t ipath_intr(int irq, void *data)
+{
+ struct ipath_devdata *dd = data;
+ u64 istat, chk0rcv = 0;
+ ipath_err_t estat = 0;
+ irqreturn_t ret;
+ static unsigned unexpected = 0;
+ u64 kportrbits;
+
+ ipath_stats.sps_ints++;
+
+ if (dd->ipath_int_counter != (u32) -1)
+ dd->ipath_int_counter++;
+
+ if (!(dd->ipath_flags & IPATH_PRESENT)) {
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+ }
+
+ /*
+ * this needs to be flags&initted, not statusp, so we keep
+ * taking interrupts even after link goes down, etc.
+ * Also, we *must* clear the interrupt at some point, or we won't
+ * take it again, which can be real bad for errors, etc...
+ */
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ ipath_bad_intr(dd, &unexpected);
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
+
+ if (unlikely(!istat)) {
+ ipath_stats.sps_nullintr++;
+ ret = IRQ_NONE; /* not our interrupt, or already handled */
+ goto bail;
+ }
+ if (unlikely(istat == -1)) {
+ ipath_bad_regread(dd);
+ /* don't know if it was our interrupt or not */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ if (unexpected)
+ unexpected = 0;
+
+ if (unlikely(istat & ~dd->ipath_i_bitsextant))
+ ipath_dev_err(dd,
+ "interrupt with unknown interrupts %Lx set\n",
+ (unsigned long long)
+ istat & ~dd->ipath_i_bitsextant);
+ else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
+ ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+ (unsigned long long) istat);
+
+ if (istat & INFINIPATH_I_ERROR) {
+ ipath_stats.sps_errints++;
+ estat = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_errorstatus);
+ if (!estat)
+ dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
+ "but no error bits set!\n",
+ (unsigned long long) istat);
+ else if (estat == -1LL)
+ /*
+ * should we try clearing all, or hope next read
+ * works?
+ */
+ ipath_dev_err(dd, "Read of error status failed "
+ "(all bits set); ignoring\n");
+ else
+ chk0rcv |= handle_errors(dd, estat);
+ }
+
+ if (istat & INFINIPATH_I_GPIO) {
+ /*
+ * GPIO interrupts fall in two broad classes:
+ * GPIO_2 indicates (on some HT4xx boards) that a packet
+ * has arrived for Port 0. Checking for this
+ * is controlled by flag IPATH_GPIO_INTR.
+ * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+ * errors that we need to count. Checking for this
+ * is controlled by flag IPATH_GPIO_ERRINTRS.
+ */
+ u32 gpiostatus;
+ u32 to_clear = 0;
+
+ gpiostatus = ipath_read_kreg32(
+ dd, dd->ipath_kregs->kr_gpio_status);
+ /* First the error-counter case. */
+ if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+ (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+ /* want to clear the bits we see asserted. */
+ to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+ /*
+ * Count appropriately, clear bits out of our copy,
+ * as they have been "handled".
+ */
+ if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+ ipath_dbg("FlowCtl on UnsupVL\n");
+ dd->ipath_rxfc_unsupvl_errs++;
+ }
+ if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+ ipath_dbg("Overrun Threshold exceeded\n");
+ dd->ipath_overrun_thresh_errs++;
+ }
+ if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+ ipath_dbg("Local Link Integrity error\n");
+ dd->ipath_lli_errs++;
+ }
+ gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
+ }
+ /* Now the Port0 Receive case */
+ if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+ (dd->ipath_flags & IPATH_GPIO_INTR)) {
+ /*
+ * GPIO status bit 2 is set, and we expected it.
+ * clear it and indicate in p0bits.
+ * This probably only happens if a Port0 pkt
+ * arrives at _just_ the wrong time, and we
+ * handle that by seting chk0rcv;
+ */
+ to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+ gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
+ chk0rcv = 1;
+ }
+ if (gpiostatus) {
+ /*
+ * Some unexpected bits remain. If they could have
+ * caused the interrupt, complain and clear.
+ * To avoid repetition of this condition, also clear
+ * the mask. It is almost certainly due to error.
+ */
+ const u32 mask = (u32) dd->ipath_gpio_mask;
+
+ if (mask & gpiostatus) {
+ ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+ gpiostatus & mask);
+ to_clear |= (gpiostatus & mask);
+ dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+ ipath_write_kreg(dd,
+ dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+ }
+ if (to_clear) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+ (u64) to_clear);
+ }
+ }
+
+ /*
+ * Clear the interrupt bits we found set, unless they are receive
+ * related, in which case we already cleared them above, and don't
+ * want to clear them again, because we might lose an interrupt.
+ * Clear it early, so we "know" know the chip will have seen this by
+ * the time we process the queue, and will re-interrupt if necessary.
+ * The processor itself won't take the interrupt again until we return.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+ /*
+ * Handle kernel receive queues before checking for pio buffers
+ * available since receives can overflow; piobuf waiters can afford
+ * a few extra cycles, since they were waiting anyway, and user's
+ * waiting for receive are at the bottom.
+ */
+ kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
+ (1ULL << dd->ipath_i_rcvurg_shift);
+ if (chk0rcv || (istat & kportrbits)) {
+ istat &= ~kportrbits;
+ ipath_kreceive(dd->ipath_pd[0]);
+ }
+
+ if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+ (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
+ handle_urcv(dd, istat);
+
+ if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
+ handle_sdma_intr(dd, istat);
+
+ if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* always process; sdma verbs uses PIO for acks and VL15 */
+ handle_layer_pioavail(dd);
+ }
+
+ ret = IRQ_HANDLED;
+
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h
new file mode 100644
index 000000000000..f0f947122779
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_kernel.h
@@ -0,0 +1,1373 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+/*
+ * First-cut critierion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define IPATH_EEP_LOG_CNT (4)
+struct ipath_eep_log_mask {
+ u64 errs_to_log;
+ u64 hwerrs_to_log;
+};
+
+struct ipath_portdata {
+ void **port_rcvegrbuf;
+ dma_addr_t *port_rcvegrbuf_phys;
+ /* rcvhdrq base, needs mmap before useful */
+ void *port_rcvhdrq;
+ /* kernel virtual address where hdrqtail is updated */
+ void *port_rcvhdrtail_kvaddr;
+ /*
+ * temp buffer for expected send setup, allocated at open, instead
+ * of each setup call
+ */
+ void *port_tid_pg_list;
+ /* when waiting for rcv or pioavail */
+ wait_queue_head_t port_wait;
+ /*
+ * rcvegr bufs base, physical, must fit
+ * in 44 bits so 32 bit programs mmap64 44 bit works)
+ */
+ dma_addr_t port_rcvegr_phys;
+ /* mmap of hdrq, must fit in 44 bits */
+ dma_addr_t port_rcvhdrq_phys;
+ dma_addr_t port_rcvhdrqtailaddr_phys;
+ /*
+ * number of opens (including slave subports) on this instance
+ * (ignoring forks, dup, etc. for now)
+ */
+ int port_cnt;
+ /*
+ * how much space to leave at start of eager TID entries for
+ * protocol use, on each TID
+ */
+ /* instead of calculating it */
+ unsigned port_port;
+ /* non-zero if port is being shared. */
+ u16 port_subport_cnt;
+ /* non-zero if port is being shared. */
+ u16 port_subport_id;
+ /* number of pio bufs for this port (all procs, if shared) */
+ u32 port_piocnt;
+ /* first pio buffer for this port */
+ u32 port_pio_base;
+ /* chip offset of PIO buffers for this port */
+ u32 port_piobufs;
+ /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+ u32 port_rcvegrbuf_chunks;
+ /* how many egrbufs per chunk */
+ u32 port_rcvegrbufs_perchunk;
+ /* order for port_rcvegrbuf_pages */
+ size_t port_rcvegrbuf_size;
+ /* rcvhdrq size (for freeing) */
+ size_t port_rcvhdrq_size;
+ /* next expected TID to check when looking for free */
+ u32 port_tidcursor;
+ /* next expected TID to check */
+ unsigned long port_flag;
+ /* what happened */
+ unsigned long int_flag;
+ /* WAIT_RCV that timed out, no interrupt */
+ u32 port_rcvwait_to;
+ /* WAIT_PIO that timed out, no interrupt */
+ u32 port_piowait_to;
+ /* WAIT_RCV already happened, no wait */
+ u32 port_rcvnowait;
+ /* WAIT_PIO already happened, no wait */
+ u32 port_pionowait;
+ /* total number of rcvhdrqfull errors */
+ u32 port_hdrqfull;
+ /*
+ * Used to suppress multiple instances of same
+ * port staying stuck at same point.
+ */
+ u32 port_lastrcvhdrqtail;
+ /* saved total number of rcvhdrqfull errors for poll edge trigger */
+ u32 port_hdrqfull_poll;
+ /* total number of polled urgent packets */
+ u32 port_urgent;
+ /* saved total number of polled urgent packets for poll edge trigger */
+ u32 port_urgent_poll;
+ /* pid of process using this port */
+ struct pid *port_pid;
+ struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
+ /* same size as task_struct .comm[] */
+ char port_comm[16];
+ /* pkeys set by this use of this port */
+ u16 port_pkeys[4];
+ /* so file ops can get at unit */
+ struct ipath_devdata *port_dd;
+ /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+ void *subport_uregbase;
+ /* An array of pages for the eager receive buffers * N */
+ void *subport_rcvegrbuf;
+ /* An array of pages for the eager header queue entries * N */
+ void *subport_rcvhdr_base;
+ /* The version of the library which opened this port */
+ u32 userversion;
+ /* Bitmask of active slaves */
+ u32 active_slaves;
+ /* Type of packets or conditions we want to poll for */
+ u16 poll_type;
+ /* port rcvhdrq head offset */
+ u32 port_head;
+ /* receive packet sequence counter */
+ u32 port_seq_cnt;
+};
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_verbs_txreq;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+ void *l_arg;
+};
+
+struct ipath_skbinfo {
+ struct sk_buff *skb;
+ dma_addr_t phys;
+};
+
+struct ipath_sdma_txreq {
+ int flags;
+ int sg_count;
+ union {
+ struct scatterlist *sg;
+ void *map_addr;
+ };
+ void (*callback)(void *, int);
+ void *callback_cookie;
+ int callback_status;
+ u16 start_idx; /* sdma private */
+ u16 next_descq_idx; /* sdma private */
+ struct list_head list; /* sdma private */
+};
+
+struct ipath_sdma_desc {
+ __le64 qw[2];
+};
+
+#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1
+#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2
+#define IPATH_SDMA_TXREQ_F_INTREQ 0x4
+#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8
+#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10
+#define IPATH_SDMA_TXREQ_F_VL15 0x20
+
+#define IPATH_SDMA_TXREQ_S_OK 0
+#define IPATH_SDMA_TXREQ_S_SENDERROR 1
+#define IPATH_SDMA_TXREQ_S_ABORTED 2
+#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3
+
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30)
+
+/* max dwords in small buffer packet */
+#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
+
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */
+
+
+struct ipath_devdata {
+ struct list_head ipath_list;
+
+ struct ipath_kregs const *ipath_kregs;
+ struct ipath_cregs const *ipath_cregs;
+
+ /* mem-mapped pointer to base of chip regs */
+ u64 __iomem *ipath_kregbase;
+ /* end of mem-mapped chip space; range checking */
+ u64 __iomem *ipath_kregend;
+ /* physical address of chip for io_remap, etc. */
+ unsigned long ipath_physaddr;
+ /* base of memory alloced for ipath_kregbase, for free */
+ u64 *ipath_kregalloc;
+ /* ipath_cfgports pointers */
+ struct ipath_portdata **ipath_pd;
+ /* sk_buffs used by port 0 eager receive queue */
+ struct ipath_skbinfo *ipath_port0_skbinfo;
+ /* kvirt address of 1st 2k pio buffer */
+ void __iomem *ipath_pio2kbase;
+ /* kvirt address of 1st 4k pio buffer */
+ void __iomem *ipath_pio4kbase;
+ /*
+ * points to area where PIOavail registers will be DMA'ed.
+ * Has to be on a page of it's own, because the page will be
+ * mapped into user program space. This copy is *ONLY* ever
+ * written by DMA, not by the driver! Need a copy per device
+ * when we get to multiple devices
+ */
+ volatile __le64 *ipath_pioavailregs_dma;
+ /* physical address where updates occur */
+ dma_addr_t ipath_pioavailregs_phys;
+ struct _ipath_layer ipath_layer;
+ /* setup intr */
+ int (*ipath_f_intrsetup)(struct ipath_devdata *);
+ /* fallback to alternate interrupt type if possible */
+ int (*ipath_f_intr_fallback)(struct ipath_devdata *);
+ /* setup on-chip bus config */
+ int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+ /* hard reset chip */
+ int (*ipath_f_reset)(struct ipath_devdata *);
+ int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+ size_t);
+ void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+ void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+ size_t);
+ void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+ int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+ int (*ipath_f_early_init)(struct ipath_devdata *);
+ void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+ void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+ u32, unsigned long);
+ void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+ void (*ipath_f_cleanup)(struct ipath_devdata *);
+ void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+ /* fill out chip-specific fields */
+ int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+ /* free irq */
+ void (*ipath_f_free_irq)(struct ipath_devdata *);
+ struct ipath_message_header *(*ipath_f_get_msgheader)
+ (struct ipath_devdata *, __le32 *);
+ void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+ int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+ int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+ void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
+ void (*ipath_f_read_counters)(struct ipath_devdata *,
+ struct infinipath_counters *);
+ void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+ /* per chip actions needed for IB Link up/down changes */
+ int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
+ unsigned ipath_lastegr_idx;
+ struct ipath_ibdev *verbs_dev;
+ struct timer_list verbs_timer;
+ /* total dwords sent (summed from counter) */
+ u64 ipath_sword;
+ /* total dwords rcvd (summed from counter) */
+ u64 ipath_rword;
+ /* total packets sent (summed from counter) */
+ u64 ipath_spkts;
+ /* total packets rcvd (summed from counter) */
+ u64 ipath_rpkts;
+ /* ipath_statusp initially points to this. */
+ u64 _ipath_status;
+ /* GUID for this interface, in network order */
+ __be64 ipath_guid;
+ /*
+ * aggregrate of error bits reported since last cleared, for
+ * limiting of error reporting
+ */
+ ipath_err_t ipath_lasterror;
+ /*
+ * aggregrate of error bits reported since last cleared, for
+ * limiting of hwerror reporting
+ */
+ ipath_err_t ipath_lasthwerror;
+ /* errors masked because they occur too fast */
+ ipath_err_t ipath_maskederrs;
+ u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+ /* these 5 fields are used to establish deltas for IB Symbol
+ * errors and linkrecovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+
+ /* time in jiffies at which to re-enable maskederrs */
+ unsigned long ipath_unmasktime;
+ /* count of egrfull errors, combined for all ports */
+ u64 ipath_last_tidfull;
+ /* for ipath_qcheck() */
+ u64 ipath_lastport0rcv_cnt;
+ /* template for writing TIDs */
+ u64 ipath_tidtemplate;
+ /* value to write to free TIDs */
+ u64 ipath_tidinvalid;
+ /* IBA6120 rcv interrupt setup */
+ u64 ipath_rhdrhead_intr_off;
+
+ /* size of memory at ipath_kregbase */
+ u32 ipath_kregsize;
+ /* number of registers used for pioavail */
+ u32 ipath_pioavregs;
+ /* IPATH_POLL, etc. */
+ u32 ipath_flags;
+ /* ipath_flags driver is waiting for */
+ u32 ipath_state_wanted;
+ /* last buffer for user use, first buf for kernel use is this
+ * index. */
+ u32 ipath_lastport_piobuf;
+ /* is a stats timer active */
+ u32 ipath_stats_timer_active;
+ /* number of interrupts for this device -- saturates... */
+ u32 ipath_int_counter;
+ /* dwords sent read from counter */
+ u32 ipath_lastsword;
+ /* dwords received read from counter */
+ u32 ipath_lastrword;
+ /* sent packets read from counter */
+ u32 ipath_lastspkts;
+ /* received packets read from counter */
+ u32 ipath_lastrpkts;
+ /* pio bufs allocated per port */
+ u32 ipath_pbufsport;
+ /* if remainder on bufs/port, ports < extrabuf get 1 extra */
+ u32 ipath_ports_extrabuf;
+ u32 ipath_pioupd_thresh; /* update threshold, some chips */
+ /*
+ * number of ports configured as max; zero is set to number chip
+ * supports, less gives more pio bufs/port, etc.
+ */
+ u32 ipath_cfgports;
+ /* count of port 0 hdrqfull errors */
+ u32 ipath_p0_hdrqfull;
+ /* port 0 number of receive eager buffers */
+ u32 ipath_p0_rcvegrcnt;
+
+ /*
+ * index of last piobuffer we used. Speeds up searching, by
+ * starting at this point. Doesn't matter if multiple cpu's use and
+ * update, last updater is only write that matters. Whenever it
+ * wraps, we update shadow copies. Need a copy per device when we
+ * get to multiple devices
+ */
+ u32 ipath_lastpioindex;
+ u32 ipath_lastpioindexl;
+ /* max length of freezemsg */
+ u32 ipath_freezelen;
+ /*
+ * consecutive times we wanted a PIO buffer but were unable to
+ * get one
+ */
+ u32 ipath_consec_nopiobuf;
+ /*
+ * hint that we should update ipath_pioavailshadow before
+ * looking for a PIO buffer
+ */
+ u32 ipath_upd_pio_shadow;
+ /* so we can rewrite it after a chip reset */
+ u32 ipath_pcibar0;
+ /* so we can rewrite it after a chip reset */
+ u32 ipath_pcibar1;
+ u32 ipath_x1_fix_tries;
+ u32 ipath_autoneg_tries;
+ u32 serdes_first_init_done;
+
+ struct ipath_relock {
+ atomic_t ipath_relock_timer_active;
+ struct timer_list ipath_relock_timer;
+ unsigned int ipath_relock_interval; /* in jiffies */
+ } ipath_relock_singleton;
+
+ /* interrupt number */
+ int ipath_irq;
+ /* HT/PCI Vendor ID (here for NodeInfo) */
+ u16 ipath_vendorid;
+ /* HT/PCI Device ID (here for NodeInfo) */
+ u16 ipath_deviceid;
+ /* offset in HT config space of slave/primary interface block */
+ u8 ipath_ht_slave_off;
+ /* for write combining settings */
+ int wc_cookie;
+ /* ref count for each pkey */
+ atomic_t ipath_pkeyrefs[4];
+ /* shadow copy of struct page *'s for exp tid pages */
+ struct page **ipath_pageshadow;
+ /* shadow copy of dma handles for exp tid pages */
+ dma_addr_t *ipath_physshadow;
+ u64 __iomem *ipath_egrtidbase;
+ /* lock to workaround chip bug 9437 and others */
+ spinlock_t ipath_kernel_tid_lock;
+ spinlock_t ipath_user_tid_lock;
+ spinlock_t ipath_sendctrl_lock;
+ /* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+ spinlock_t ipath_uctxt_lock;
+
+ /*
+ * IPATH_STATUS_*,
+ * this address is mapped readonly into user processes so they can
+ * get status cheaply, whenever they want.
+ */
+ u64 *ipath_statusp;
+ /* freeze msg if hw error put chip in freeze */
+ char *ipath_freezemsg;
+ /* pci access data structure */
+ struct pci_dev *pcidev;
+ struct cdev *user_cdev;
+ struct cdev *diag_cdev;
+ struct device *user_dev;
+ struct device *diag_dev;
+ /* timer used to prevent stats overflow, error throttling, etc. */
+ struct timer_list ipath_stats_timer;
+ /* timer to verify interrupts work, and fallback if possible */
+ struct timer_list ipath_intrchk_timer;
+ void *ipath_dummy_hdrq; /* used after port close */
+ dma_addr_t ipath_dummy_hdrq_phys;
+
+ /* SendDMA related entries */
+ spinlock_t ipath_sdma_lock;
+ unsigned long ipath_sdma_status;
+ unsigned long ipath_sdma_abort_jiffies;
+ unsigned long ipath_sdma_abort_intr_timeout;
+ unsigned long ipath_sdma_buf_jiffies;
+ struct ipath_sdma_desc *ipath_sdma_descq;
+ u64 ipath_sdma_descq_added;
+ u64 ipath_sdma_descq_removed;
+ int ipath_sdma_desc_nreserved;
+ u16 ipath_sdma_descq_cnt;
+ u16 ipath_sdma_descq_tail;
+ u16 ipath_sdma_descq_head;
+ u16 ipath_sdma_next_intr;
+ u16 ipath_sdma_reset_wait;
+ u8 ipath_sdma_generation;
+ struct tasklet_struct ipath_sdma_abort_task;
+ struct tasklet_struct ipath_sdma_notify_task;
+ struct list_head ipath_sdma_activelist;
+ struct list_head ipath_sdma_notifylist;
+ atomic_t ipath_sdma_vl15_count;
+ struct timer_list ipath_sdma_vl15_timer;
+
+ dma_addr_t ipath_sdma_descq_phys;
+ volatile __le64 *ipath_sdma_head_dma;
+ dma_addr_t ipath_sdma_head_phys;
+
+ unsigned long ipath_ureg_align; /* user register alignment */
+
+ struct delayed_work ipath_autoneg_work;
+ wait_queue_head_t ipath_autoneg_wait;
+
+ /* HoL blocking / user app forward-progress state */
+ unsigned ipath_hol_state;
+ unsigned ipath_hol_next;
+ struct timer_list ipath_hol_timer;
+
+ /*
+ * Shadow copies of registers; size indicates read access size.
+ * Most of them are readonly, but some are write-only register,
+ * where we manipulate the bits in the shadow copy, and then write
+ * the shadow copy to infinipath.
+ *
+ * We deliberately make most of these 32 bits, since they have
+ * restricted range. For any that we read, we won't to generate 32
+ * bit accesses, since Opteron will generate 2 separate 32 bit HT
+ * transactions for a 64 bit read, and we want to avoid unnecessary
+ * HT transactions.
+ */
+
+ /* This is the 64 bit group */
+
+ /*
+ * shadow of pioavail, check to be sure it's large enough at
+ * init time.
+ */
+ unsigned long ipath_pioavailshadow[8];
+ /* bitmap of send buffers available for the kernel to use with PIO. */
+ unsigned long ipath_pioavailkernel[8];
+ /* shadow of kr_gpio_out, for rmw ops */
+ u64 ipath_gpio_out;
+ /* shadow the gpio mask register */
+ u64 ipath_gpio_mask;
+ /* shadow the gpio output enable, etc... */
+ u64 ipath_extctrl;
+ /* kr_revision shadow */
+ u64 ipath_revision;
+ /*
+ * shadow of ibcctrl, for interrupt handling of link changes,
+ * etc.
+ */
+ u64 ipath_ibcctrl;
+ /*
+ * last ibcstatus, to suppress "duplicate" status change messages,
+ * mostly from 2 to 3
+ */
+ u64 ipath_lastibcstat;
+ /* hwerrmask shadow */
+ ipath_err_t ipath_hwerrmask;
+ ipath_err_t ipath_errormask; /* errormask shadow */
+ /* interrupt config reg shadow */
+ u64 ipath_intconfig;
+ /* kr_sendpiobufbase value */
+ u64 ipath_piobufbase;
+ /* kr_ibcddrctrl shadow */
+ u64 ipath_ibcddrctrl;
+
+ /* these are the "32 bit" regs */
+
+ /*
+ * number of GUIDs in the flash for this interface; may need some
+ * rethinking for setting on other ifaces
+ */
+ u32 ipath_nguid;
+ /*
+ * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+ * all expect bit fields to be "unsigned long"
+ */
+ /* shadow kr_rcvctrl */
+ unsigned long ipath_rcvctrl;
+ /* shadow kr_sendctrl */
+ unsigned long ipath_sendctrl;
+ /* to not count armlaunch after cancel */
+ unsigned long ipath_lastcancel;
+ /* count cases where special trigger was needed (double write) */
+ unsigned long ipath_spectriggerhit;
+
+ /* value we put in kr_rcvhdrcnt */
+ u32 ipath_rcvhdrcnt;
+ /* value we put in kr_rcvhdrsize */
+ u32 ipath_rcvhdrsize;
+ /* value we put in kr_rcvhdrentsize */
+ u32 ipath_rcvhdrentsize;
+ /* offset of last entry in rcvhdrq */
+ u32 ipath_hdrqlast;
+ /* kr_portcnt value */
+ u32 ipath_portcnt;
+ /* kr_pagealign value */
+ u32 ipath_palign;
+ /* number of "2KB" PIO buffers */
+ u32 ipath_piobcnt2k;
+ /* size in bytes of "2KB" PIO buffers */
+ u32 ipath_piosize2k;
+ /* number of "4KB" PIO buffers */
+ u32 ipath_piobcnt4k;
+ /* size in bytes of "4KB" PIO buffers */
+ u32 ipath_piosize4k;
+ u32 ipath_pioreserved; /* reserved special-inkernel; */
+ /* kr_rcvegrbase value */
+ u32 ipath_rcvegrbase;
+ /* kr_rcvegrcnt value */
+ u32 ipath_rcvegrcnt;
+ /* kr_rcvtidbase value */
+ u32 ipath_rcvtidbase;
+ /* kr_rcvtidcnt value */
+ u32 ipath_rcvtidcnt;
+ /* kr_sendregbase */
+ u32 ipath_sregbase;
+ /* kr_userregbase */
+ u32 ipath_uregbase;
+ /* kr_counterregbase */
+ u32 ipath_cregbase;
+ /* shadow the control register contents */
+ u32 ipath_control;
+ /* PCI revision register (HTC rev on FPGA) */
+ u32 ipath_pcirev;
+
+ /* chip address space used by 4k pio buffers */
+ u32 ipath_4kalign;
+ /* The MTU programmed for this unit */
+ u32 ipath_ibmtu;
+ /*
+ * The max size IB packet, included IB headers that we can send.
+ * Starts same as ipath_piosize, but is affected when ibmtu is
+ * changed, or by size of eager buffers
+ */
+ u32 ipath_ibmaxlen;
+ /*
+ * ibmaxlen at init time, limited by chip and by receive buffer
+ * size. Not changed after init.
+ */
+ u32 ipath_init_ibmaxlen;
+ /* size of each rcvegrbuffer */
+ u32 ipath_rcvegrbufsize;
+ /* localbus width (1, 2,4,8,16,32) from config space */
+ u32 ipath_lbus_width;
+ /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
+ u32 ipath_lbus_speed;
+ /*
+ * number of sequential ibcstatus change for polling active/quiet
+ * (i.e., link not coming up).
+ */
+ u32 ipath_ibpollcnt;
+ /* low and high portions of MSI capability/vector */
+ u32 ipath_msi_lo;
+ /* saved after PCIe init for restore after reset */
+ u32 ipath_msi_hi;
+ /* MSI data (vector) saved for restore */
+ u16 ipath_msi_data;
+ /* MLID programmed for this instance */
+ u16 ipath_mlid;
+ /* LID programmed for this instance */
+ u16 ipath_lid;
+ /* list of pkeys programmed; 0 if not set */
+ u16 ipath_pkeys[4];
+ /*
+ * ASCII serial number, from flash, large enough for original
+ * all digit strings, and longer QLogic serial number format
+ */
+ u8 ipath_serial[16];
+ /* human readable board version */
+ u8 ipath_boardversion[96];
+ u8 ipath_lbus_info[32]; /* human readable localbus info */
+ /* chip major rev, from ipath_revision */
+ u8 ipath_majrev;
+ /* chip minor rev, from ipath_revision */
+ u8 ipath_minrev;
+ /* board rev, from ipath_revision */
+ u8 ipath_boardrev;
+ /* saved for restore after reset */
+ u8 ipath_pci_cacheline;
+ /* LID mask control */
+ u8 ipath_lmc;
+ /* link width supported */
+ u8 ipath_link_width_supported;
+ /* link speed supported */
+ u8 ipath_link_speed_supported;
+ u8 ipath_link_width_enabled;
+ u8 ipath_link_speed_enabled;
+ u8 ipath_link_width_active;
+ u8 ipath_link_speed_active;
+ /* Rx Polarity inversion (compensate for ~tx on partner) */
+ u8 ipath_rx_pol_inv;
+
+ u8 ipath_r_portenable_shift;
+ u8 ipath_r_intravail_shift;
+ u8 ipath_r_tailupd_shift;
+ u8 ipath_r_portcfg_shift;
+
+ /* unit # of this chip, if present */
+ int ipath_unit;
+
+ /* local link integrity counter */
+ u32 ipath_lli_counter;
+ /* local link integrity errors */
+ u32 ipath_lli_errors;
+ /*
+ * Above counts only cases where _successive_ LocalLinkIntegrity
+ * errors were seen in the receive headers of kern-packets.
+ * Below are the three (monotonically increasing) counters
+ * maintained via GPIO interrupts on iba6120-rev2.
+ */
+ u32 ipath_rxfc_unsupvl_errs;
+ u32 ipath_overrun_thresh_errs;
+ u32 ipath_lli_errs;
+
+ /*
+ * Not all devices managed by a driver instance are the same
+ * type, so these fields must be per-device.
+ */
+ u64 ipath_i_bitsextant;
+ ipath_err_t ipath_e_bitsextant;
+ ipath_err_t ipath_hwe_bitsextant;
+
+ /*
+ * Below should be computable from number of ports,
+ * since they are never modified.
+ */
+ u64 ipath_i_rcvavail_mask;
+ u64 ipath_i_rcvurg_mask;
+ u16 ipath_i_rcvurg_shift;
+ u16 ipath_i_rcvavail_shift;
+
+ /*
+ * Register bits for selecting i2c direction and values, used for
+ * I2C serial flash.
+ */
+ u8 ipath_gpio_sda_num;
+ u8 ipath_gpio_scl_num;
+ u8 ipath_i2c_chain_type;
+ u64 ipath_gpio_sda;
+ u64 ipath_gpio_scl;
+
+ /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
+ spinlock_t ipath_gpio_lock;
+
+ /*
+ * IB link and linktraining states and masks that vary per chip in
+ * some way. Set at init, to avoid each IB status change interrupt
+ */
+ u8 ibcs_ls_shift;
+ u8 ibcs_lts_mask;
+ u32 ibcs_mask;
+ u32 ib_init;
+ u32 ib_arm;
+ u32 ib_active;
+
+ u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+ /*
+ * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol
+ * reg. Changes for IBA7220
+ */
+ u8 ibcc_lic_mask; /* LinkInitCmd */
+ u8 ibcc_lc_shift; /* LinkCmd */
+ u8 ibcc_mpl_shift; /* Maxpktlen */
+
+ u8 delay_mult;
+
+ /* used to override LED behavior */
+ u8 ipath_led_override; /* Substituted for normal value, if non-zero */
+ u16 ipath_led_override_timeoff; /* delta to next timer event */
+ u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
+ u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
+ atomic_t ipath_led_override_timer_active;
+ /* Used to flash LEDs in override mode */
+ struct timer_list ipath_led_override_timer;
+
+ /* Support (including locks) for EEPROM logging of errors and time */
+ /* control access to actual counters, timer */
+ spinlock_t ipath_eep_st_lock;
+ /* control high-level access to EEPROM */
+ struct mutex ipath_eep_lock;
+ /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
+ uint64_t ipath_traffic_wds;
+ /* active time is kept in seconds, but logged in hours */
+ atomic_t ipath_active_time;
+ /* Below are nominal shadow of EEPROM, new since last EEPROM update */
+ uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
+ uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
+ uint16_t ipath_eep_hrs;
+ /*
+ * masks for which bits of errs, hwerrs that cause
+ * each of the counters to increment.
+ */
+ struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+ /* interrupt mitigation reload register info */
+ u16 ipath_jint_idle_ticks; /* idle clock ticks */
+ u16 ipath_jint_max_packets; /* max packets across all ports */
+
+ /*
+ * lock for access to SerDes, and flags to sequence preset
+ * versus steady-state. 7220-only at the moment.
+ */
+ spinlock_t ipath_sdepb_lock;
+ u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
+};
+
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
+#define IPATH_HOL_UP 0
+#define IPATH_HOL_DOWN 1
+/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
+#define IPATH_HOL_DOWNSTOP 0
+#define IPATH_HOL_DOWNCONT 1
+
+/* bit positions for sdma_status */
+#define IPATH_SDMA_ABORTING 0
+#define IPATH_SDMA_DISARMED 1
+#define IPATH_SDMA_DISABLED 2
+#define IPATH_SDMA_LAYERBUF 3
+#define IPATH_SDMA_RUNNING 30
+#define IPATH_SDMA_SHUTDOWN 31
+
+/* bit combinations that correspond to abort states */
+#define IPATH_SDMA_ABORT_NONE 0
+#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
+#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED))
+#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+
+#define IPATH_SDMA_BUF_NONE 0
+#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
+
+/* Private data for file operations */
+struct ipath_filedata {
+ struct ipath_portdata *pd;
+ unsigned subport;
+ unsigned tidcursor;
+ struct ipath_user_sdma_queue *pq;
+};
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_clear_freeze(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+ struct device **devp);
+
+int ipath_diag_add(struct ipath_devdata *);
+void ipath_diag_remove(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_remove(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid);
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+ ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+ unsigned cnt);
+void ipath_cancel_sends(struct ipath_devdata *, int);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+void ipath_kreceive(struct ipath_portdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+int ipath_set_linkstate(struct ipath_devdata *, u8);
+int ipath_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_lid(struct ipath_devdata *, u32, u8);
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
+void ipath_hol_down(struct ipath_devdata *);
+void ipath_hol_up(struct ipath_devdata *);
+void ipath_hol_event(unsigned long);
+void ipath_toggle_rclkrls(struct ipath_devdata *);
+void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
+void ipath_set_relock_poll(struct ipath_devdata *, int);
+void ipath_shutdown_relock_poll(struct ipath_devdata *);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->tidcursor
+#define user_sdma_queue_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->pq
+
+/*
+ * values for ipath_flags
+ */
+ /* chip can report link latency (IB 1.2) */
+#define IPATH_HAS_LINK_LATENCY 0x1
+ /* The chip is up and initted */
+#define IPATH_INITTED 0x2
+ /* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET 0x4
+ /* The chip is present and valid for accesses */
+#define IPATH_PRESENT 0x8
+ /* HT link0 is only 8 bits wide, ignore upper byte crc
+ * errors, etc. */
+#define IPATH_8BIT_IN_HT0 0x10
+ /* HT link1 is only 8 bits wide, ignore upper byte crc
+ * errors, etc. */
+#define IPATH_8BIT_IN_HT1 0x20
+ /* The link is down */
+#define IPATH_LINKDOWN 0x40
+ /* The link level is up (0x11) */
+#define IPATH_LINKINIT 0x80
+ /* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED 0x100
+ /* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE 0x200
+ /* link current state is unknown */
+#define IPATH_LINKUNK 0x400
+ /* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC 0x1000
+ /* DMA Receive tail pointer */
+#define IPATH_NODMA_RTAIL 0x2000
+ /* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE 0x4000
+ /* Supports port zero per packet receive interrupts via
+ * GPIO */
+#define IPATH_GPIO_INTR 0x8000
+ /* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID 0x10000
+ /* packet/word counters are 32 bit, else those 4 counters
+ * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+ /* Interrupt register is 64 bits */
+#define IPATH_INTREG_64 0x40000
+ /* can miss port0 rx interrupts */
+#define IPATH_DISABLED 0x80000 /* administratively disabled */
+ /* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS 0x200000
+ /* Supports Send DMA */
+#define IPATH_HAS_SEND_DMA 0x400000
+ /* Supports Send Count (not just word count) in PBC */
+#define IPATH_HAS_PBC_CNT 0x800000
+ /* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT 0x1000000
+#define IPATH_HAS_THRESH_UPDATE 0x4000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
+#define IPATH_IB_AUTONEG_INPROG 0x10000000
+#define IPATH_IB_AUTONEG_FAILED 0x20000000
+ /* Linkdown-disable intentionally, Do not attempt to bring up */
+#define IPATH_IB_LINK_DISABLED 0x40000000
+#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
+
+/* portdata flag bit offsets */
+ /* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV 2
+ /* master has not finished initializing */
+#define IPATH_PORT_MASTER_UNINIT 4
+ /* waiting for an urgent packet to arrive */
+#define IPATH_PORT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+void ipath_free_data(struct ipath_portdata *dd);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+ unsigned len, int avail);
+void ipath_init_iba6110_funcs(struct ipath_devdata *);
+void ipath_get_eeprom_info(struct ipath_devdata *);
+int ipath_update_eeprom_log(struct ipath_devdata *dd);
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
+void ipath_force_pio_avail_update(struct ipath_devdata *);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
+
+/* send dma routines */
+int setup_sdma(struct ipath_devdata *);
+void teardown_sdma(struct ipath_devdata *);
+void ipath_restart_sdma(struct ipath_devdata *);
+void ipath_sdma_intr(struct ipath_devdata *);
+int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
+ u32, struct ipath_verbs_txreq *);
+/* ipath_sdma_lock should be locked before calling this. */
+int ipath_sdma_make_progress(struct ipath_devdata *dd);
+
+/* must be called under ipath_sdma_lock */
+static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
+{
+ return dd->ipath_sdma_descq_cnt -
+ (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
+ 1 - dd->ipath_sdma_desc_nreserved;
+}
+
+static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
+{
+ dd->ipath_sdma_desc_nreserved += cnt;
+}
+
+static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
+{
+ dd->ipath_sdma_desc_nreserved -= cnt;
+}
+
+/*
+ * number of words used for protocol header if not set by ipath_userinit();
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
+int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+ unsigned, u64);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner. It also gives us some error
+ * checking. 64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible. User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+ ipath_ureg regno, int port)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+
+ return readl(regno + (u64 __iomem *)
+ (dd->ipath_uregbase +
+ (char __iomem *)dd->ipath_kregbase +
+ dd->ipath_ureg_align * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+ ipath_ureg regno, u64 value, int port)
+{
+ u64 __iomem *ubase = (u64 __iomem *)
+ (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+ dd->ipath_ureg_align * port);
+ if (dd->ipath_kregbase)
+ writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+ ipath_kreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return -1;
+ return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+ ipath_kreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return -1;
+
+ return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+ ipath_kreg regno, u64 value)
+{
+ if (dd->ipath_kregbase)
+ writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+ ipath_sreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+
+ return readq(regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+ ipath_sreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+ return readl(regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+ ipath_creg regno, u64 value)
+{
+ if (dd->ipath_kregbase)
+ writeq(value, regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+ *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
+ return (u32) le64_to_cpu(*((volatile __le64 *)
+ pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
+{
+ const struct ipath_devdata *dd = pd->port_dd;
+ u32 hdrqtail;
+
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+ __le32 *rhf_addr;
+ u32 seq;
+
+ rhf_addr = (__le32 *) pd->port_rcvhdrq +
+ pd->port_head + dd->ipath_rhf_offset;
+ seq = ipath_hdrget_seq(rhf_addr);
+ hdrqtail = pd->port_head;
+ if (seq == pd->port_seq_cnt)
+ hdrqtail++;
+ } else
+ hdrqtail = ipath_get_rcvhdrtail(pd);
+
+ return hdrqtail;
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+ return (dd->ipath_flags & IPATH_INTREG_64) ?
+ ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return linkstate
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere, anyway (and should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+ INFINIPATH_IBCS_LINKSTATE_MASK;
+ if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+ state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+ return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+ dd->ibcs_lts_mask;
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return logical link state
+ * combination of link state and linktraining state (down, active, init,
+ * arm, etc.
+ */
+static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
+{
+ u32 ibs;
+ ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+ dd->ibcs_lts_mask;
+ ibs |= (u32)(ibcs &
+ (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
+ return ibs;
+}
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ib_ipath_version[];
+
+extern const struct attribute_group *ipath_driver_attr_groups[];
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+ size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+const char *ipath_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
+extern unsigned ipath_mtu4096;
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME "ib_ipath"
+#define IPATH_MAJOR 233
+#define IPATH_USER_MINOR_BASE 0
+#define IPATH_DIAGPKT_MINOR 127
+#define IPATH_DIAG_MINOR_BASE 129
+#define IPATH_NMINORS 255
+
+#define ipath_dev_err(dd,fmt,...) \
+ do { \
+ const struct ipath_devdata *__dd = (dd); \
+ if (__dd->pcidev) \
+ dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+ ipath_get_unit_name(__dd->ipath_unit), \
+ ##__VA_ARGS__); \
+ else \
+ printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+ ipath_get_unit_name(__dd->ipath_unit), \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+ do { \
+ if (unlikely(ipath_debug & (which))) \
+ printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+ __func__,##__VA_ARGS__); \
+ } while(0)
+
+# define ipath_dbg(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+ u64 mask;
+ const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+ const struct ipath_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs,
+ char *msg, size_t lmsg);
+
+#endif /* _IPATH_KERNEL_H */
diff --git a/drivers/staging/rdma/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c
new file mode 100644
index 000000000000..c0e933fec218
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_keys.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+ int ret;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* Find the next available LKEY */
+ r = n = rkt->next;
+ for (;;) {
+ if (rkt->table[r] == NULL)
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n) {
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ipath_dbg("LKEY table full\n");
+ ret = 0;
+ goto bail;
+ }
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
+ mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+ ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+ << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ rkt->table[r] = mr;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+ unsigned long flags;
+ u32 r;
+
+ if (lkey == 0)
+ return;
+ r = lkey >> (32 - ib_ipath_lkey_table_size);
+ spin_lock_irqsave(&rkt->lock, flags);
+ rkt->table[r] = NULL;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+ struct ib_sge *sge, int acc)
+{
+ struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct ipath_mregion *mr;
+ unsigned n, m;
+ size_t off;
+ int ret;
+
+ /*
+ * We use LKEY == zero for kernel virtual addresses
+ * (see ipath_get_dma_mr and ipath_dma.c).
+ */
+ if (sge->lkey == 0) {
+ /* always a kernel port, no locking needed */
+ struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+ if (pd->user) {
+ ret = 0;
+ goto bail;
+ }
+ isge->mr = NULL;
+ isge->vaddr = (void *) sge->addr;
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ ret = 1;
+ goto bail;
+ }
+ mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+ if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+ qp->ibqp.pd != mr->pd)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off += mr->offset;
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ isge->mr = mr;
+ isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ isge->length = mr->map[m]->segs[n].length - off;
+ isge->sge_length = sge->length;
+ isge->m = m;
+ isge->n = n;
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @dev: infiniband device
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ */
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_lkey_table *rkt = &dev->lk_table;
+ struct ipath_sge *sge = &ss->sge;
+ struct ipath_mregion *mr;
+ unsigned n, m;
+ size_t off;
+ int ret;
+
+ /*
+ * We use RKEY == zero for kernel virtual addresses
+ * (see ipath_get_dma_mr and ipath_dma.c).
+ */
+ if (rkey == 0) {
+ /* always a kernel port, no locking needed */
+ struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+ if (pd->user) {
+ ret = 0;
+ goto bail;
+ }
+ sge->mr = NULL;
+ sge->vaddr = (void *) vaddr;
+ sge->length = len;
+ sge->sge_length = len;
+ ss->sg_list = NULL;
+ ss->num_sge = 1;
+ ret = 1;
+ goto bail;
+ }
+
+ mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+ if (unlikely(mr == NULL || mr->lkey != rkey ||
+ qp->ibqp.pd != mr->pd)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off += mr->offset;
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ sge->mr = mr;
+ sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ sge->length = mr->map[m]->segs[n].length - off;
+ sge->sge_length = len;
+ sge->m = m;
+ sge->n = n;
+ ss->sg_list = NULL;
+ ss->num_sge = 1;
+
+ ret = 1;
+
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c
new file mode 100644
index 000000000000..ad3a926ab3c5
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mad.c
@@ -0,0 +1,1521 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_pma.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ if (smp->attr_mod)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+ return reply(smp);
+}
+
+struct nodeinfo {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be64 sys_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+ struct ipath_devdata *dd = to_idev(ibdev)->dd;
+ u32 vendor, majrev, minrev;
+
+ /* GUID 0 is illegal */
+ if (smp->attr_mod || (dd->ipath_guid == 0))
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ nip->base_version = 1;
+ nip->class_version = 1;
+ nip->node_type = 1; /* channel adapter */
+ /*
+ * XXX The num_ports value will need a layer function to get
+ * the value if we ever have more than one IB port on a chip.
+ * We will also need to get the GUID for the port.
+ */
+ nip->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+ nip->node_guid = dd->ipath_guid;
+ nip->port_guid = dd->ipath_guid;
+ nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
+ nip->device_id = cpu_to_be16(dd->ipath_deviceid);
+ majrev = dd->ipath_majrev;
+ minrev = dd->ipath_minrev;
+ nip->revision = cpu_to_be32((majrev << 16) | minrev);
+ nip->local_port_num = port;
+ vendor = dd->ipath_vendorid;
+ nip->vendor_id[0] = IPATH_SRC_OUI_1;
+ nip->vendor_id[1] = IPATH_SRC_OUI_2;
+ nip->vendor_id[2] = IPATH_SRC_OUI_3;
+
+ return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+ __be64 *p = (__be64 *) smp->data;
+
+ /* 32 blocks of 8 64-bit GUIDs per block */
+
+ memset(smp->data, 0, sizeof(smp->data));
+
+ /*
+ * We only support one GUID for now. If this changes, the
+ * portinfo.guid_cap field needs to be updated too.
+ */
+ if (startgx == 0) {
+ __be64 g = to_idev(ibdev)->dd->ipath_guid;
+ if (g == 0)
+ /* GUID 0 is illegal */
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ /* The first is a copy of the read-only HW GUID. */
+ *p = g;
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
+{
+ (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
+{
+ (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct ipath_devdata *dd)
+{
+ return (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+ unsigned v;
+
+ v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+ if (v != n) {
+ dd->ipath_ibcctrl &=
+ ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+ dd->ipath_ibcctrl |=
+ (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ }
+ return 0;
+}
+
+static int get_phyerrthreshold(struct ipath_devdata *dd)
+{
+ return (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+ unsigned v;
+
+ v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+ if (v != n) {
+ dd->ipath_ibcctrl &=
+ ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+ dd->ipath_ibcctrl |=
+ (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ }
+ return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+ return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ipath_ibdev *dev;
+ struct ipath_devdata *dd;
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ u16 lid;
+ u8 ibcstat;
+ u8 mtu;
+ int ret;
+
+ if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ dev = to_idev(ibdev);
+ dd = dev->dd;
+
+ /* Clear all fields. Only set the non-zero fields. */
+ memset(smp->data, 0, sizeof(smp->data));
+
+ /* Only return the mkey if the protection field allows it. */
+ if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+ dev->mkeyprot == 0)
+ pip->mkey = dev->mkey;
+ pip->gid_prefix = dev->gid_prefix;
+ lid = dd->ipath_lid;
+ pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+ pip->sm_lid = cpu_to_be16(dev->sm_lid);
+ pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+ /* pip->diag_code; */
+ pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+ pip->local_port_num = port;
+ pip->link_width_enabled = dd->ipath_link_width_enabled;
+ pip->link_width_supported = dd->ipath_link_width_supported;
+ pip->link_width_active = dd->ipath_link_width_active;
+ pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
+ ibcstat = dd->ipath_lastibcstat;
+ /* map LinkState to IB portinfo values. */
+ pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
+
+ pip->portphysstate_linkdown =
+ (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
+ (get_linkdowndefaultstate(dd) ? 1 : 2);
+ pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
+ pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
+ dd->ipath_link_speed_enabled;
+ switch (dd->ipath_ibmtu) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default: /* oops, something is wrong */
+ mtu = IB_MTU_2048;
+ break;
+ }
+ pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+ pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */
+ pip->vl_high_limit = dev->vl_high_limit;
+ /* pip->vl_arb_high_cap; // only one VL */
+ /* pip->vl_arb_low_cap; // only one VL */
+ /* InitTypeReply = 0 */
+ /* our mtu cap depends on whether 4K MTU enabled or not */
+ pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+ /* HCAs ignore VLStallCount and HOQLife */
+ /* pip->vlstallcnt_hoqlife; */
+ pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */
+ pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+ /* P_KeyViolations are counted by hardware. */
+ pip->pkey_violations =
+ cpu_to_be16((ipath_get_cr_errpkey(dd) -
+ dev->z_pkey_violations) & 0xFFFF);
+ pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+ /* Only the hardware GUID is supported for now */
+ pip->guid_cap = 1;
+ pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+ /* 32.768 usec. response time (guessing) */
+ pip->resv_resptimevalue = 3;
+ pip->localphyerrors_overrunerrors =
+ (get_phyerrthreshold(dd) << 4) |
+ get_overrunthreshold(dd);
+ /* pip->max_credit_hint; */
+ if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+ u32 v;
+
+ v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
+ pip->link_roundtrip_latency[0] = v >> 16;
+ pip->link_roundtrip_latency[1] = v >> 8;
+ pip->link_roundtrip_latency[2] = v;
+ }
+
+ ret = reply(smp);
+
+bail:
+ return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
+{
+ /* always a kernel port, no locking needed */
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+
+ memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+ return 0;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ u16 *p = (u16 *) smp->data;
+ __be16 *q = (__be16 *) smp->data;
+
+ /* 64 blocks of 32 16-bit P_Key entries */
+
+ memset(smp->data, 0, sizeof(smp->data));
+ if (startpx == 0) {
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ unsigned i, n = ipath_get_npkeys(dev->dd);
+
+ get_pkeys(dev->dd, p);
+
+ for (i = 0; i < n; i++)
+ q[i] = cpu_to_be16(p[i]);
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ /* The only GUID we support is the first read-only entry. */
+ return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: the new state
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
+{
+ if (sleep)
+ dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+ else
+ dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ return 0;
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ struct ib_event event;
+ struct ipath_ibdev *dev;
+ struct ipath_devdata *dd;
+ char clientrereg = 0;
+ u16 lid, smlid;
+ u8 lwe;
+ u8 lse;
+ u8 state;
+ u16 lstate;
+ u32 mtu;
+ int ret, ore;
+
+ if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+ goto err;
+
+ dev = to_idev(ibdev);
+ dd = dev->dd;
+ event.device = ibdev;
+ event.element.port_num = port;
+
+ dev->mkey = pip->mkey;
+ dev->gid_prefix = pip->gid_prefix;
+ dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+ lid = be16_to_cpu(pip->lid);
+ if (dd->ipath_lid != lid ||
+ dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
+ /* Must be a valid unicast LID address. */
+ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
+ goto err;
+ ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ smlid = be16_to_cpu(pip->sm_lid);
+ if (smlid != dev->sm_lid) {
+ /* Must be a valid unicast LID address. */
+ if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
+ goto err;
+ dev->sm_lid = smlid;
+ event.event = IB_EVENT_SM_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ /* Allow 1x or 4x to be set (see 14.2.6.6). */
+ lwe = pip->link_width_enabled;
+ if (lwe) {
+ if (lwe == 0xFF)
+ lwe = dd->ipath_link_width_supported;
+ else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
+ goto err;
+ set_link_width_enabled(dd, lwe);
+ }
+
+ /* Allow 2.5 or 5.0 Gbs. */
+ lse = pip->linkspeedactive_enabled & 0xF;
+ if (lse) {
+ if (lse == 15)
+ lse = dd->ipath_link_speed_supported;
+ else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
+ goto err;
+ set_link_speed_enabled(dd, lse);
+ }
+
+ /* Set link down default state. */
+ switch (pip->portphysstate_linkdown & 0xF) {
+ case 0: /* NOP */
+ break;
+ case 1: /* SLEEP */
+ if (set_linkdowndefaultstate(dd, 1))
+ goto err;
+ break;
+ case 2: /* POLL */
+ if (set_linkdowndefaultstate(dd, 0))
+ goto err;
+ break;
+ default:
+ goto err;
+ }
+
+ dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+ dev->vl_high_limit = pip->vl_high_limit;
+
+ switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+ case IB_MTU_256:
+ mtu = 256;
+ break;
+ case IB_MTU_512:
+ mtu = 512;
+ break;
+ case IB_MTU_1024:
+ mtu = 1024;
+ break;
+ case IB_MTU_2048:
+ mtu = 2048;
+ break;
+ case IB_MTU_4096:
+ if (!ipath_mtu4096)
+ goto err;
+ mtu = 4096;
+ break;
+ default:
+ /* XXX We have already partially updated our state! */
+ goto err;
+ }
+ ipath_set_mtu(dd, mtu);
+
+ dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+ /* We only support VL0 */
+ if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+ goto err;
+
+ if (pip->mkey_violations == 0)
+ dev->mkey_violations = 0;
+
+ /*
+ * Hardware counter can't be reset so snapshot and subtract
+ * later.
+ */
+ if (pip->pkey_violations == 0)
+ dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
+
+ if (pip->qkey_violations == 0)
+ dev->qkey_violations = 0;
+
+ ore = pip->localphyerrors_overrunerrors;
+ if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
+ goto err;
+
+ if (set_overrunthreshold(dd, (ore & 0xF)))
+ goto err;
+
+ dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+ if (pip->clientrereg_resv_subnetto & 0x80) {
+ clientrereg = 1;
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ /*
+ * Do the port state change now that the other link parameters
+ * have been set.
+ * Changing the port physical state only makes sense if the link
+ * is down or is being set to down.
+ */
+ state = pip->linkspeed_portstate & 0xF;
+ lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+ if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+ goto err;
+
+ /*
+ * Only state changes of DOWN, ARM, and ACTIVE are valid
+ * and must be in the correct state to take effect (see 7.2.6).
+ */
+ switch (state) {
+ case IB_PORT_NOP:
+ if (lstate == 0)
+ break;
+ /* FALLTHROUGH */
+ case IB_PORT_DOWN:
+ if (lstate == 0)
+ lstate = IPATH_IB_LINKDOWN_ONLY;
+ else if (lstate == 1)
+ lstate = IPATH_IB_LINKDOWN_SLEEP;
+ else if (lstate == 2)
+ lstate = IPATH_IB_LINKDOWN;
+ else if (lstate == 3)
+ lstate = IPATH_IB_LINKDOWN_DISABLE;
+ else
+ goto err;
+ ipath_set_linkstate(dd, lstate);
+ if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ goto done;
+ }
+ ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
+ IPATH_LINKACTIVE, 1000);
+ break;
+ case IB_PORT_ARMED:
+ ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+ break;
+ case IB_PORT_ACTIVE:
+ ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+ break;
+ default:
+ /* XXX We have already partially updated our state! */
+ goto err;
+ }
+
+ ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+ if (clientrereg)
+ pip->clientrereg_resv_subnetto |= 0x80;
+
+ goto done;
+
+err:
+ smp->status |= IB_SMP_INVALID_FIELD;
+ ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+ return ret;
+}
+
+/**
+ * rm_pkey - decrecment the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (dd->ipath_pkeys[i] != key)
+ continue;
+ if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+ dd->ipath_pkeys[i] = 0;
+ ret = 1;
+ goto bail;
+ }
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+ int i;
+ u16 lkey = key & 0x7FFF;
+ int any = 0;
+ int ret;
+
+ if (lkey == 0x7FFF) {
+ ret = 0;
+ goto bail;
+ }
+
+ /* Look for an empty slot or a matching PKEY. */
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i]) {
+ any++;
+ continue;
+ }
+ /* If it matches exactly, try to increment the ref count */
+ if (dd->ipath_pkeys[i] == key) {
+ if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+ ret = 0;
+ goto bail;
+ }
+ /* Lost the race. Look for an empty slot below. */
+ atomic_dec(&dd->ipath_pkeyrefs[i]);
+ any++;
+ }
+ /*
+ * It makes no sense to have both the limited and unlimited
+ * PKEY set at the same time since the unlimited one will
+ * disable the limited one.
+ */
+ if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i] &&
+ atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+ /* for ipathstats, etc. */
+ ipath_stats.sps_pkeys[i] = lkey;
+ dd->ipath_pkeys[i] = key;
+ ret = 1;
+ goto bail;
+ }
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
+{
+ struct ipath_portdata *pd;
+ int i;
+ int changed = 0;
+
+ /* always a kernel port, no locking needed */
+ pd = dd->ipath_pd[0];
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ u16 key = pkeys[i];
+ u16 okey = pd->port_pkeys[i];
+
+ if (key == okey)
+ continue;
+ /*
+ * The value of this PKEY table entry is changing.
+ * Remove the old entry in the hardware's array of PKEYs.
+ */
+ if (okey & 0x7FFF)
+ changed |= rm_pkey(dd, okey);
+ if (key & 0x7FFF) {
+ int ret = add_pkey(dd, key);
+
+ if (ret < 0)
+ key = 0;
+ else
+ changed |= ret;
+ }
+ pd->port_pkeys[i] = key;
+ }
+ if (changed) {
+ u64 pkey;
+ struct ib_event event;
+
+ pkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+ (unsigned long long) pkey);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+ pkey);
+
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.device = &dd->verbs_dev->ibdev;
+ event.element.port_num = port;
+ ib_dispatch_event(&event);
+ }
+ return 0;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ __be16 *p = (__be16 *) smp->data;
+ u16 *q = (u16 *) smp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ unsigned i, n = ipath_get_npkeys(dev->dd);
+
+ for (i = 0; i < n; i++)
+ q[i] = be16_to_cpu(p[i]);
+
+ if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
+{
+ struct ib_class_port_info *p =
+ (struct ib_class_port_info *)pmp->data;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ if (pmp->mad_hdr.attr_mod != 0)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ /* Indicate AllPortSelect is valid (only one port anyway) */
+ p->capability_mask = cpu_to_be16(1 << 8);
+ p->base_version = 1;
+ p->class_version = 1;
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+ * sec.
+ */
+ p->resp_time_value = 18;
+
+ return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
+
+static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ unsigned long flags;
+ u8 port_select = p->port_select;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ /*
+ * Ticks are 10x the link transfer period which for 2.5Gbs is 4
+ * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample
+ * intervals are counted in ticks. Since we use Linux timers, that
+ * count in jiffies, we can't sample for less than 1000 ticks if HZ
+ * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for
+ * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
+ * have hardware support for delaying packets.
+ */
+ if (crp->cr_psstat)
+ p->tick = dev->dd->ipath_link_speed_active - 1;
+ else
+ p->tick = 250; /* 1 usec. */
+ p->counter_width = 4; /* 32 bit counters */
+ p->counter_mask0_9 = COUNTER_MASK0_9;
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (crp->cr_psstat)
+ p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ p->sample_status = dev->pma_sample_status;
+ p->sample_start = cpu_to_be32(dev->pma_sample_start);
+ p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+ p->tag = cpu_to_be16(dev->pma_tag);
+ p->counter_select[0] = dev->pma_counter_select[0];
+ p->counter_select[1] = dev->pma_counter_select[1];
+ p->counter_select[2] = dev->pma_counter_select[2];
+ p->counter_select[3] = dev->pma_counter_select[3];
+ p->counter_select[4] = dev->pma_counter_select[4];
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ unsigned long flags;
+ u8 status;
+ int ret;
+
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (p->port_select != port && p->port_select != 0xFF)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+ dev->pma_sample_start = be32_to_cpu(p->sample_start);
+ dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+ dev->pma_tag = be16_to_cpu(p->tag);
+ dev->pma_counter_select[0] = p->counter_select[0];
+ dev->pma_counter_select[1] = p->counter_select[1];
+ dev->pma_counter_select[2] = p->counter_select[2];
+ dev->pma_counter_select[3] = p->counter_select[3];
+ dev->pma_counter_select[4] = p->counter_select[4];
+ if (crp->cr_psstat) {
+ ipath_write_creg(dev->dd, crp->cr_psinterval,
+ dev->pma_sample_interval);
+ ipath_write_creg(dev->dd, crp->cr_psstart,
+ dev->pma_sample_start);
+ } else
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+ return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev,
+ struct ipath_cregs const *crp,
+ __be16 sel)
+{
+ u64 ret;
+
+ switch (sel) {
+ case IB_PMA_PORT_XMIT_DATA:
+ ret = (crp->cr_psxmitdatacount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+ dev->ipath_sword;
+ break;
+ case IB_PMA_PORT_RCV_DATA:
+ ret = (crp->cr_psrcvdatacount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+ dev->ipath_rword;
+ break;
+ case IB_PMA_PORT_XMIT_PKTS:
+ ret = (crp->cr_psxmitpktscount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+ dev->ipath_spkts;
+ break;
+ case IB_PMA_PORT_RCV_PKTS:
+ ret = (crp->cr_psrcvpktscount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+ dev->ipath_rpkts;
+ break;
+ case IB_PMA_PORT_XMIT_WAIT:
+ ret = (crp->cr_psxmitwaitcount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+ dev->ipath_xmit_wait;
+ break;
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev)
+{
+ struct ib_pma_portsamplesresult *p =
+ (struct ib_pma_portsamplesresult *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ u8 status;
+ int i;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+ p->tag = cpu_to_be16(dev->pma_tag);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ p->sample_status = cpu_to_be16(status);
+ for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+ p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+ cpu_to_be32(
+ get_counter(dev, crp, dev->pma_counter_select[i]));
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev)
+{
+ struct ib_pma_portsamplesresult_ext *p =
+ (struct ib_pma_portsamplesresult_ext *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ u8 status;
+ int i;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+ p->tag = cpu_to_be16(dev->pma_tag);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ p->sample_status = cpu_to_be16(status);
+ /* 64 bits */
+ p->extended_width = cpu_to_be32(0x80000000);
+ for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+ p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+ cpu_to_be64(
+ get_counter(dev, crp, dev->pma_counter_select[i]));
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_verbs_counters cntrs;
+ u8 port_select = p->port_select;
+
+ ipath_get_counters(dev->dd, &cntrs);
+
+ /* Adjust counters for any resets done. */
+ cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
+ cntrs.link_error_recovery_counter -=
+ dev->z_link_error_recovery_counter;
+ cntrs.link_downed_counter -= dev->z_link_downed_counter;
+ cntrs.port_rcv_errors += dev->rcv_errors;
+ cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
+ cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
+ cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
+ cntrs.port_xmit_data -= dev->z_port_xmit_data;
+ cntrs.port_rcv_data -= dev->z_port_rcv_data;
+ cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
+ cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
+ cntrs.local_link_integrity_errors -=
+ dev->z_local_link_integrity_errors;
+ cntrs.excessive_buffer_overrun_errors -=
+ dev->z_excessive_buffer_overrun_errors;
+ cntrs.vl15_dropped -= dev->z_vl15_dropped;
+ cntrs.vl15_dropped += dev->n_vl15_dropped;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ if (cntrs.symbol_error_counter > 0xFFFFUL)
+ p->symbol_error_counter = cpu_to_be16(0xFFFF);
+ else
+ p->symbol_error_counter =
+ cpu_to_be16((u16)cntrs.symbol_error_counter);
+ if (cntrs.link_error_recovery_counter > 0xFFUL)
+ p->link_error_recovery_counter = 0xFF;
+ else
+ p->link_error_recovery_counter =
+ (u8)cntrs.link_error_recovery_counter;
+ if (cntrs.link_downed_counter > 0xFFUL)
+ p->link_downed_counter = 0xFF;
+ else
+ p->link_downed_counter = (u8)cntrs.link_downed_counter;
+ if (cntrs.port_rcv_errors > 0xFFFFUL)
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_errors =
+ cpu_to_be16((u16) cntrs.port_rcv_errors);
+ if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_remphys_errors =
+ cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+ if (cntrs.port_xmit_discards > 0xFFFFUL)
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
+ else
+ p->port_xmit_discards =
+ cpu_to_be16((u16)cntrs.port_xmit_discards);
+ if (cntrs.local_link_integrity_errors > 0xFUL)
+ cntrs.local_link_integrity_errors = 0xFUL;
+ if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+ cntrs.excessive_buffer_overrun_errors = 0xFUL;
+ p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+ cntrs.excessive_buffer_overrun_errors;
+ if (cntrs.vl15_dropped > 0xFFFFUL)
+ p->vl15_dropped = cpu_to_be16(0xFFFF);
+ else
+ p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+ if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+ p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+ if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+ p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+ if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+ p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_packets =
+ cpu_to_be32((u32)cntrs.port_xmit_packets);
+ if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+ p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_packets =
+ cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters_ext *p =
+ (struct ib_pma_portcounters_ext *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ u64 swords, rwords, spkts, rpkts, xwait;
+ u8 port_select = p->port_select;
+
+ ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+ &rpkts, &xwait);
+
+ /* Adjust counters for any resets done. */
+ swords -= dev->z_port_xmit_data;
+ rwords -= dev->z_port_rcv_data;
+ spkts -= dev->z_port_xmit_packets;
+ rpkts -= dev->z_port_rcv_packets;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ p->port_xmit_data = cpu_to_be64(swords);
+ p->port_rcv_data = cpu_to_be64(rwords);
+ p->port_xmit_packets = cpu_to_be64(spkts);
+ p->port_rcv_packets = cpu_to_be64(rpkts);
+ p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+ p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+ p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+ p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_verbs_counters cntrs;
+
+ /*
+ * Since the HW doesn't support clearing counters, we save the
+ * current count and subtract it from future responses.
+ */
+ ipath_get_counters(dev->dd, &cntrs);
+
+ if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+ dev->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+ dev->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+ dev->z_link_downed_counter = cntrs.link_downed_counter;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+ dev->z_port_rcv_errors =
+ cntrs.port_rcv_errors + dev->rcv_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+ dev->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+ dev->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+ if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+ dev->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+
+ if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+ dev->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+ dev->n_vl15_dropped = 0;
+ dev->z_vl15_dropped = cntrs.vl15_dropped;
+ }
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+ dev->z_port_xmit_data = cntrs.port_xmit_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+ dev->z_port_rcv_data = cntrs.port_rcv_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+ dev->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+ dev->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+ return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ u64 swords, rwords, spkts, rpkts, xwait;
+
+ ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+ &rpkts, &xwait);
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+ dev->z_port_xmit_data = swords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+ dev->z_port_rcv_data = rwords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+ dev->z_port_xmit_packets = spkts;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+ dev->z_port_rcv_packets = rpkts;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+ dev->n_unicast_xmit = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+ dev->n_unicast_rcv = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+ dev->n_multicast_xmit = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+ dev->n_multicast_rcv = 0;
+
+ return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+ u8 port_num, const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_smp *smp = (struct ib_smp *)out_mad;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ *out_mad = *in_mad;
+ if (smp->class_version != 1) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ /* Is the mkey in the process of expiring? */
+ if (dev->mkey_lease_timeout &&
+ time_after_eq(jiffies, dev->mkey_lease_timeout)) {
+ /* Clear timeout and mkey protection field. */
+ dev->mkey_lease_timeout = 0;
+ dev->mkeyprot = 0;
+ }
+
+ /*
+ * M_Key checking depends on
+ * Portinfo:M_Key_protect_bits
+ */
+ if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+ dev->mkey != smp->mkey &&
+ (smp->method == IB_MGMT_METHOD_SET ||
+ (smp->method == IB_MGMT_METHOD_GET &&
+ dev->mkeyprot >= 2))) {
+ if (dev->mkey_violations != 0xFFFF)
+ ++dev->mkey_violations;
+ if (dev->mkey_lease_timeout ||
+ dev->mkey_lease_period == 0) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ dev->mkey_lease_timeout = jiffies +
+ dev->mkey_lease_period * HZ;
+ /* Future: Generate a trap notice. */
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ } else if (dev->mkey_lease_timeout)
+ dev->mkey_lease_timeout = 0;
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_NODE_DESC:
+ ret = recv_subn_get_nodedescription(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = recv_subn_get_guidinfo(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = recv_subn_get_pkeytable(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (dev->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = recv_subn_set_guidinfo(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (dev->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ default:
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply(smp);
+ }
+
+bail:
+ return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+ const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+ int ret;
+
+ *out_mad = *in_mad;
+ if (pmp->mad_hdr.class_version != 1) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_CLASS_PORT_INFO:
+ ret = recv_pma_get_classportinfo(pmp);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT:
+ ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+ ret = recv_pma_get_portsamplesresult_ext(pmp,
+ ibdev);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = recv_pma_get_portcounters(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+ port_num);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = recv_pma_set_portcounters(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+ port_num);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_smp *) pmp);
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ int ret;
+ const struct ib_mad *in_mad = (const struct ib_mad *)in;
+ struct ib_mad *out_mad = (struct ib_mad *)out;
+
+ if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+ *out_mad_size != sizeof(*out_mad)))
+ return IB_MAD_RESULT_FAILURE;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ ret = process_subn(ibdev, mad_flags, port_num,
+ in_mad, out_mad);
+ goto bail;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf(ibdev, port_num, in_mad, out_mad);
+ goto bail;
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c
new file mode 100644
index 000000000000..e73274229404
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct ipath_mmap_info
+ */
+void ipath_release_mmap_info(struct kref *ref)
+{
+ struct ipath_mmap_info *ip =
+ container_of(ref, struct ipath_mmap_info, ref);
+ struct ipath_ibdev *dev = to_idev(ip->context->device);
+
+ spin_lock_irq(&dev->pending_lock);
+ list_del(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ vfree(ip->obj);
+ kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void ipath_vma_open(struct vm_area_struct *vma)
+{
+ struct ipath_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void ipath_vma_close(struct vm_area_struct *vma)
+{
+ struct ipath_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, ipath_release_mmap_info);
+}
+
+static const struct vm_operations_struct ipath_vm_ops = {
+ .open = ipath_vma_open,
+ .close = ipath_vma_close,
+};
+
+/**
+ * ipath_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct ipath_ibdev *dev = to_idev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct ipath_mmap_info *ip, *pp;
+ int ret = -EINVAL;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_irq(&dev->pending_lock);
+ list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+ pending_mmaps) {
+ /* Only the creator is allowed to mmap the object */
+ if (context != ip->context || (__u64) offset != ip->offset)
+ continue;
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->size)
+ break;
+
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret)
+ goto done;
+ vma->vm_ops = &ipath_vm_ops;
+ vma->vm_private_data = ip;
+ ipath_vma_open(vma);
+ goto done;
+ }
+ spin_unlock_irq(&dev->pending_lock);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for ipath_mmap
+ */
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj) {
+ struct ipath_mmap_info *ip;
+
+ ip = kmalloc(sizeof *ip, GFP_KERNEL);
+ if (!ip)
+ goto bail;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->size = size;
+ ip->context = context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+bail:
+ return ip;
+}
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+ struct ipath_mmap_info *ip,
+ u32 size, void *obj) {
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ ip->size = size;
+ ip->obj = obj;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
new file mode 100644
index 000000000000..c7278f6a8217
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_mr.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/* Fast memory region */
+struct ipath_fmr {
+ struct ib_fmr ibfmr;
+ u8 page_shift;
+ struct ipath_mregion mr; /* must be last */
+};
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see ipath_dma.c).
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ipath_mr *mr;
+ struct ib_mr *ret;
+
+ mr = kzalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ mr->mr.access_flags = acc;
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+ struct ipath_lkey_table *lk_table)
+{
+ struct ipath_mr *mr;
+ int m, i = 0;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+ if (!mr)
+ goto done;
+
+ /* Allocate first level page tables. */
+ for (; i < m; i++) {
+ mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+ if (!mr->mr.map[i])
+ goto bail;
+ }
+ mr->mr.mapsz = m;
+
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ if (!ipath_alloc_lkey(lk_table, &mr->mr))
+ goto bail;
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+ goto done;
+
+bail:
+ while (i) {
+ i--;
+ kfree(mr->mr.map[i]);
+ }
+ kfree(mr);
+ mr = NULL;
+
+done:
+ return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+ struct ipath_mr *mr;
+ int n, m, i;
+ struct ib_mr *ret;
+
+ mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+ if (mr == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ mr->mr.pd = pd;
+ mr->mr.user_base = *iova_start;
+ mr->mr.iova = *iova_start;
+ mr->mr.length = 0;
+ mr->mr.offset = 0;
+ mr->mr.access_flags = acc;
+ mr->mr.max_segs = num_phys_buf;
+ mr->umem = NULL;
+
+ m = 0;
+ n = 0;
+ for (i = 0; i < num_phys_buf; i++) {
+ mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+ mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+ mr->mr.length += buffer_list[i].size;
+ n++;
+ if (n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct ipath_mr *mr;
+ struct ib_umem *umem;
+ int n, m, entry;
+ struct scatterlist *sg;
+ struct ib_mr *ret;
+
+ if (length == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(umem))
+ return (void *) umem;
+
+ n = umem->nmap;
+ mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ ib_umem_release(umem);
+ goto bail;
+ }
+
+ mr->mr.pd = pd;
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = ib_umem_offset(umem);
+ mr->mr.access_flags = mr_access_flags;
+ mr->mr.max_segs = n;
+ mr->umem = umem;
+
+ m = 0;
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ void *vaddr;
+
+ vaddr = page_address(sg_page(sg));
+ if (!vaddr) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ mr->mr.map[m]->segs[n].vaddr = vaddr;
+ mr->mr.map[m]->segs[n].length = umem->page_size;
+ n++;
+ if (n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr()
+ * or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+ struct ipath_mr *mr = to_imr(ibmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+ i = mr->mr.mapsz;
+ while (i) {
+ i--;
+ kfree(mr->mr.map[i]);
+ }
+
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+
+ kfree(mr);
+ return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ipath_fmr *fmr;
+ int m, i = 0;
+ struct ib_fmr *ret;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+ if (!fmr)
+ goto bail;
+
+ /* Allocate first level page tables. */
+ for (; i < m; i++) {
+ fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+ GFP_KERNEL);
+ if (!fmr->mr.map[i])
+ goto bail;
+ }
+ fmr->mr.mapsz = m;
+
+ /*
+ * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+ * rkey.
+ */
+ if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+ goto bail;
+ fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+ /*
+ * Resources are allocated but no valid mapping (RKEY can't be
+ * used).
+ */
+ fmr->mr.pd = pd;
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ fmr->mr.offset = 0;
+ fmr->mr.access_flags = mr_access_flags;
+ fmr->mr.max_segs = fmr_attr->max_pages;
+ fmr->page_shift = fmr_attr->page_shift;
+
+ ret = &fmr->ibfmr;
+ goto done;
+
+bail:
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ ret = ERR_PTR(-ENOMEM);
+
+done:
+ return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+ int list_len, u64 iova)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ struct ipath_lkey_table *rkt;
+ unsigned long flags;
+ int m, n, i;
+ u32 ps;
+ int ret;
+
+ if (list_len > fmr->mr.max_segs) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ rkt = &to_idev(ibfmr->device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = iova;
+ fmr->mr.iova = iova;
+ ps = 1 << fmr->page_shift;
+ fmr->mr.length = list_len * ps;
+ m = 0;
+ n = 0;
+ ps = 1 << fmr->page_shift;
+ for (i = 0; i < list_len; i++) {
+ fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+ fmr->mr.map[m]->segs[n].length = ps;
+ if (++n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ipath_fmr *fmr;
+ struct ipath_lkey_table *rkt;
+ unsigned long flags;
+
+ list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+ rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+ i = fmr->mr.mapsz;
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c
new file mode 100644
index 000000000000..face87602dc1
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_qp.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \
+ (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+ BITS_PER_PAGE, off)
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
+
+
+static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
+{
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+ unsigned long flags;
+
+ /*
+ * Free the page if someone raced with us installing it.
+ */
+
+ spin_lock_irqsave(&qpt->lock, flags);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+
+static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
+{
+ u32 i, offset, max_scan, qpn;
+ struct qpn_map *map;
+ u32 ret = -1;
+
+ if (type == IB_QPT_SMI)
+ ret = 0;
+ else if (type == IB_QPT_GSI)
+ ret = 1;
+
+ if (ret != -1) {
+ map = &qpt->map[0];
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ }
+ if (!test_and_set_bit(ret, map->page))
+ atomic_dec(&map->n_free);
+ else
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ qpn = qpt->last + 1;
+ if (qpn >= QPN_MAX)
+ qpn = 2;
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page))
+ break;
+ }
+ if (likely(atomic_read(&map->n_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->n_free);
+ qpt->last = qpn;
+ ret = qpn;
+ goto bail;
+ }
+ offset = find_next_offset(map, offset);
+ qpn = mk_qpn(qpt, map, offset);
+ /*
+ * This test differs from alloc_pidmap().
+ * If find_next_offset() does find a zero
+ * bit, we don't need to check for QPN
+ * wrapping around past our starting QPN.
+ * We just need to be sure we don't loop
+ * forever.
+ */
+ } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+ }
+ /*
+ * In order to keep the number of pages allocated to a
+ * minimum, we scan the all existing pages before increasing
+ * the size of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ offset = 0;
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &qpt->map[0];
+ offset = 2;
+ }
+ qpn = mk_qpn(qpt, map, offset);
+ }
+
+ ret = -ENOMEM;
+
+bail:
+ return ret;
+}
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ struct qpn_map *map;
+
+ map = qpt->map + qpn / BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+ atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+ enum ib_qp_type type)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = alloc_qpn(qpt, type);
+ if (ret < 0)
+ goto bail;
+ qp->ibqp.qp_num = ret;
+
+ /* Add the QP to the hash table. */
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ ret %= qpt->max;
+ qp->next = qpt->table[ret];
+ qpt->table[ret] = qp;
+ atomic_inc(&qp->refcount);
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+ struct ipath_qp *q, **qpp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ /* Remove QP from the hash table. */
+ qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+ for (; (q = *qpp) != NULL; qpp = &q->next) {
+ if (q == qp) {
+ *qpp = qp->next;
+ qp->next = NULL;
+ atomic_dec(&qp->refcount);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+/**
+ * ipath_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+ unsigned long flags;
+ struct ipath_qp *qp;
+ u32 n, qp_inuse = 0;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+ for (n = 0; n < qpt->max; n++) {
+ qp = qpt->table[n];
+ qpt->table[n] = NULL;
+
+ for (; qp; qp = qp->next)
+ qp_inuse++;
+ }
+ spin_unlock_irqrestore(&qpt->lock, flags);
+
+ for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
+ if (qpt->map[n].page)
+ free_page((unsigned long) qpt->map[n].page);
+ return qp_inuse;
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ unsigned long flags;
+ struct ipath_qp *qp;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+ if (qp->ibqp.qp_num == qpn) {
+ atomic_inc(&qp->refcount);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ atomic_set(&qp->s_dma_busy, 0);
+ qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
+ qp->s_hdrwords = 0;
+ qp->s_wqe = NULL;
+ qp->s_pkt_delay = 0;
+ qp->s_draining = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ qp->r_msn = 0;
+ if (type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->r_nak_state = 0;
+ qp->r_aflags = 0;
+ qp->r_flags = 0;
+ qp->s_rnr_timeout = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+ qp->r_head_ack_queue = 0;
+ qp->s_tail_ack_queue = 0;
+ qp->s_num_rd_atomic = 0;
+ if (qp->r_rq.wq) {
+ qp->r_rq.wq->head = 0;
+ qp->r_rq.wq->tail = 0;
+ }
+}
+
+/**
+ * ipath_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ int ret = 0;
+
+ if (qp->state == IB_QPS_ERR)
+ goto bail;
+
+ qp->state = IB_QPS_ERR;
+
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ /* Schedule the sending tasklet to drain the send work queue. */
+ if (qp->s_last != qp->s_head)
+ ipath_schedule_send(qp);
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+
+ if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
+ wc.wr_id = qp->r_wr_id;
+ wc.status = err;
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wc.status = IB_WC_WR_FLUSH_ERR;
+
+ if (qp->r_rq.wq) {
+ struct ipath_rwq *wq;
+ u32 head;
+ u32 tail;
+
+ spin_lock(&qp->r_rq.lock);
+
+ /* sanity check pointers before trusting them */
+ wq = qp->r_rq.wq;
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ while (tail != head) {
+ wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+ if (++tail >= qp->r_rq.size)
+ tail = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wq->tail = tail;
+
+ spin_unlock(&qp->r_rq.lock);
+ } else if (qp->ibqp.event_handler)
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for ipathverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_qp *qp = to_iqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int lastwqe = 0;
+ int ret;
+
+ spin_lock_irq(&qp->s_lock);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ?
+ attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+ goto inval;
+
+ if (attr_mask & IB_QP_AV) {
+ if (attr->ah_attr.dlid == 0 ||
+ attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
+ goto inval;
+
+ if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
+ (attr->ah_attr.grh.sgid_index > 1))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
+ goto inval;
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ if (attr->min_rnr_timer > 31)
+ goto inval;
+
+ if (attr_mask & IB_QP_PORT)
+ if (attr->port_num == 0 ||
+ attr->port_num > ibqp->device->phys_port_cnt)
+ goto inval;
+
+ /*
+ * don't allow invalid Path MTU values or greater than 2048
+ * unless we are configured for a 4KB MTU
+ */
+ if ((attr_mask & IB_QP_PATH_MTU) &&
+ (ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
+ (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
+ goto inval;
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE)
+ if (attr->path_mig_state != IB_MIG_MIGRATED &&
+ attr->path_mig_state != IB_MIG_REARM)
+ goto inval;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
+ goto inval;
+
+ switch (new_state) {
+ case IB_QPS_RESET:
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ spin_unlock_irq(&qp->s_lock);
+ /* Stop the sending tasklet */
+ tasklet_kill(&qp->s_task);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ spin_lock_irq(&qp->s_lock);
+ }
+ ipath_reset_qp(qp, ibqp->qp_type);
+ break;
+
+ case IB_QPS_SQD:
+ qp->s_draining = qp->s_last != qp->s_cur;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQE:
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ goto inval;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_ERR:
+ lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ qp->state = new_state;
+ break;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_psn = qp->s_next_psn = attr->sq_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV) {
+ qp->remote_ah_attr = attr->ah_attr;
+ qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU)
+ qp->path_mtu = attr->path_mtu;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry = attr->rnr_retry;
+ if (qp->s_rnr_retry > 7)
+ qp->s_rnr_retry = 7;
+ qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ qp->timeout = attr->timeout;
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+ spin_unlock_irq(&qp->s_lock);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ ret = 0;
+ goto bail;
+
+inval:
+ spin_unlock_irq(&qp->s_lock);
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+
+ attr->qp_state = qp->state;
+ attr->cur_qp_state = attr->qp_state;
+ attr->path_mtu = qp->path_mtu;
+ attr->path_mig_state = 0;
+ attr->qkey = qp->qkey;
+ attr->rq_psn = qp->r_psn;
+ attr->sq_psn = qp->s_next_psn;
+ attr->dest_qp_num = qp->remote_qpn;
+ attr->qp_access_flags = qp->qp_access_flags;
+ attr->cap.max_send_wr = qp->s_size - 1;
+ attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+ attr->cap.max_send_sge = qp->s_max_sge;
+ attr->cap.max_recv_sge = qp->r_rq.max_sge;
+ attr->cap.max_inline_data = 0;
+ attr->ah_attr = qp->remote_ah_attr;
+ memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+ attr->pkey_index = qp->s_pkey_index;
+ attr->alt_pkey_index = 0;
+ attr->en_sqd_async_notify = 0;
+ attr->sq_draining = qp->s_draining;
+ attr->max_rd_atomic = qp->s_max_rd_atomic;
+ attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+ attr->min_rnr_timer = qp->r_min_rnr_timer;
+ attr->port_num = 1;
+ attr->timeout = qp->timeout;
+ attr->retry_cnt = qp->s_retry_cnt;
+ attr->rnr_retry = qp->s_rnr_retry_cnt;
+ attr->alt_port_num = 0;
+ attr->alt_timeout = 0;
+
+ init_attr->event_handler = qp->ibqp.event_handler;
+ init_attr->qp_context = qp->ibqp.qp_context;
+ init_attr->send_cq = qp->ibqp.send_cq;
+ init_attr->recv_cq = qp->ibqp.recv_cq;
+ init_attr->srq = qp->ibqp.srq;
+ init_attr->cap = attr->cap;
+ if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ else
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->qp_type = qp->ibqp.qp_type;
+ init_attr->port_num = 1;
+ return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+ u32 aeth = qp->r_msn & IPATH_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ struct ipath_rwq *wq = qp->r_rq.wq;
+ u32 head;
+ u32 tail;
+
+ /* sanity check pointers before trusting them */
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * XXX Not holding the r_rq.lock here so there is a small
+ * chance that the pair of reads are not atomic.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits)
+ max = x;
+ else if (min == x)
+ break;
+ else
+ min = x;
+ }
+ aeth |= x << IPATH_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_qp *qp;
+ int err;
+ struct ipath_swqe *swq = NULL;
+ struct ipath_ibdev *dev;
+ size_t sz;
+ size_t sg_list_sz;
+ struct ib_qp *ret;
+
+ if (init_attr->create_flags) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
+ init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ /* Check receive queue parameters if no SRQ is specified. */
+ if (!init_attr->srq) {
+ if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
+ init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ if (init_attr->cap.max_send_sge +
+ init_attr->cap.max_send_wr +
+ init_attr->cap.max_recv_sge +
+ init_attr->cap.max_recv_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ }
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ sz = sizeof(struct ipath_sge) *
+ init_attr->cap.max_send_sge +
+ sizeof(struct ipath_swqe);
+ swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ if (swq == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ sz = sizeof(*qp);
+ sg_list_sz = 0;
+ if (init_attr->srq) {
+ struct ipath_srq *srq = to_isrq(init_attr->srq);
+
+ if (srq->rq.max_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (srq->rq.max_sge - 1);
+ } else if (init_attr->cap.max_recv_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (init_attr->cap.max_recv_sge - 1);
+ qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
+ if (!qp) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_swq;
+ }
+ if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+ init_attr->qp_type == IB_QPT_SMI ||
+ init_attr->qp_type == IB_QPT_GSI)) {
+ qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+ if (!qp->r_ud_sg_list) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ } else
+ qp->r_ud_sg_list = NULL;
+ if (init_attr->srq) {
+ sz = 0;
+ qp->r_rq.size = 0;
+ qp->r_rq.max_sge = 0;
+ qp->r_rq.wq = NULL;
+ init_attr->cap.max_recv_wr = 0;
+ init_attr->cap.max_recv_sge = 0;
+ } else {
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+ sizeof(struct ipath_rwqe);
+ qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
+ qp->r_rq.size * sz);
+ if (!qp->r_rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_sg_list;
+ }
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->s_lock);
+ spin_lock_init(&qp->r_rq.lock);
+ atomic_set(&qp->refcount, 0);
+ init_waitqueue_head(&qp->wait);
+ init_waitqueue_head(&qp->wait_dma);
+ tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
+ INIT_LIST_HEAD(&qp->piowait);
+ INIT_LIST_HEAD(&qp->timerwait);
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
+ qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+ qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
+ else
+ qp->s_flags = 0;
+ dev = to_idev(ibpd->device);
+ err = ipath_alloc_qpn(&dev->qp_table, qp,
+ init_attr->qp_type);
+ if (err) {
+ ret = ERR_PTR(err);
+ vfree(qp->r_rq.wq);
+ goto bail_sg_list;
+ }
+ qp->ip = NULL;
+ qp->s_tx = NULL;
+ ipath_reset_qp(qp, init_attr->qp_type);
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ ret = ERR_PTR(-ENOSYS);
+ goto bail;
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ if (!qp->r_rq.wq) {
+ __u64 offset = 0;
+
+ err = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else {
+ u32 s = sizeof(struct ipath_rwq) +
+ qp->r_rq.size * sz;
+
+ qp->ip =
+ ipath_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ qp->r_rq.wq);
+ if (!qp->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ err = ib_copy_to_udata(udata, &(qp->ip->offset),
+ sizeof(qp->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ }
+ }
+
+ spin_lock(&dev->n_qps_lock);
+ if (dev->n_qps_allocated == ib_ipath_max_qps) {
+ spin_unlock(&dev->n_qps_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_qps_allocated++;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &qp->ibqp;
+ goto bail;
+
+bail_ip:
+ if (qp->ip)
+ kref_put(&qp->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ ipath_free_qp(&dev->qp_table, qp);
+ free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+ kfree(qp->r_ud_sg_list);
+bail_qp:
+ kfree(qp);
+bail_swq:
+ vfree(swq);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+ /* Make sure HW and driver activity is stopped. */
+ spin_lock_irq(&qp->s_lock);
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ spin_unlock_irq(&qp->s_lock);
+ /* Stop the sending tasklet */
+ tasklet_kill(&qp->s_task);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ } else
+ spin_unlock_irq(&qp->s_lock);
+
+ ipath_free_qp(&dev->qp_table, qp);
+
+ if (qp->s_tx) {
+ atomic_dec(&qp->refcount);
+ if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+ kfree(qp->s_tx->txreq.map_addr);
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+ spin_unlock_irq(&dev->pending_lock);
+ qp->s_tx = NULL;
+ }
+
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+ /* all user's cleaned up, mark it available */
+ free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+ spin_lock(&dev->n_qps_lock);
+ dev->n_qps_allocated--;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip)
+ kref_put(&qp->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ kfree(qp->r_ud_sg_list);
+ vfree(qp->s_wq);
+ kfree(qp);
+ return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device who's QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+ int i;
+ int ret;
+
+ idev->qp_table.last = 1; /* QPN 0 and 1 are special. */
+ idev->qp_table.max = size;
+ idev->qp_table.nmaps = 1;
+ idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+ GFP_KERNEL);
+ if (idev->qp_table.table == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+ atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+ idev->qp_table.map[i].page = NULL;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
+
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == IPATH_AETH_CREDIT_INVAL)
+ qp->s_lsn = (u32) -1;
+ else if (qp->s_lsn != (u32) -1) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
+ if (ipath_cmp24(credit, qp->s_lsn) > 0)
+ qp->s_lsn = credit;
+ }
+
+ /* Restart sending if it was blocked due to lack of credits. */
+ if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+ qp->s_cur != qp->s_head &&
+ (qp->s_lsn == (u32) -1 ||
+ ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+ qp->s_lsn + 1) <= 0))
+ ipath_schedule_send(qp);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c
new file mode 100644
index 000000000000..79b3dbc97179
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_rc.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ipath_skip_sge(ss, len);
+ return wqe->length - len;
+}
+
+/**
+ * ipath_init_restart- initialize the qp->s_sge after a restart
+ * @qp: the QP who's SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+ struct ipath_ibdev *dev;
+
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
+ ib_mtu_enum_to_int(qp->path_mtu));
+ dev = to_idev(qp->ibqp.device);
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr, u32 pmtu)
+{
+ struct ipath_ack_entry *e;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ switch (qp->s_ack_state) {
+ case OP(RDMA_READ_RESPONSE_LAST):
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ /*
+ * We can increment the tail pointer now that the last
+ * response has been sent instead of only being
+ * constructed.
+ */
+ if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+ qp->s_tail_ack_queue = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_ONLY):
+ case OP(ACKNOWLEDGE):
+ /* Check for no next entry in the queue. */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+ if (qp->s_flags & IPATH_S_ACK_PENDING)
+ goto normal;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ goto bail;
+ }
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST)) {
+ /* Copy SGE state in case we need to resend */
+ qp->s_ack_rdma_sge = e->rdma_sge;
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ len = e->rdma_sge.sge.sge_length;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+ } else {
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+ e->sent = 1;
+ }
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_rdma_psn = e->psn;
+ bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+ } else {
+ /* COMPARE_SWAP or FETCH_ADD */
+ qp->s_cur_sge = NULL;
+ len = 0;
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ ohdr->u.at.aeth = ipath_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth[0] =
+ cpu_to_be32(e->atomic_data >> 32);
+ ohdr->u.at.atomic_ack_eth[1] =
+ cpu_to_be32(e->atomic_data);
+ hwords += sizeof(ohdr->u.at) / sizeof(u32);
+ bth2 = e->psn;
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ len = qp->s_ack_rdma_sge.sge.sge_length;
+ if (len > pmtu)
+ len = pmtu;
+ else {
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+ break;
+
+ default:
+ normal:
+ /*
+ * Send a regular ACK.
+ * Set the s_ack_state so we wait until after sending
+ * the ACK before setting s_ack_state to ACKNOWLEDGE
+ * (see above).
+ */
+ qp->s_ack_state = OP(SEND_ONLY);
+ qp->s_flags &= ~IPATH_S_ACK_PENDING;
+ qp->s_cur_sge = NULL;
+ if (qp->s_nak_state)
+ ohdr->u.aeth =
+ cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+ (qp->s_nak_state <<
+ IPATH_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ len = 0;
+ bth0 = OP(ACKNOWLEDGE) << 24;
+ bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
+ }
+ qp->s_hdrwords = hwords;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
+ return 1;
+
+bail:
+ return 0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_rc_req(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_other_headers *ohdr;
+ struct ipath_sge_state *ss;
+ struct ipath_swqe *wqe;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ char newreq;
+ unsigned long flags;
+ int ret = 0;
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+ /*
+ * The lock is needed to synchronize between the sending tasklet,
+ * the receive interrupt handler, and timeout resends.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Sending responses has higher priority over sending requests. */
+ if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+ (qp->s_flags & IPATH_S_ACK_PENDING) ||
+ qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+ ipath_make_rc_ack(dev, qp, ohdr, pmtu))
+ goto done;
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ /* Leave BUSY set until RNR timeout. */
+ if (qp->s_rnr_timeout) {
+ qp->s_flags |= IPATH_S_WAITING;
+ goto bail;
+ }
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 1 << 22; /* Set M bit */
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ if (!(ib_ipath_state_ops[qp->state] &
+ IPATH_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head)
+ goto bail;
+ /*
+ * If a fence is requested, wait for previous
+ * RDMA read and atomic operations to finish.
+ */
+ if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ qp->s_num_rd_atomic) {
+ qp->s_flags |= IPATH_S_FENCE_PENDING;
+ goto bail;
+ }
+ wqe->psn = qp->s_next_psn;
+ newreq = 1;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = 0;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq && qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / sizeof(u32);
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= IPATH_S_RDMAR_PENDING;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ /*
+ * Adjust s_next_psn to count the
+ * expected number of responses.
+ */
+ if (len > pmtu)
+ qp->s_next_psn += (len - 1) / pmtu;
+ wqe->lpsn = qp->s_next_psn++;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= IPATH_S_RDMAR_PENDING;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ wqe->lpsn = wqe->psn;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_state = OP(COMPARE_SWAP);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.swap);
+ ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ } else {
+ qp->s_state = OP(FETCH_ADD);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ ohdr->u.atomic_eth.compare_data = 0;
+ }
+ ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr >> 32);
+ ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr);
+ ohdr->u.atomic_eth.rkey = cpu_to_be32(
+ wqe->wr.wr.atomic.rkey);
+ hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = wqe->length;
+ if (newreq) {
+ qp->s_tail++;
+ if (qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ bth2 |= qp->s_psn & IPATH_PSN_MASK;
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_psn = wqe->lpsn + 1;
+ else {
+ qp->s_psn++;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ }
+ /*
+ * Put the QP on the pending list so lost ACKs will cause
+ * a retry. More than one request can be pending so the
+ * QP may already be on the dev->pending list.
+ */
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ /*
+ * This case can only happen if a send is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /*
+ * This case can only happen if a RDMA write is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /*
+ * This case can only happen if a RDMA read is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ bth2 = qp->s_psn & IPATH_PSN_MASK;
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ len = 0;
+ qp->s_cur++;
+ if (qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from ipath_rc_rcv() and only uses the receive
+ * side QP state.
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+static void send_rc_ack(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_devdata *dd;
+ u16 lrh0;
+ u32 bth0;
+ u32 hwords;
+ u32 __iomem *piobuf;
+ struct ipath_ib_header hdr;
+ struct ipath_other_headers *ohdr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+ if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+ (qp->s_flags & IPATH_S_ACK_PENDING) ||
+ qp->s_ack_state != OP(ACKNOWLEDGE))
+ goto queue_ack;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Don't try to send ACKs if the link isn't ACTIVE */
+ dd = dev->dd;
+ if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+ goto done;
+
+ piobuf = ipath_getpiobuf(dd, 0, NULL);
+ if (!piobuf) {
+ /*
+ * We are out of PIO buffers at the moment.
+ * Pass responsibility for sending the ACK to the
+ * send tasklet so that when a PIO buffer becomes
+ * available, the ACK is sent ahead of other outgoing
+ * packets.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ goto queue_ack;
+ }
+
+ /* Construct the header. */
+ ohdr = &hdr.u.oth;
+ lrh0 = IPATH_LRH_BTH;
+ /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+ hwords = 6;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ hwords += ipath_make_grh(dev, &hdr.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ hwords, 0);
+ ohdr = &hdr.u.l.oth;
+ lrh0 = IPATH_LRH_GRH;
+ }
+ /* read pkey_index w/o lock (its atomic) */
+ bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
+ (OP(ACKNOWLEDGE) << 24) | (1 << 22);
+ if (qp->r_nak_state)
+ ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+ (qp->r_nak_state <<
+ IPATH_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+ qp->remote_ah_attr.src_path_bits);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
+
+ writeq(hwords + 1, piobuf);
+
+ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+ u32 *hdrp = (u32 *) &hdr;
+
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
+ ipath_flush_wc();
+ __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+ } else
+ __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+ ipath_flush_wc();
+
+ dev->n_unicast_xmit++;
+ goto done;
+
+queue_ack:
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+ dev->n_rc_qacks++;
+ qp->s_flags |= IPATH_S_ACK_PENDING;
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+ u32 n = qp->s_last;
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
+ u32 opcode;
+
+ qp->s_cur = n;
+
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (ipath_cmp24(psn, wqe->psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+
+ /* Find the work request opcode corresponding to the given PSN. */
+ opcode = wqe->wr.opcode;
+ for (;;) {
+ int diff;
+
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, n);
+ diff = ipath_cmp24(psn, wqe->psn);
+ if (diff < 0)
+ break;
+ qp->s_cur = n;
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (diff == 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+ opcode = wqe->wr.opcode;
+ }
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See ipath_make_rc_req().
+ */
+ switch (opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ break;
+
+ default:
+ /*
+ * This case shouldn't happen since its only
+ * one PSN per req.
+ */
+ qp->s_state = OP(SEND_LAST);
+ }
+done:
+ qp->s_psn = psn;
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ * @wc: the work completion request
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
+{
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+ struct ipath_ibdev *dev;
+
+ if (qp->s_retry == 0) {
+ ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ goto bail;
+ }
+ qp->s_retry--;
+
+ /*
+ * Remove the QP from the timeout queue.
+ * Note: it may already have been removed by ipath_ib_timer().
+ */
+ dev = to_idev(qp->ibqp.device);
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ dev->n_rc_resends++;
+ else
+ dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+ reset_psn(qp, psn);
+ ipath_schedule_send(qp);
+
+bail:
+ return;
+}
+
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+ qp->s_last_psn = psn;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held and interrupts disabled.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ enum ib_wc_status status;
+ struct ipath_swqe *wqe;
+ int ret = 0;
+ u32 ack_psn;
+ int diff;
+
+ /*
+ * Remove the QP from the timeout queue (or RNR timeout queue).
+ * If ipath_ib_timer() has already removed it,
+ * it's OK since we hold the QP s_lock and ipath_restart_rc()
+ * just won't find anything to restart if we ACK everything.
+ */
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ spin_unlock(&dev->pending_lock);
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request. The MSN won't include the NAK'ed
+ * request but will include an ACK'ed request(s).
+ */
+ ack_psn = psn;
+ if (aeth >> 29)
+ ack_psn--;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+ /*
+ * RDMA_READ_RESPONSE_ONLY is a special case since
+ * we want to generate completion events for everything
+ * before the RDMA read, copy the data, then generate
+ * the completion for the read.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+ diff == 0) {
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * If this request is a RDMA read or atomic, and the ACK is
+ * for a later operation, this ACK NAKs the RDMA read or
+ * atomic. In other words, only a RDMA_READ_LAST or ONLY
+ * can ACK a RDMA read and likewise for atomic ops. Note
+ * that the NAK case can only happen if relaxed ordering is
+ * used and requests are sent after an RDMA read or atomic
+ * is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+ /*
+ * The last valid PSN seen is the previous
+ * request's.
+ */
+ update_last_psn(qp, wqe->psn - 1);
+ /* Retry this request. */
+ ipath_restart_rc(qp, wqe->psn);
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ goto bail;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+ *(u64 *) wqe->sg_list[0].vaddr = val;
+ if (qp->s_num_rd_atomic &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+ qp->s_num_rd_atomic--;
+ /* Restart sending task if fence is complete */
+ if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+ !qp->s_num_rd_atomic) ||
+ qp->s_flags & IPATH_S_RDMAR_PENDING)
+ ipath_schedule_send(qp);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof wc);
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If we are completing a request which is in the process of
+ * being resent, we can stop resending it since we know the
+ * responder has already seen it.
+ */
+ if (qp->s_last == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_last = qp->s_cur;
+ if (qp->s_last == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = wqe->psn;
+ } else {
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+ qp->s_draining = 0;
+ if (qp->s_last == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ }
+ }
+
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ dev->n_rc_acks++;
+ /* If this is a partial ACK, reset the retransmit timer. */
+ if (qp->s_last != qp->s_tail) {
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+ /*
+ * If we get a partial ACK for a resent operation,
+ * we can stop resending the earlier packets and
+ * continue with the next packet the receiver wants.
+ */
+ if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+ reset_psn(qp, psn + 1);
+ ipath_schedule_send(qp);
+ }
+ } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = psn + 1;
+ }
+ ipath_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, psn);
+ ret = 1;
+ goto bail;
+
+ case 1: /* RNR NAK */
+ dev->n_rnr_naks++;
+ if (qp->s_last == qp->s_tail)
+ goto bail;
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ dev->n_rc_resends++;
+ else
+ dev->n_rc_resends +=
+ (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+ reset_psn(qp, psn);
+
+ qp->s_rnr_timeout =
+ ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
+ IPATH_AETH_CREDIT_MASK];
+ ipath_insert_rnr_queue(qp);
+ ipath_schedule_send(qp);
+ goto bail;
+
+ case 3: /* NAK */
+ if (qp->s_last == qp->s_tail)
+ goto bail;
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+ switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
+ IPATH_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ dev->n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * Note that we might get a NAK in the middle of an
+ * RDMA READ response which terminates the RDMA
+ * READ.
+ */
+ ipath_restart_rc(qp, psn);
+ break;
+
+ case 1: /* Invalid Request */
+ status = IB_WC_REM_INV_REQ_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ status = IB_WC_REM_ACCESS_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ status = IB_WC_REM_OP_ERR;
+ dev->n_other_naks++;
+ class_b:
+ ipath_send_complete(qp, wqe, status);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ goto bail;
+
+ default: /* 2: reserved */
+ reserved:
+ /* Ignore reserved NAK codes. */
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+ struct ipath_other_headers *ohdr,
+ void *data, u32 tlen,
+ struct ipath_qp *qp,
+ u32 opcode,
+ u32 psn, u32 hdrsize, u32 pmtu,
+ int header_in_data)
+{
+ struct ipath_swqe *wqe;
+ enum ib_wc_status status;
+ unsigned long flags;
+ int diff;
+ u32 pad;
+ u32 aeth;
+ u64 val;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Double check we can process this now that we hold the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto ack_done;
+
+ /* Ignore invalid responses. */
+ if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ diff = ipath_cmp24(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ if ((aeth >> 29) == 0)
+ ipath_get_credit(qp, aeth);
+ }
+ goto ack_done;
+ }
+
+ if (unlikely(qp->s_last == qp->s_tail))
+ goto ack_done;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ status = IB_WC_SUCCESS;
+
+ switch (opcode) {
+ case OP(ACKNOWLEDGE):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+ if (!header_in_data) {
+ __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+ val = ((u64) be32_to_cpu(p[0]) << 32) |
+ be32_to_cpu(p[1]);
+ } else
+ val = be64_to_cpu(((__be64 *) data)[0]);
+ } else
+ val = 0;
+ if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
+ opcode != OP(RDMA_READ_RESPONSE_FIRST))
+ goto ack_done;
+ hdrsize += 4;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_middle;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /* no AETH, no ACK */
+ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+ dev->n_rdma_seq++;
+ if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+ goto ack_done;
+ qp->r_flags |= IPATH_R_RDMAR_SEQ;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ goto ack_done;
+ }
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ read_middle:
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_len_err;
+ if (unlikely(pmtu >= qp->s_rdma_read_len))
+ goto ack_len_err;
+
+ /* We got a response so update the timeout. */
+ spin_lock(&dev->pending_lock);
+ if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
+ list_move_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+
+ if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+ qp->s_retry = qp->s_retry_cnt;
+
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ */
+ qp->s_rdma_read_len -= pmtu;
+ update_last_psn(qp, psn);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+ goto bail;
+
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0))
+ goto ack_done;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 0 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen < (hdrsize + pad + 8)))
+ goto ack_len_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_last;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /* ACKs READ req. */
+ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+ dev->n_rdma_seq++;
+ if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+ goto ack_done;
+ qp->r_flags |= IPATH_R_RDMAR_SEQ;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ goto ack_done;
+ }
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 8)))
+ goto ack_len_err;
+ read_last:
+ tlen -= hdrsize + pad + 8;
+ if (unlikely(tlen != qp->s_rdma_read_len))
+ goto ack_len_err;
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+ (void) do_rc_ack(qp, aeth, psn,
+ OP(RDMA_READ_RESPONSE_LAST), 0);
+ goto ack_done;
+ }
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ goto ack_err;
+
+ack_len_err:
+ status = IB_WC_LOC_LEN_ERR;
+ack_err:
+ ipath_send_complete(qp, wqe, status);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+ return;
+}
+
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+ struct ipath_other_headers *ohdr,
+ void *data,
+ struct ipath_qp *qp,
+ u32 opcode,
+ u32 psn,
+ int diff,
+ int header_in_data)
+{
+ struct ipath_ack_entry *e;
+ u8 i, prev;
+ int old_req;
+ unsigned long flags;
+
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if we already sent one.
+ */
+ if (!qp->r_nak_state) {
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+ }
+ goto done;
+ }
+
+ /*
+ * Handle a duplicate request. Don't re-execute SEND, RDMA
+ * write or atomic op. Don't NAK errors, just silently drop
+ * the duplicate request. Note that r_sge, r_len, and
+ * r_rcv_len may be in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN but we
+ * can coalesce an outstanding duplicate ACK. We have to
+ * send the earliest so that RDMA reads can be restarted at
+ * the requester's expected PSN.
+ *
+ * First, find where this duplicate PSN falls within the
+ * ACKs previously sent.
+ */
+ psn &= IPATH_PSN_MASK;
+ e = NULL;
+ old_req = 1;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this now that we hold the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock_done;
+
+ for (i = qp->r_head_ack_queue; ; i = prev) {
+ if (i == qp->s_tail_ack_queue)
+ old_req = 0;
+ if (i)
+ prev = i - 1;
+ else
+ prev = IPATH_MAX_RDMA_ATOMIC;
+ if (prev == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[prev];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (ipath_cmp24(psn, e->psn) >= 0) {
+ if (prev == qp->s_tail_ack_queue)
+ old_req = 0;
+ break;
+ }
+ }
+ switch (opcode) {
+ case OP(RDMA_READ_REQUEST): {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+
+ /*
+ * If we didn't find the RDMA read request in the ack queue,
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
+ goto unlock_done;
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ /*
+ * Address range must be a subset of the original
+ * request and start on pmtu boundaries.
+ * We reuse the old ack_queue slot since the requester
+ * should not back up and request an earlier PSN for the
+ * same request.
+ */
+ offset = ((psn - e->psn) & IPATH_PSN_MASK) *
+ ib_mtu_enum_to_int(qp->path_mtu);
+ len = be32_to_cpu(reth->length);
+ if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+ goto unlock_done;
+ if (len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ ok = ipath_rkey_ok(qp, &e->rdma_sge,
+ len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock_done;
+ } else {
+ e->rdma_sge.sg_list = NULL;
+ e->rdma_sge.num_sge = 0;
+ e->rdma_sge.sge.mr = NULL;
+ e->rdma_sge.sge.vaddr = NULL;
+ e->rdma_sge.sge.length = 0;
+ e->rdma_sge.sge.sge_length = 0;
+ }
+ e->psn = psn;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ /*
+ * If we didn't find the atomic request in the ack queue
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != (u8) opcode || old_req)
+ goto unlock_done;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ default:
+ if (old_req)
+ goto unlock_done;
+ /*
+ * Resend the most recent ACK if this request is
+ * after all the previous RDMA reads and atomics.
+ */
+ if (i == qp->r_head_ack_queue) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->r_psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Try to send a simple ACK to work around a Mellanox bug
+ * which doesn't accept a RDMA read response or atomic
+ * response as an ACK for earlier SENDs or RDMA writes.
+ */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+ !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+ qp->s_ack_state == OP(ACKNOWLEDGE)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Resend the RDMA read or atomic op which
+ * ACKs this duplicate request.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = i;
+ break;
+ }
+ qp->r_nak_state = 0;
+ ipath_schedule_send(qp);
+
+unlock_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+
+send_ack:
+ return 0;
+}
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = ipath_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+ unsigned next;
+
+ next = n + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ if (n == qp->s_tail_ack_queue) {
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ }
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ u32 opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ int diff;
+ struct ib_reth *reth;
+ int header_in_data;
+ unsigned long flags;
+
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+ goto done;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the core driver sets
+ * the eager header buffer size to 56 bytes so the last 4
+ * bytes of the BTH header (PSN) is in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ psn = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ } else
+ psn = be32_to_cpu(ohdr->bth[2]);
+ }
+
+ /*
+ * Process responses (ACKs) before anything else. Note that the
+ * packet sequence number will be for something in the send work
+ * queue rather than the expected receive packet sequence number.
+ * In other words, this QP is the requester.
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+ hdrsize, pmtu, header_in_data);
+ goto done;
+ }
+
+ /* Compute 24 bits worth of difference. */
+ diff = ipath_cmp24(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+ psn, diff, header_in_data))
+ goto done;
+ goto send_ack;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ default:
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ goto nack_inv;
+ /*
+ * Note that it is up to the requester to not send a new
+ * RDMA read or atomic operation before receiving an ACK
+ * for the previous operation.
+ */
+ break;
+ }
+
+ memset(&wc, 0, sizeof wc);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ case OP(RDMA_WRITE_MIDDLE):
+ send_middle:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto nack_inv;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto nack_inv;
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ /* consume RWQE */
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto send_last;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+ send_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST):
+ case OP(RDMA_WRITE_LAST):
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto nack_inv;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto nack_inv;
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ qp->r_msn++;
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+ /* consume RWQE */
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = ipath_rkey_ok(qp, &qp->r_sge,
+ qp->r_len, vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto nack_acc;
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_FIRST))
+ goto send_middle;
+ else if (opcode == OP(RDMA_WRITE_ONLY))
+ goto send_last;
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(RDMA_READ_REQUEST): {
+ struct ipath_ack_entry *e;
+ u32 len;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this while holding the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock;
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ ipath_update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ len = be32_to_cpu(reth->length);
+ if (len) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto nack_acc_unlck;
+ /*
+ * Update the next expected PSN. We add 1 later
+ * below, so only add the remainder here.
+ */
+ if (len > pmtu)
+ qp->r_psn += (len - 1) / pmtu;
+ } else {
+ e->rdma_sge.sg_list = NULL;
+ e->rdma_sge.num_sge = 0;
+ e->rdma_sge.sge.mr = NULL;
+ e->rdma_sge.sge.vaddr = NULL;
+ e->rdma_sge.sge.length = 0;
+ e->rdma_sge.sge.sge_length = 0;
+ }
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+
+ goto unlock;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ struct ib_atomic_eth *ateth;
+ struct ipath_ack_entry *e;
+ u64 vaddr;
+ atomic64_t *maddr;
+ u64 sdata;
+ u32 rkey;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this while holding the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock;
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ ipath_update_ack_queue(qp, next);
+ }
+ if (!header_in_data)
+ ateth = &ohdr->u.atomic_eth;
+ else
+ ateth = (struct ib_atomic_eth *)data;
+ vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+ be32_to_cpu(ateth->vaddr[1]);
+ if (unlikely(vaddr & (sizeof(u64) - 1)))
+ goto nack_inv_unlck;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+ sizeof(u64), vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc_unlck;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = be64_to_cpu(ateth->swap_data);
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ be64_to_cpu(ateth->compare_data),
+ sdata);
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn & IPATH_PSN_MASK;
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+
+ goto unlock;
+ }
+
+ default:
+ /* NAK unknown opcodes. */
+ goto nack_inv;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_ack_psn = psn;
+ qp->r_nak_state = 0;
+ /* Send an ACK if requested or required. */
+ if (psn & (1 << 31))
+ goto send_ack;
+ goto done;
+
+rnr_nak:
+ qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+
+nack_inv_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+
+nack_acc_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+ ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ send_rc_ack(qp);
+ goto done;
+
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h
new file mode 100644
index 000000000000..8f44d0cf3833
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_registers.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags. It
+ * defines the registers, and their contents, for InfiniPath chips.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions,
+ * that are visible to software, and needed only by the kernel
+ * and diag code. A few, that are visible to protocol and user
+ * code are in ipath_common.h. Some bits are specific
+ * to a given chip implementation, and have been moved to the
+ * chip-specific source file
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+#define INFINIPATH_S_UPDTHRESH_SHIFT 24
+#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
+
+#define IPATH_S_ABORT 0
+#define IPATH_S_PIOINTBUFAVAIL 1
+#define IPATH_S_PIOBUFAVAILUPD 2
+#define IPATH_S_PIOENABLE 3
+#define IPATH_S_SDMAINTENABLE 9
+#define IPATH_S_SDMASINGLEDESCRIPTOR 10
+#define IPATH_S_SDMAENABLE 11
+#define IPATH_S_SDMAHALT 12
+#define IPATH_S_DISARM 31
+
+#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE)
+#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
+ (1U << IPATH_S_SDMASINGLEDESCRIPTOR)
+#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE)
+#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT)
+#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits that are the same on multiple chips */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL
+#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL
+#define INFINIPATH_I_ERROR 0x0000000080000000ULL
+#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL
+#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL
+#define INFINIPATH_I_GPIO 0x0000000010000000ULL
+#define INFINIPATH_I_JINT 0x0000000004000000ULL
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC 0x0000000000000002ULL
+#define INFINIPATH_E_RICRC 0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL
+#define INFINIPATH_E_REBP 0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID 0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL
+#define INFINIPATH_E_RHDR 0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL
+#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL
+#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL
+#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL
+#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL
+#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL
+#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL
+#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL
+#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL
+#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL
+#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL
+#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL
+#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL
+#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL
+#define INFINIPATH_E_RESET 0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL
+#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL
+#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL
+
+/*
+ * this is used to print "common" packet errors only when the
+ * __IPATH_ERRPKTDBG bit is set in ipath_debug.
+ */
+#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
+ | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
+ | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
+ | INFINIPATH_E_REBP )
+
+/* Convenience for decoding Send DMA errors */
+#define INFINIPATH_E_SDMAERRS ( \
+ INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
+ INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
+ INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
+ INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
+ INFINIPATH_E_SDMAUNEXPDATA | \
+ INFINIPATH_E_SDMADESCADDRMISALIGN | \
+ INFINIPATH_E_SDMADISABLED | \
+ INFINIPATH_E_SENDBUFMISUSE)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID
+ * bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL
+/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
+#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+
+#define INFINIPATH_IBCS_TXREADY 0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK 0x80000000
+/* link training states (shift by
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f
+/* link state machine states (shift by ibcs_ls_shift) */
+#define INFINIPATH_IBCS_L_STATE_DOWN 0x0
+#define INFINIPATH_IBCS_L_STATE_INIT 0x1
+#define INFINIPATH_IBCS_L_STATE_ARM 0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4
+
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */
+/* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL
+/* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
+/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */
+#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL
+
+/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
+#define INFINIPATH_XGXS_RX_POL_SHIFT 19
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
+
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers. This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* The following change with the type of device, so
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file is may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if common-model
+ * linker used, none if C89+ linker used.
+ */
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers. The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg, /* infinipath general registers */
+ ipath_creg, /* infinipath counter registers */
+ ipath_sreg; /* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits. Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity). The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+ /* These are the 32 bit group */
+ ipath_kreg kr_control;
+ ipath_kreg kr_counterregbase;
+ ipath_kreg kr_intmask;
+ ipath_kreg kr_intstatus;
+ ipath_kreg kr_pagealign;
+ ipath_kreg kr_portcnt;
+ ipath_kreg kr_rcvtidbase;
+ ipath_kreg kr_rcvtidcnt;
+ ipath_kreg kr_rcvegrbase;
+ ipath_kreg kr_rcvegrcnt;
+ ipath_kreg kr_scratch;
+ ipath_kreg kr_sendctrl;
+ ipath_kreg kr_sendpiobufbase;
+ ipath_kreg kr_sendpiobufcnt;
+ ipath_kreg kr_sendpiosize;
+ ipath_kreg kr_sendregbase;
+ ipath_kreg kr_userregbase;
+ /* These are the 64 bit group */
+ ipath_kreg kr_debugport;
+ ipath_kreg kr_debugportselect;
+ ipath_kreg kr_errorclear;
+ ipath_kreg kr_errormask;
+ ipath_kreg kr_errorstatus;
+ ipath_kreg kr_extctrl;
+ ipath_kreg kr_extstatus;
+ ipath_kreg kr_gpio_clear;
+ ipath_kreg kr_gpio_mask;
+ ipath_kreg kr_gpio_out;
+ ipath_kreg kr_gpio_status;
+ ipath_kreg kr_hwdiagctrl;
+ ipath_kreg kr_hwerrclear;
+ ipath_kreg kr_hwerrmask;
+ ipath_kreg kr_hwerrstatus;
+ ipath_kreg kr_ibcctrl;
+ ipath_kreg kr_ibcstatus;
+ ipath_kreg kr_intblocked;
+ ipath_kreg kr_intclear;
+ ipath_kreg kr_interruptconfig;
+ ipath_kreg kr_mdio;
+ ipath_kreg kr_partitionkey;
+ ipath_kreg kr_rcvbthqp;
+ ipath_kreg kr_rcvbufbase;
+ ipath_kreg kr_rcvbufsize;
+ ipath_kreg kr_rcvctrl;
+ ipath_kreg kr_rcvhdrcnt;
+ ipath_kreg kr_rcvhdrentsize;
+ ipath_kreg kr_rcvhdrsize;
+ ipath_kreg kr_rcvintmembase;
+ ipath_kreg kr_rcvintmemsize;
+ ipath_kreg kr_revision;
+ ipath_kreg kr_sendbuffererror;
+ ipath_kreg kr_sendpioavailaddr;
+ ipath_kreg kr_serdesconfig0;
+ ipath_kreg kr_serdesconfig1;
+ ipath_kreg kr_serdesstatus;
+ ipath_kreg kr_txintmembase;
+ ipath_kreg kr_txintmemsize;
+ ipath_kreg kr_xgxsconfig;
+ ipath_kreg kr_ibpllcfg;
+ /* use these two (and the following N ports) only with
+ * ipath_k*_kreg64_port(); not *kreg64() */
+ ipath_kreg kr_rcvhdraddr;
+ ipath_kreg kr_rcvhdrtailaddr;
+
+ /* remaining registers are not present on all types of infinipath
+ chips */
+ ipath_kreg kr_rcvpktledcnt;
+ ipath_kreg kr_pcierbuftestreg0;
+ ipath_kreg kr_pcierbuftestreg1;
+ ipath_kreg kr_pcieq0serdesconfig0;
+ ipath_kreg kr_pcieq0serdesconfig1;
+ ipath_kreg kr_pcieq0serdesstatus;
+ ipath_kreg kr_pcieq1serdesconfig0;
+ ipath_kreg kr_pcieq1serdesconfig1;
+ ipath_kreg kr_pcieq1serdesstatus;
+ ipath_kreg kr_hrtbt_guid;
+ ipath_kreg kr_ibcddrctrl;
+ ipath_kreg kr_ibcddrstatus;
+ ipath_kreg kr_jintreload;
+
+ /* send dma related regs */
+ ipath_kreg kr_senddmabase;
+ ipath_kreg kr_senddmalengen;
+ ipath_kreg kr_senddmatail;
+ ipath_kreg kr_senddmahead;
+ ipath_kreg kr_senddmaheadaddr;
+ ipath_kreg kr_senddmabufmask0;
+ ipath_kreg kr_senddmabufmask1;
+ ipath_kreg kr_senddmabufmask2;
+ ipath_kreg kr_senddmastatus;
+
+ /* SerDes related regs (IBA7220-only) */
+ ipath_kreg kr_ibserdesctrl;
+ ipath_kreg kr_ib_epbacc;
+ ipath_kreg kr_ib_epbtrans;
+ ipath_kreg kr_pcie_epbacc;
+ ipath_kreg kr_pcie_epbtrans;
+ ipath_kreg kr_ib_ddsrxeq;
+};
+
+struct ipath_cregs {
+ ipath_creg cr_badformatcnt;
+ ipath_creg cr_erricrccnt;
+ ipath_creg cr_errlinkcnt;
+ ipath_creg cr_errlpcrccnt;
+ ipath_creg cr_errpkey;
+ ipath_creg cr_errrcvflowctrlcnt;
+ ipath_creg cr_err_rlencnt;
+ ipath_creg cr_errslencnt;
+ ipath_creg cr_errtidfull;
+ ipath_creg cr_errtidvalid;
+ ipath_creg cr_errvcrccnt;
+ ipath_creg cr_ibstatuschange;
+ ipath_creg cr_intcnt;
+ ipath_creg cr_invalidrlencnt;
+ ipath_creg cr_invalidslencnt;
+ ipath_creg cr_lbflowstallcnt;
+ ipath_creg cr_iblinkdowncnt;
+ ipath_creg cr_iblinkerrrecovcnt;
+ ipath_creg cr_ibsymbolerrcnt;
+ ipath_creg cr_pktrcvcnt;
+ ipath_creg cr_pktrcvflowctrlcnt;
+ ipath_creg cr_pktsendcnt;
+ ipath_creg cr_pktsendflowcnt;
+ ipath_creg cr_portovflcnt;
+ ipath_creg cr_rcvebpcnt;
+ ipath_creg cr_rcvovflcnt;
+ ipath_creg cr_rxdroppktcnt;
+ ipath_creg cr_senddropped;
+ ipath_creg cr_sendstallcnt;
+ ipath_creg cr_sendunderruncnt;
+ ipath_creg cr_unsupvlcnt;
+ ipath_creg cr_wordrcvcnt;
+ ipath_creg cr_wordsendcnt;
+ ipath_creg cr_vl15droppedpktcnt;
+ ipath_creg cr_rxotherlocalphyerrcnt;
+ ipath_creg cr_excessbufferovflcnt;
+ ipath_creg cr_locallinkintegrityerrcnt;
+ ipath_creg cr_rxvlerrcnt;
+ ipath_creg cr_rxdlidfltrcnt;
+ ipath_creg cr_psstat;
+ ipath_creg cr_psstart;
+ ipath_creg cr_psinterval;
+ ipath_creg cr_psrcvdatacount;
+ ipath_creg cr_psrcvpktscount;
+ ipath_creg cr_psxmitdatacount;
+ ipath_creg cr_psxmitpktscount;
+ ipath_creg cr_psxmitwaitcount;
+};
+
+#endif /* _IPATH_REGISTERS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c
new file mode 100644
index 000000000000..1f95bbaf7602
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_ruc.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+ 656, /* 0 */
+ 1, /* 1 */
+ 1, /* 2 */
+ 1, /* 3 */
+ 1, /* 4 */
+ 1, /* 5 */
+ 1, /* 6 */
+ 1, /* 7 */
+ 1, /* 8 */
+ 1, /* 9 */
+ 1, /* A */
+ 1, /* B */
+ 1, /* C */
+ 1, /* D */
+ 2, /* E */
+ 2, /* F */
+ 3, /* 10 */
+ 4, /* 11 */
+ 6, /* 12 */
+ 8, /* 13 */
+ 11, /* 14 */
+ 16, /* 15 */
+ 21, /* 16 */
+ 31, /* 17 */
+ 41, /* 18 */
+ 62, /* 19 */
+ 82, /* 1A */
+ 123, /* 1B */
+ 164, /* 1C */
+ 246, /* 1D */
+ 328, /* 1E */
+ 492 /* 1F */
+};
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * Called with the QP s_lock held and interrupts disabled.
+ * XXX Use a simple list for now. We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+ /* We already did a spin_lock_irqsave(), so just use spin_lock */
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&dev->rnrwait))
+ list_add(&qp->timerwait, &dev->rnrwait);
+ else {
+ struct list_head *l = &dev->rnrwait;
+ struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+ timerwait);
+
+ while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+ qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+ l = l->next;
+ if (l->next == &dev->rnrwait) {
+ nqp = NULL;
+ break;
+ }
+ nqp = list_entry(l->next, struct ipath_qp,
+ timerwait);
+ }
+ if (nqp)
+ nqp->s_rnr_timeout -= qp->s_rnr_timeout;
+ list_add(&qp->timerwait, l);
+ }
+ spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_init_sge - Validate a RWQE and fill in the SGE state
+ * @qp: the QP
+ *
+ * Return 1 if OK.
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+ u32 *lengthp, struct ipath_sge_state *ss)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+
+ *lengthp = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+ &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ goto bad_lkey;
+ *lengthp += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ret = 1;
+ goto bail;
+
+bad_lkey:
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+ unsigned long flags;
+ struct ipath_rq *rq;
+ struct ipath_rwq *wq;
+ struct ipath_srq *srq;
+ struct ipath_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ int ret;
+
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ do {
+ if (unlikely(tail == wq->head)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after head index is read. */
+ smp_rmb();
+ wqe = get_rwqe_ptr(rq, tail);
+ if (++tail >= rq->size)
+ tail = 0;
+ if (wr_id_only)
+ break;
+ qp->r_sge.sg_list = qp->r_sg_list;
+ } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
+ qp->r_wr_id = wqe->wr_id;
+ wq->tail = tail;
+
+ ret = 1;
+ set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ u32 n;
+
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC lookback requests
+ * @sqp: the sending QP
+ *
+ * This is called from ipath_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp)
+{
+ struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+ struct ipath_qp *qp;
+ struct ipath_swqe *wqe;
+ struct ipath_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+ qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+ !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+ if (sqp->s_last == sqp->s_head)
+ goto clr_busy;
+ wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+ /* Return if it is not OK to start a new work reqeust. */
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ if (sqp->ibqp.qp_type == IB_QPT_RC)
+ send_status = IB_WC_RETRY_EXC_ERR;
+ else
+ send_status = IB_WC_SUCCESS;
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof wc);
+ send_status = IB_WC_SUCCESS;
+
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+ wqe->wr.wr.atomic.remote_addr,
+ wqe->wr.wr.atomic.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = wqe->wr.wr.atomic.compare_add;
+ *(u64 *) sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ sdata, wqe->wr.wr.atomic.swap);
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ ipath_send_complete(sqp, wqe, send_status);
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+ goto clr_busy;
+ sqp->s_flags |= IPATH_S_WAITING;
+ dev->n_rnr_naks++;
+ sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+ ipath_insert_rnr_queue(sqp);
+ goto clr_busy;
+
+inv_err:
+ send_status = IB_WC_REM_INV_REQ_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ ipath_rc_error(qp, wc.status);
+
+serr:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ ipath_send_complete(sqp, wqe, send_status);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+ sqp->s_flags &= ~IPATH_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ if (qp && atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
+{
+ if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+ qp->ibqp.qp_type == IB_QPT_SMI) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int ipath_no_bufs_available(struct ipath_qp *qp,
+ struct ipath_ibdev *dev)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ /*
+ * Note that as soon as want_buffer() is called and
+ * possibly before it returns, ipath_ib_piobufavail()
+ * could be called. Therefore, put QP on the piowait list before
+ * enabling the PIO avail interrupt.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+ dev->n_piowait++;
+ qp->s_flags |= IPATH_S_WAITING;
+ qp->s_flags &= ~IPATH_S_BUSY;
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->piowait))
+ list_add_tail(&qp->piowait, &dev->piowait);
+ spin_unlock(&dev->pending_lock);
+ } else
+ ret = 0;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (ret)
+ want_buffer(dev->dd, qp);
+ return ret;
+}
+
+/**
+ * ipath_make_grh - construct a GRH header
+ * @dev: a pointer to the ipath device
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+ hdr->version_tclass_flow =
+ cpu_to_be32((6 << 28) |
+ (grh->traffic_class << 20) |
+ grh->flow_label);
+ hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+ /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+ hdr->next_hdr = 0x1B;
+ hdr->hop_limit = grh->hop_limit;
+ /* The SGID is 32-bit aligned. */
+ hdr->sgid.global.subnet_prefix = dev->gid_prefix;
+ hdr->sgid.global.interface_id = dev->dd->ipath_guid;
+ hdr->dgid = grh->dgid;
+
+ /* GRH header size in 32-bit words. */
+ return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr,
+ u32 bth0, u32 bth2)
+{
+ u16 lrh0;
+ u32 nwords;
+ u32 extra_bytes;
+
+ /* Construct the header. */
+ extra_bytes = -qp->s_cur_size & 3;
+ nwords = (qp->s_cur_size + extra_bytes) >> 2;
+ lrh0 = IPATH_LRH_BTH;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = IPATH_LRH_GRH;
+ }
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+ qp->remote_ah_attr.src_path_bits);
+ bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * ipath_do_send - perform a send on a QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted. Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void ipath_do_send(unsigned long data)
+{
+ struct ipath_qp *qp = (struct ipath_qp *)data;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ int (*make_req)(struct ipath_qp *qp);
+ unsigned long flags;
+
+ if ((qp->ibqp.qp_type == IB_QPT_RC ||
+ qp->ibqp.qp_type == IB_QPT_UC) &&
+ qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+ ipath_ruc_loopback(qp);
+ goto bail;
+ }
+
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ make_req = ipath_make_rc_req;
+ else if (qp->ibqp.qp_type == IB_QPT_UC)
+ make_req = ipath_make_uc_req;
+ else
+ make_req = ipath_make_ud_req;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+ !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ goto bail;
+ }
+
+ qp->s_flags |= IPATH_S_BUSY;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+again:
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If no PIO bufs are available, return. An interrupt will
+ * call ipath_ib_piobufavail() when one is available.
+ */
+ if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+ qp->s_cur_sge, qp->s_cur_size)) {
+ if (ipath_no_bufs_available(qp, dev))
+ goto bail;
+ }
+ dev->n_unicast_xmit++;
+ /* Record that we sent the packet and s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+
+ if (make_req(qp))
+ goto again;
+
+bail:;
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ /* See ch. 11.2.4.1 and 10.7.3.1 */
+ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof wc);
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = status;
+ wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+ wc.qp = &qp->ibqp;
+ if (status == IB_WC_SUCCESS)
+ wc.byte_len = wqe->length;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+ status != IB_WC_SUCCESS);
+ }
+
+ old_last = last = qp->s_last;
+ if (++last >= qp->s_size)
+ last = 0;
+ qp->s_last = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c
new file mode 100644
index 000000000000..17a517766ad2
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_sdma.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
+
+static void vl15_watchdog_enq(struct ipath_devdata *dd)
+{
+ /* ipath_sdma_lock must already be held */
+ if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
+ unsigned long interval = (HZ + 19) / 20;
+ dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
+ add_timer(&dd->ipath_sdma_vl15_timer);
+ }
+}
+
+static void vl15_watchdog_deq(struct ipath_devdata *dd)
+{
+ /* ipath_sdma_lock must already be held */
+ if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
+ unsigned long interval = (HZ + 19) / 20;
+ mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
+ } else {
+ del_timer(&dd->ipath_sdma_vl15_timer);
+ }
+}
+
+static void vl15_watchdog_timeout(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
+ ipath_dbg("vl15 watchdog timeout - clearing\n");
+ ipath_cancel_sends(dd, 1);
+ ipath_hol_down(dd);
+ } else {
+ ipath_dbg("vl15 watchdog timeout - "
+ "condition already cleared\n");
+ }
+}
+
+static void unmap_desc(struct ipath_devdata *dd, unsigned head)
+{
+ __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
+ u64 desc[2];
+ dma_addr_t addr;
+ size_t len;
+
+ desc[0] = le64_to_cpu(descqp[0]);
+ desc[1] = le64_to_cpu(descqp[1]);
+
+ addr = (desc[1] << 32) | (desc[0] >> 32);
+ len = (desc[0] >> 14) & (0x7ffULL << 2);
+ dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+/*
+ * ipath_sdma_lock should be locked before calling this.
+ */
+int ipath_sdma_make_progress(struct ipath_devdata *dd)
+{
+ struct list_head *lp = NULL;
+ struct ipath_sdma_txreq *txp = NULL;
+ u16 dmahead;
+ u16 start_idx = 0;
+ int progress = 0;
+
+ if (!list_empty(&dd->ipath_sdma_activelist)) {
+ lp = dd->ipath_sdma_activelist.next;
+ txp = list_entry(lp, struct ipath_sdma_txreq, list);
+ start_idx = txp->start_idx;
+ }
+
+ /*
+ * Read the SDMA head register in order to know that the
+ * interrupt clear has been written to the chip.
+ * Otherwise, we may not get an interrupt for the last
+ * descriptor in the queue.
+ */
+ dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
+ /* sanity check return value for error handling (chip reset, etc.) */
+ if (dmahead >= dd->ipath_sdma_descq_cnt)
+ goto done;
+
+ while (dd->ipath_sdma_descq_head != dmahead) {
+ if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
+ dd->ipath_sdma_descq_head == start_idx) {
+ unmap_desc(dd, dd->ipath_sdma_descq_head);
+ start_idx++;
+ if (start_idx == dd->ipath_sdma_descq_cnt)
+ start_idx = 0;
+ }
+
+ /* increment free count and head */
+ dd->ipath_sdma_descq_removed++;
+ if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
+ dd->ipath_sdma_descq_head = 0;
+
+ if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
+ /* move to notify list */
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(lp, &dd->ipath_sdma_notifylist);
+ if (!list_empty(&dd->ipath_sdma_activelist)) {
+ lp = dd->ipath_sdma_activelist.next;
+ txp = list_entry(lp, struct ipath_sdma_txreq,
+ list);
+ start_idx = txp->start_idx;
+ } else {
+ lp = NULL;
+ txp = NULL;
+ }
+ }
+ progress = 1;
+ }
+
+ if (progress)
+ tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+done:
+ return progress;
+}
+
+static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
+{
+ struct ipath_sdma_txreq *txp, *txp_next;
+
+ list_for_each_entry_safe(txp, txp_next, list, list) {
+ list_del_init(&txp->list);
+
+ if (txp->callback)
+ (*txp->callback)(txp->callback_cookie,
+ txp->callback_status);
+ }
+}
+
+static void sdma_notify_taskbody(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ struct list_head list;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ list_splice_init(&dd->ipath_sdma_notifylist, &list);
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ ipath_sdma_notify(dd, &list);
+
+ /*
+ * The IB verbs layer needs to see the callback before getting
+ * the call to ipath_ib_piobufavail() because the callback
+ * handles releasing resources the next send will need.
+ * Otherwise, we could do these calls in
+ * ipath_sdma_make_progress().
+ */
+ ipath_ib_piobufavail(dd->verbs_dev);
+}
+
+static void sdma_notify_task(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ sdma_notify_taskbody(dd);
+}
+
+static void dump_sdma_state(struct ipath_devdata *dd)
+{
+ unsigned long reg;
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
+ ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
+ ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+ ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+ ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
+}
+
+static void sdma_abort_task(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+ u64 status;
+ unsigned long flags;
+
+ if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ return;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
+
+ /* nothing to do */
+ if (status == IPATH_SDMA_ABORT_NONE)
+ goto unlock;
+
+ /* ipath_sdma_abort() is done, waiting for interrupt */
+ if (status == IPATH_SDMA_ABORT_DISARMED) {
+ if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
+ goto resched_noprint;
+ /* give up, intr got lost somewhere */
+ ipath_dbg("give up waiting for SDMADISABLED intr\n");
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ status = IPATH_SDMA_ABORT_ABORTED;
+ }
+
+ /* everything is stopped, time to clean up and restart */
+ if (status == IPATH_SDMA_ABORT_ABORTED) {
+ struct ipath_sdma_txreq *txp, *txpnext;
+ u64 hwstatus;
+ int notify = 0;
+
+ hwstatus = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_senddmastatus);
+
+ if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+ IPATH_SDMA_STATUS_ABORT_IN_PROG |
+ IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+ !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
+ if (dd->ipath_sdma_reset_wait > 0) {
+ /* not done shutting down sdma */
+ --dd->ipath_sdma_reset_wait;
+ goto resched;
+ }
+ ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
+ "status after SDMA reset, continuing\n");
+ dump_sdma_state(dd);
+ }
+
+ /* dequeue all "sent" requests */
+ list_for_each_entry_safe(txp, txpnext,
+ &dd->ipath_sdma_activelist, list) {
+ txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+ notify = 1;
+ }
+ if (notify)
+ tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+ /* reset our notion of head and tail */
+ dd->ipath_sdma_descq_tail = 0;
+ dd->ipath_sdma_descq_head = 0;
+ dd->ipath_sdma_head_dma[0] = 0;
+ dd->ipath_sdma_generation = 0;
+ dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
+
+ /* Reset SendDmaLenGen */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
+ (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
+
+ /* done with sdma state for a bit */
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ /*
+ * Don't restart sdma here (with the exception
+ * below). Wait until link is up to ACTIVE. VL15 MADs
+ * used to bring the link up use PIO, and multiple link
+ * transitions otherwise cause the sdma engine to be
+ * stopped and started multiple times.
+ * The disable is done here, including the shadow,
+ * so the state is kept consistent.
+ * See ipath_restart_sdma() for the actual starting
+ * of sdma.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* make sure I see next message */
+ dd->ipath_sdma_abort_jiffies = 0;
+
+ /*
+ * Not everything that takes SDMA offline is a link
+ * status change. If the link was up, restart SDMA.
+ */
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ ipath_restart_sdma(dd);
+
+ goto done;
+ }
+
+resched:
+ /*
+ * for now, keep spinning
+ * JAG - this is bad to just have default be a loop without
+ * state change
+ */
+ if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
+ ipath_dbg("looping with status 0x%08lx\n",
+ dd->ipath_sdma_status);
+ dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
+ }
+resched_noprint:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ return;
+
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+done:
+ return;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void ipath_sdma_intr(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ (void) ipath_sdma_make_progress(dd);
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+}
+
+static int alloc_sdma(struct ipath_devdata *dd)
+{
+ int ret = 0;
+
+ /* Allocate memory for SendDMA descriptor FIFO */
+ dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
+ SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
+
+ if (!dd->ipath_sdma_descq) {
+ ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
+ "FIFO memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ dd->ipath_sdma_descq_cnt =
+ SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
+
+ /* Allocate memory for DMA of head register to memory */
+ dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
+ PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
+ if (!dd->ipath_sdma_head_dma) {
+ ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
+ ret = -ENOMEM;
+ goto cleanup_descq;
+ }
+ dd->ipath_sdma_head_dma[0] = 0;
+
+ init_timer(&dd->ipath_sdma_vl15_timer);
+ dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
+ dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
+ atomic_set(&dd->ipath_sdma_vl15_count, 0);
+
+ goto done;
+
+cleanup_descq:
+ dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+ (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
+ dd->ipath_sdma_descq = NULL;
+ dd->ipath_sdma_descq_phys = 0;
+done:
+ return ret;
+}
+
+int setup_sdma(struct ipath_devdata *dd)
+{
+ int ret = 0;
+ unsigned i, n;
+ u64 tmp64;
+ u64 senddmabufmask[3] = { 0 };
+ unsigned long flags;
+
+ ret = alloc_sdma(dd);
+ if (ret)
+ goto done;
+
+ if (!dd->ipath_sdma_descq) {
+ ipath_dev_err(dd, "SendDMA memory not allocated\n");
+ goto done;
+ }
+
+ /*
+ * Set initial status as if we had been up, then gone down.
+ * This lets initial start on transition to ACTIVE be the
+ * same as restart after link flap.
+ */
+ dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
+ dd->ipath_sdma_abort_jiffies = 0;
+ dd->ipath_sdma_generation = 0;
+ dd->ipath_sdma_descq_tail = 0;
+ dd->ipath_sdma_descq_head = 0;
+ dd->ipath_sdma_descq_removed = 0;
+ dd->ipath_sdma_descq_added = 0;
+
+ /* Set SendDmaBase */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
+ dd->ipath_sdma_descq_phys);
+ /* Set SendDmaLenGen */
+ tmp64 = dd->ipath_sdma_descq_cnt;
+ tmp64 |= 1<<18; /* enable generation checking */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
+ /* Set SendDmaTail */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
+ dd->ipath_sdma_descq_tail);
+ /* Set SendDmaHeadAddr */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
+ dd->ipath_sdma_head_phys);
+
+ /*
+ * Reserve all the former "kernel" piobufs, using high number range
+ * so we get as many 4K buffers as possible
+ */
+ n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+ ipath_chg_pioavailkernel(dd, i, n - i , 0);
+ for (; i < n; ++i) {
+ unsigned word = i / 64;
+ unsigned bit = i & 63;
+ BUG_ON(word >= 3);
+ senddmabufmask[word] |= 1ULL << bit;
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
+ senddmabufmask[0]);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
+ senddmabufmask[1]);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
+ senddmabufmask[2]);
+
+ INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
+ INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
+
+ tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
+ (unsigned long) dd);
+ tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
+ (unsigned long) dd);
+
+ /*
+ * No use to turn on SDMA here, as link is probably not ACTIVE
+ * Just mark it RUNNING and enable the interrupt, and let the
+ * ipath_restart_sdma() on link transition to ACTIVE actually
+ * enable it.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+done:
+ return ret;
+}
+
+void teardown_sdma(struct ipath_devdata *dd)
+{
+ struct ipath_sdma_txreq *txp, *txpnext;
+ unsigned long flags;
+ dma_addr_t sdma_head_phys = 0;
+ dma_addr_t sdma_descq_phys = 0;
+ void *sdma_descq = NULL;
+ void *sdma_head_dma = NULL;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+ __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ tasklet_kill(&dd->ipath_sdma_abort_task);
+ tasklet_kill(&dd->ipath_sdma_notify_task);
+
+ /* turn off sdma */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ /* dequeue all "sent" requests */
+ list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
+ list) {
+ txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ sdma_notify_taskbody(dd);
+
+ del_timer_sync(&dd->ipath_sdma_vl15_timer);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ dd->ipath_sdma_abort_jiffies = 0;
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
+
+ if (dd->ipath_sdma_head_dma) {
+ sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
+ sdma_head_phys = dd->ipath_sdma_head_phys;
+ dd->ipath_sdma_head_dma = NULL;
+ dd->ipath_sdma_head_phys = 0;
+ }
+
+ if (dd->ipath_sdma_descq) {
+ sdma_descq = dd->ipath_sdma_descq;
+ sdma_descq_phys = dd->ipath_sdma_descq_phys;
+ dd->ipath_sdma_descq = NULL;
+ dd->ipath_sdma_descq_phys = 0;
+ }
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ if (sdma_head_dma)
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ sdma_head_dma, sdma_head_phys);
+
+ if (sdma_descq)
+ dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+ sdma_descq, sdma_descq_phys);
+}
+
+/*
+ * [Re]start SDMA, if we use it, and it's not already OK.
+ * This is called on transition to link ACTIVE, either the first or
+ * subsequent times.
+ */
+void ipath_restart_sdma(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int needed = 1;
+
+ if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+ goto bail;
+
+ /*
+ * First, make sure we should, which is to say,
+ * check that we are "RUNNING" (not in teardown)
+ * and not "SHUTDOWN"
+ */
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
+ || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ needed = 0;
+ else {
+ __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+ __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!needed) {
+ ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
+ dd->ipath_sdma_status);
+ goto bail;
+ }
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /*
+ * First clear, just to be safe. Enable is only done
+ * in chip on 0->1 transition
+ */
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* notify upper layers */
+ ipath_ib_piobufavail(dd->verbs_dev);
+
+bail:
+ return;
+}
+
+static inline void make_sdma_desc(struct ipath_devdata *dd,
+ u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
+{
+ WARN_ON(addr & 3);
+ /* SDmaPhyAddr[47:32] */
+ sdmadesc[1] = addr >> 32;
+ /* SDmaPhyAddr[31:0] */
+ sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+ /* SDmaGeneration[1:0] */
+ sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
+ /* SDmaDwordCount[10:0] */
+ sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
+ /* SDmaBufOffset[12:2] */
+ sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ * the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ * (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int ipath_sdma_verbs_send(struct ipath_devdata *dd,
+ struct ipath_sge_state *ss, u32 dwords,
+ struct ipath_verbs_txreq *tx)
+{
+
+ unsigned long flags;
+ struct ipath_sge *sge;
+ int ret = 0;
+ u16 tail;
+ __le64 *descqp;
+ u64 sdmadesc[2];
+ u32 dwoffset;
+ dma_addr_t addr;
+
+ if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
+ ipath_dbg("packet size %X > ibmax %X, fail\n",
+ tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+retry:
+ if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
+ if (ipath_sdma_make_progress(dd))
+ goto retry;
+ ret = -ENOBUFS;
+ goto unlock;
+ }
+
+ addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
+ tx->map_len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, addr))
+ goto ioerr;
+
+ dwoffset = tx->map_len >> 2;
+ make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
+
+ /* SDmaFirstDesc */
+ sdmadesc[0] |= 1ULL << 12;
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */
+
+ /* write to the descq */
+ tail = dd->ipath_sdma_descq_tail;
+ descqp = &dd->ipath_sdma_descq[tail].qw[0];
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
+ tx->txreq.start_idx = tail;
+
+ /* increment the tail */
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ descqp = &dd->ipath_sdma_descq[0].qw[0];
+ ++dd->ipath_sdma_generation;
+ }
+
+ sge = &ss->sge;
+ while (dwords) {
+ u32 dw;
+ u32 len;
+
+ len = dwords << 2;
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ dw = (len + 3) >> 2;
+ addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, addr))
+ goto unmap;
+ make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
+ /* SDmaUseLargeBuf has to be set in every descriptor */
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= 1ULL << 14;
+ /* write to the descq */
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ /* increment the tail */
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ descqp = &dd->ipath_sdma_descq[0].qw[0];
+ ++dd->ipath_sdma_generation;
+ }
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+
+ dwoffset += dw;
+ dwords -= dw;
+ }
+
+ if (!tail)
+ descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
+ descqp -= 2;
+ /* SDmaLastDesc */
+ descqp[0] |= cpu_to_le64(1ULL << 11);
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
+ /* SDmaIntReq */
+ descqp[0] |= cpu_to_le64(1ULL << 15);
+ }
+
+ /* Commit writes to memory and advance the tail on the chip */
+ wmb();
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+
+ tx->txreq.next_descq_idx = tail;
+ tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
+ dd->ipath_sdma_descq_tail = tail;
+ dd->ipath_sdma_descq_added += tx->txreq.sg_count;
+ list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_enq(dd);
+ goto unlock;
+
+unmap:
+ while (tail != dd->ipath_sdma_descq_tail) {
+ if (!tail)
+ tail = dd->ipath_sdma_descq_cnt - 1;
+ else
+ tail--;
+ unmap_desc(dd, tail);
+ }
+ioerr:
+ ret = -EIO;
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+fail:
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c
new file mode 100644
index 000000000000..26271984b717
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_rwq *wq;
+ unsigned long flags;
+ int ret;
+
+ for (; wr; wr = wr->next) {
+ struct ipath_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ wq = srq->rq.wq;
+ next = wq->head + 1;
+ if (next >= srq->rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&srq->rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibpd->device);
+ struct ipath_srq *srq;
+ u32 sz;
+ struct ib_srq *ret;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+ ret = ERR_PTR(-ENOSYS);
+ goto done;
+ }
+
+ if (srq_init_attr->attr.max_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
+ (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+ sizeof(struct ipath_rwqe);
+ srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
+ if (!srq->rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_srq;
+ }
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+ u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
+
+ srq->ip =
+ ipath_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ srq->rq.wq);
+ if (!srq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wq;
+ }
+
+ err = ib_copy_to_udata(udata, &srq->ip->offset,
+ sizeof(srq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ srq->ip = NULL;
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->rq.wq->head = 0;
+ srq->rq.wq->tail = 0;
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ spin_lock(&dev->n_srqs_lock);
+ if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+ spin_unlock(&dev->n_srqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_srqs_allocated++;
+ spin_unlock(&dev->n_srqs_lock);
+
+ if (srq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &srq->ibsrq;
+ goto done;
+
+bail_ip:
+ kfree(srq->ip);
+bail_wq:
+ vfree(srq->rq.wq);
+bail_srq:
+ kfree(srq);
+done:
+ return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for ipathverbs.so
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_rwq *wq;
+ int ret = 0;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ struct ipath_rwq *owq;
+ struct ipath_rwqe *p;
+ u32 sz, size, n, head, tail;
+
+ /* Check that the requested sizes are below the limits. */
+ if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
+ ((attr_mask & IB_SRQ_LIMIT) ?
+ attr->srq_limit : srq->limit) > attr->max_wr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ sz = sizeof(struct ipath_rwqe) +
+ srq->rq.max_sge * sizeof(struct ib_sge);
+ size = attr->max_wr + 1;
+ wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
+ if (!wq) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 offset_addr;
+ __u64 offset = 0;
+
+ ret = ib_copy_from_udata(&offset_addr, udata,
+ sizeof(offset_addr));
+ if (ret)
+ goto bail_free;
+ udata->outbuf =
+ (void __user *) (unsigned long) offset_addr;
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&srq->rq.lock);
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ owq = srq->rq.wq;
+ head = owq->head;
+ if (head >= srq->rq.size)
+ head = 0;
+ tail = owq->tail;
+ if (tail >= srq->rq.size)
+ tail = 0;
+ n = head;
+ if (n < tail)
+ n += srq->rq.size - tail;
+ else
+ n -= tail;
+ if (size <= n) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = 0;
+ p = wq->wq;
+ while (tail != head) {
+ struct ipath_rwqe *wqe;
+ int i;
+
+ wqe = get_rwqe_ptr(&srq->rq, tail);
+ p->wr_id = wqe->wr_id;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct ipath_rwqe *)((char *) p + sz);
+ if (++tail >= srq->rq.size)
+ tail = 0;
+ }
+ srq->rq.wq = wq;
+ srq->rq.size = size;
+ wq->head = n;
+ wq->tail = 0;
+ if (attr_mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+
+ vfree(owq);
+
+ if (srq->ip) {
+ struct ipath_mmap_info *ip = srq->ip;
+ struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
+ u32 s = sizeof(struct ipath_rwq) + size * sz;
+
+ ipath_update_mmap_info(dev, ip, s, wq);
+
+ /*
+ * Return the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps,
+ &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+ } else if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irq(&srq->rq.lock);
+ if (attr->srq_limit >= srq->rq.size)
+ ret = -EINVAL;
+ else
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+ }
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&srq->rq.lock);
+bail_free:
+ vfree(wq);
+bail:
+ return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+
+ attr->max_wr = srq->rq.size - 1;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_ibdev *dev = to_idev(ibsrq->device);
+
+ spin_lock(&dev->n_srqs_lock);
+ dev->n_srqs_allocated--;
+ spin_unlock(&dev->n_srqs_lock);
+ if (srq->ip)
+ kref_put(&srq->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(srq->rq.wq);
+ kfree(srq);
+
+ return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c
new file mode 100644
index 000000000000..f63e143e3292
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_stats.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * called from add_timer and user counter read calls, to deal with
+ * counters that wrap in "human time". The words sent and received, and
+ * the packets sent and received are all that we worry about. For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly, we probably don't care. We may eventually just make this
+ * handle all the counters. word counters can wrap in about 20 seconds
+ * of full bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+ u32 val, reg64 = 0;
+ u64 val64;
+ unsigned long t0, t1;
+ u64 ret;
+
+ t0 = jiffies;
+ /* If fast increment counters are only 32 bits, snapshot them,
+ * and maintain them as 64bit values in the driver */
+ if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+ (creg == dd->ipath_cregs->cr_wordsendcnt ||
+ creg == dd->ipath_cregs->cr_wordrcvcnt ||
+ creg == dd->ipath_cregs->cr_pktsendcnt ||
+ creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+ val64 = ipath_read_creg(dd, creg);
+ val = val64 == ~0ULL ? ~0U : 0;
+ reg64 = 1;
+ } else /* val64 just to keep gcc quiet... */
+ val64 = val = ipath_read_creg32(dd, creg);
+ /*
+ * See if a second has passed. This is just a way to detect things
+ * that are quite broken. Normally this should take just a few
+ * cycles (the check is for long enough that we don't care if we get
+ * pre-empted.) An Opteron HT O read timeout is 4 seconds with
+ * normal NB values
+ */
+ t1 = jiffies;
+ if (time_before(t0 + HZ, t1) && val == -1) {
+ ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n",
+ creg);
+ ret = 0ULL;
+ goto bail;
+ }
+ if (reg64) {
+ ret = val64;
+ goto bail;
+ }
+
+ if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+ if (val != dd->ipath_lastsword) {
+ dd->ipath_sword += val - dd->ipath_lastsword;
+ dd->ipath_lastsword = val;
+ }
+ val64 = dd->ipath_sword;
+ } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+ if (val != dd->ipath_lastrword) {
+ dd->ipath_rword += val - dd->ipath_lastrword;
+ dd->ipath_lastrword = val;
+ }
+ val64 = dd->ipath_rword;
+ } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+ if (val != dd->ipath_lastspkts) {
+ dd->ipath_spkts += val - dd->ipath_lastspkts;
+ dd->ipath_lastspkts = val;
+ }
+ val64 = dd->ipath_spkts;
+ } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+ if (val != dd->ipath_lastrpkts) {
+ dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+ dd->ipath_lastrpkts = val;
+ }
+ val64 = dd->ipath_rpkts;
+ } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+ if (dd->ibdeltainprog)
+ val64 -= val64 - dd->ibsymsnap;
+ val64 -= dd->ibsymdelta;
+ } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+ if (dd->ibdeltainprog)
+ val64 -= val64 - dd->iblnkerrsnap;
+ val64 -= dd->iblnkerrdelta;
+ } else
+ val64 = (u64) val;
+
+ ret = val64;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * print the delta of egrfull/hdrqfull errors for kernel ports no more than
+ * every 5 seconds. User processes are printed at close, but kernel doesn't
+ * close, so... Separate routine so may call from other places someday, and
+ * so function name when printed by _IPATH_INFO is meaningfull
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+ static u64 last_tot_hdrqfull;
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+ size_t blen = 0;
+ char buf[128];
+ u32 hdrqtail;
+
+ *buf = 0;
+ if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+ blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+ pd->port_hdrqfull -
+ dd->ipath_p0_hdrqfull);
+ dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
+ }
+ if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+ blen += snprintf(buf + blen, sizeof buf - blen,
+ "%srcvegrfull %llu",
+ blen ? ", " : "",
+ (unsigned long long)
+ (ipath_stats.sps_etidfull -
+ dd->ipath_last_tidfull));
+ dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+ }
+
+ /*
+ * this is actually the number of hdrq full interrupts, not actual
+ * events, but at the moment that's mostly what I'm interested in.
+ * Actual count, etc. is in the counters, if needed. For production
+ * users this won't ordinarily be printed.
+ */
+
+ if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+ ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+ blen += snprintf(buf + blen, sizeof buf - blen,
+ "%shdrqfull %llu (all ports)",
+ blen ? ", " : "",
+ (unsigned long long)
+ (ipath_stats.sps_hdrqfull -
+ last_tot_hdrqfull));
+ last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+ }
+ if (blen)
+ ipath_dbg("%s\n", buf);
+
+ hdrqtail = ipath_get_hdrqtail(pd);
+ if (pd->port_head != hdrqtail) {
+ if (dd->ipath_lastport0rcv_cnt ==
+ ipath_stats.sps_port0pkts) {
+ ipath_cdbg(PKT, "missing rcv interrupts? "
+ "port0 hd=%x tl=%x; port0pkts %llx; write"
+ " hd (w/intr)\n",
+ pd->port_head, hdrqtail,
+ (unsigned long long)
+ ipath_stats.sps_port0pkts);
+ ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
+ dd->ipath_rhdrhead_intr_off, pd->port_port);
+ }
+ dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+ }
+}
+
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+ static u32 fixed;
+ u32 ctrl;
+ unsigned long errormask;
+ unsigned long hwerrs;
+
+ if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+
+ if (errormask == dd->ipath_errormask)
+ return;
+ fixed++;
+
+ hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+ ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+
+ if ((hwerrs & dd->ipath_hwerrmask) ||
+ (ctrl & INFINIPATH_C_FREEZEMODE)) {
+ /* force re-interrupt of pending events, just in case */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+ dev_info(&dd->pcidev->dev,
+ "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+ fixed, errormask, (unsigned long)dd->ipath_errormask,
+ ctrl, hwerrs);
+ } else
+ ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+ fixed, errormask,
+ (unsigned long)dd->ipath_errormask);
+}
+
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the infinipath device ipath_devdata
+ *
+ * called from add_timer
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+ int i;
+ static unsigned cnt;
+ unsigned long flags;
+ u64 traffic_wds;
+
+ /*
+ * don't access the chip while running diags, or memory diags can
+ * fail
+ */
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
+ ipath_diag_inuse)
+ /* but re-arm the timer, for diags case; won't hurt other */
+ goto done;
+
+ /*
+ * We now try to maintain a "active timer", based on traffic
+ * exceeding a threshold, so we need to check the word-counts
+ * even if they are 64-bit.
+ */
+ traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ traffic_wds -= dd->ipath_traffic_wds;
+ dd->ipath_traffic_wds += traffic_wds;
+ if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+ atomic_add(5, &dd->ipath_active_time); /* S/B #define */
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+
+ if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+ }
+
+ ipath_qcheck(dd);
+
+ /*
+ * deal with repeat error suppression. Doesn't really matter if
+ * last error was almost a full interval ago, or just a few usecs
+ * ago; still won't get more than 2 per interval. We may want
+ * longer intervals for this eventually, could do with mod, counter
+ * or separate timer. Also see code in ipath_handle_errors() and
+ * ipath_handle_hwerrors().
+ */
+
+ if (dd->ipath_lasterror)
+ dd->ipath_lasterror = 0;
+ if (dd->ipath_lasthwerror)
+ dd->ipath_lasthwerror = 0;
+ if (dd->ipath_maskederrs
+ && time_after(jiffies, dd->ipath_unmasktime)) {
+ char ebuf[256];
+ int iserr;
+ iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
+ dd->ipath_maskederrs);
+ if (dd->ipath_maskederrs &
+ ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_PKTERRS))
+ ipath_dev_err(dd, "Re-enabling masked errors "
+ "(%s)\n", ebuf);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal", for some
+ * types of processes (mostly benchmarks) that send
+ * huge numbers of messages, while not processing
+ * them. So only complain about these at debug
+ * level.
+ */
+ if (iserr)
+ ipath_dbg(
+ "Re-enabling queue full errors (%s)\n",
+ ebuf);
+ else
+ ipath_cdbg(ERRPKT, "Re-enabling packet"
+ " problem interrupt (%s)\n", ebuf);
+ }
+
+ /* re-enable masked errors */
+ dd->ipath_errormask |= dd->ipath_maskederrs;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ dd->ipath_maskederrs = 0;
+ }
+
+ /* limit qfull messages to ~one per minute per port */
+ if ((++cnt & 0x10)) {
+ for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ if (pd && pd->port_lastrcvhdrqtail != -1)
+ pd->port_lastrcvhdrqtail = -1;
+ }
+ }
+
+ ipath_chk_errormask(dd);
+done:
+ mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c
new file mode 100644
index 000000000000..75558f33f1cb
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_sysfs.c
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * returns the number of bytes consumed, or negative value on error
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+ unsigned long val;
+ char *end;
+ int ret;
+
+ if (!isdigit(str[0])) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ val = simple_strtoul(str, &end, 0);
+
+ if (val > 0xffff) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *valp = val;
+
+ ret = end + 1 - str;
+ if (ret == 0)
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ ipath_count_units(NULL, NULL, NULL));
+}
+
+static ssize_t show_status(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+
+ if (!dd->ipath_statusp) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long) *(dd->ipath_statusp));
+
+bail:
+ return ret;
+}
+
+static const char *ipath_status_str[] = {
+ "Initted",
+ "Disabled",
+ "Admin_Disabled",
+ "", /* This used to be the old "OIB_SMA" status. */
+ "", /* This used to be the old "SMA" status. */
+ "Present",
+ "IB_link_up",
+ "IB_configured",
+ "NoIBcable",
+ "Fatal_Hardware_Error",
+ NULL,
+};
+
+static ssize_t show_status_str(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int i, any;
+ u64 s;
+ ssize_t ret;
+
+ if (!dd->ipath_statusp) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ s = *(dd->ipath_statusp);
+ *buf = '\0';
+ for (any = i = 0; s && ipath_status_str[i]; i++) {
+ if (s & 1) {
+ if (any && strlcat(buf, " ", PAGE_SIZE) >=
+ PAGE_SIZE)
+ /* overflow */
+ break;
+ if (strlcat(buf, ipath_status_str[i],
+ PAGE_SIZE) >= PAGE_SIZE)
+ break;
+ any = 1;
+ }
+ s >>= 1;
+ }
+ if (any)
+ strlcat(buf, "\n", PAGE_SIZE);
+
+ ret = strlen(buf);
+
+bail:
+ return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_localbus_info(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
+}
+
+static ssize_t show_lmc(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 lmc = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &lmc);
+ if (ret < 0)
+ goto invalid;
+
+ if (lmc > 7) {
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+ return ret;
+}
+
+static ssize_t show_lid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 lid = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &lid);
+ if (ret < 0)
+ goto invalid;
+
+ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ipath_set_lid(dd, lid, dd->ipath_lmc);
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
+bail:
+ return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 mlid;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &mlid);
+ if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
+ goto invalid;
+
+ dd->ipath_mlid = mlid;
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u8 *guid;
+
+ guid = (u8 *) & (dd->ipath_guid);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+ guid[0], guid[1], guid[2], guid[3],
+ guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ unsigned short guid[8];
+ __be64 new_guid;
+ u8 *ng;
+ int i;
+
+ if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+ &guid[0], &guid[1], &guid[2], &guid[3],
+ &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+ goto invalid;
+
+ ng = (u8 *) &new_guid;
+
+ for (i = 0; i < 8; i++) {
+ if (guid[i] > 0xff)
+ goto invalid;
+ ng[i] = guid[i];
+ }
+
+ if (new_guid == 0)
+ goto invalid;
+
+ dd->ipath_guid = new_guid;
+ dd->ipath_nguid = 1;
+ if (dd->verbs_dev)
+ dd->verbs_dev->ibdev.node_guid = new_guid;
+
+ ret = strlen(buf);
+ goto bail;
+
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid GUID\n");
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_nports(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ /* Return the number of user ports available. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
+static ssize_t show_serial(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ buf[sizeof dd->ipath_serial] = '\0';
+ memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
+static ssize_t show_jint_max_packets(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 v = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &v);
+ if (ret < 0)
+ ipath_dev_err(dd, "invalid jint_max_packets.\n");
+ else
+ dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+ return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 v = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &v);
+ if (ret < 0)
+ ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+ else
+ dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+ return ret;
+}
+
+#define DEVICE_COUNTER(name, attr) \
+ static ssize_t show_counter_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct ipath_devdata *dd = dev_get_drvdata(dev); \
+ return scnprintf(\
+ buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+ ipath_snap_cntr( \
+ dd, offsetof(struct infinipath_counters, \
+ attr) / sizeof(u64))); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+ &dev_attr_ib_link_downeds.attr,
+ &dev_attr_ib_link_err_recoveries.attr,
+ &dev_attr_ib_status_changes.attr,
+ &dev_attr_ib_symbol_errs.attr,
+ &dev_attr_lb_flow_stalls.attr,
+ &dev_attr_lb_ints.attr,
+ &dev_attr_rx_bad_formats.attr,
+ &dev_attr_rx_buf_ovfls.attr,
+ &dev_attr_rx_data_pkts.attr,
+ &dev_attr_rx_dropped_pkts.attr,
+ &dev_attr_rx_dwords.attr,
+ &dev_attr_rx_ebps.attr,
+ &dev_attr_rx_flow_ctrl_errs.attr,
+ &dev_attr_rx_flow_pkts.attr,
+ &dev_attr_rx_icrc_errs.attr,
+ &dev_attr_rx_len_errs.attr,
+ &dev_attr_rx_link_problems.attr,
+ &dev_attr_rx_lpcrc_errs.attr,
+ &dev_attr_rx_max_min_len_errs.attr,
+ &dev_attr_rx_p0_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p1_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p2_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p3_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p4_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p5_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p6_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p7_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p8_hdr_egr_ovfls.attr,
+ &dev_attr_rx_pkey_mismatches.attr,
+ &dev_attr_rx_tid_full_errs.attr,
+ &dev_attr_rx_tid_valid_errs.attr,
+ &dev_attr_rx_vcrc_errs.attr,
+ &dev_attr_tx_data_pkts.attr,
+ &dev_attr_tx_dropped_pkts.attr,
+ &dev_attr_tx_dwords.attr,
+ &dev_attr_tx_flow_pkts.attr,
+ &dev_attr_tx_flow_stalls.attr,
+ &dev_attr_tx_len_errs.attr,
+ &dev_attr_tx_max_min_len_errs.attr,
+ &dev_attr_tx_underruns.attr,
+ &dev_attr_tx_unsup_vl_errs.attr,
+ NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+ .name = "counters",
+ .attrs = dev_counter_attributes
+};
+
+static ssize_t store_reset(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ if (count < 5 || memcmp(buf, "reset", 5)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (dd->ipath_flags & IPATH_DISABLED) {
+ /*
+ * post-reset init would re-enable interrupts, etc.
+ * so don't allow reset on disabled devices. Not
+ * perfect error, but about the best choice.
+ */
+ dev_info(dev,"Unit %d is disabled, can't reset\n",
+ dd->ipath_unit);
+ ret = -EINVAL;
+ goto bail;
+ }
+ ret = ipath_reset_device(dd->ipath_unit);
+bail:
+ return ret<0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 state;
+
+ ret = ipath_parse_ushort(buf, &state);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_linkstate(dd, state);
+ if (r < 0) {
+ ret = r;
+ goto bail;
+ }
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ u16 mtu = 0;
+ int r;
+
+ ret = ipath_parse_ushort(buf, &mtu);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_mtu(dd, mtu);
+ if (r < 0)
+ ret = r;
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ u16 enable = 0;
+
+ ret = ipath_parse_ushort(buf, &enable);
+ if (ret < 0) {
+ ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
+ goto bail;
+ }
+
+ if (enable) {
+ if (!(dd->ipath_flags & IPATH_DISABLED))
+ goto bail;
+
+ dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+ /* same as post-reset */
+ ret = ipath_init_chip(dd, 1);
+ if (ret)
+ ipath_dev_err(dd, "Failed to enable unit %d\n",
+ dd->ipath_unit);
+ else {
+ dd->ipath_flags &= ~IPATH_DISABLED;
+ *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+ }
+ }
+ else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+ dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+ ipath_shutdown_device(dd);
+ dd->ipath_flags |= IPATH_DISABLED;
+ *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+ }
+
+bail:
+ return ret;
+}
+
+static ssize_t store_rx_pol_inv(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_rx_pol_inv(dd, val);
+ if (r < 0) {
+ ret = r;
+ goto bail;
+ }
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
+bail:
+ return ret;
+}
+
+static ssize_t store_led_override(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret > 0)
+ ipath_set_led_override(dd, val);
+ else
+ ipath_dev_err(dd, "attempt to set invalid LED override\n");
+ return ret;
+}
+
+static ssize_t show_logged_errs(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int idx, count;
+
+ /* force consistency with actual EEPROM */
+ if (ipath_update_eeprom_log(dd) != 0)
+ return -ENXIO;
+
+ count = 0;
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+ dd->ipath_eep_st_errs[idx],
+ idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
+ }
+
+ return count;
+}
+
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 3)
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+ goto bail;
+ }
+
+ /*
+ * Set the "intentional" heartbeat enable per either of
+ * "Enable" and "Auto", as these are normally set together.
+ * This bit is consulted when leaving loopback mode,
+ * because entering loopback mode overrides it and automatically
+ * disables heartbeat.
+ */
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+ if (r < 0)
+ ret = r;
+ else if (val == IPATH_IB_HRTBT_OFF)
+ dd->ipath_flags |= IPATH_NO_HRTBT;
+ else
+ dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+ return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && (val == 0 || val > 3))
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Link Width (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Link Speed (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 1) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Rx Polarity (enable)\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 1) {
+ ret = -EINVAL;
+ ipath_dev_err(dd,
+ "attempt to set invalid Lane reversal (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+ &driver_attr_num_units.attr,
+ &driver_attr_version.attr,
+ NULL
+};
+
+static struct attribute_group driver_attr_group = {
+ .attrs = driver_attributes
+};
+
+static ssize_t store_tempsense(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, stat;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret <= 0) {
+ ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
+ goto bail;
+ }
+ /* If anything but the highest limit, enable T_CRIT_A "interrupt" */
+ stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set tempsense config\n");
+ ret = -1;
+ goto bail;
+ }
+ stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set local Tcrit\n");
+ ret = -1;
+ goto bail;
+ }
+ stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set remote Tcrit\n");
+ ret = -1;
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/*
+ * dump tempsense regs. in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+ int idx;
+ u8 regvals[8];
+
+ ret = -ENXIO;
+ for (idx = 0; idx < 8; ++idx) {
+ if (idx == 6)
+ continue;
+ ret = ipath_tempsense_read(dd, idx);
+ if (ret < 0)
+ break;
+ regvals[idx] = ret;
+ }
+ if (idx == 8)
+ ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+ *(signed char *)(regvals),
+ *(signed char *)(regvals + 1),
+ regvals[2], regvals[3],
+ *(signed char *)(regvals + 5),
+ *(signed char *)(regvals + 7));
+ return ret;
+}
+
+const struct attribute_group *ipath_driver_attr_groups[] = {
+ &driver_attr_group,
+ NULL,
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
+static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+ show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+ show_jint_idle_ticks, store_jint_idle_ticks);
+static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
+ show_tempsense, store_tempsense);
+
+static struct attribute *dev_attributes[] = {
+ &dev_attr_guid.attr,
+ &dev_attr_lmc.attr,
+ &dev_attr_lid.attr,
+ &dev_attr_link_state.attr,
+ &dev_attr_mlid.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_nguid.attr,
+ &dev_attr_nports.attr,
+ &dev_attr_serial.attr,
+ &dev_attr_status.attr,
+ &dev_attr_status_str.attr,
+ &dev_attr_boardversion.attr,
+ &dev_attr_unit.attr,
+ &dev_attr_enabled.attr,
+ &dev_attr_rx_pol_inv.attr,
+ &dev_attr_led_override.attr,
+ &dev_attr_logged_errors.attr,
+ &dev_attr_tempsense.attr,
+ &dev_attr_localbus_info.attr,
+ NULL
+};
+
+static struct attribute_group dev_attr_group = {
+ .attrs = dev_attributes
+};
+
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+ store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+ store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+ store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+ store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+ store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+ &dev_attr_hrtbt_enable.attr,
+ &dev_attr_link_width_enable.attr,
+ &dev_attr_link_width.attr,
+ &dev_attr_link_speed_enable.attr,
+ &dev_attr_link_speed.attr,
+ &dev_attr_rx_pol_inv_enable.attr,
+ &dev_attr_rx_lane_rev_enable.attr,
+ NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+ .attrs = dev_ibcfg_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode. A device reset is quite likely to crash the
+ * machine entirely, so we don't want to normally make it
+ * available.
+ *
+ * Called with ipath_mutex held.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+ static int exposed;
+ int ret;
+
+ if (!exposed) {
+ ret = device_create_file(dev, &dev_attr_reset);
+ exposed = 1;
+ }
+ else
+ ret = 0;
+
+ return ret;
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+ int ret;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+ if (ret)
+ goto bail;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+ if (ret)
+ goto bail_attrs;
+
+ if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+ ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+ if (ret)
+ goto bail_counter;
+ ret = device_create_file(dev, &dev_attr_jint_max_packets);
+ if (ret)
+ goto bail_idle;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+ if (ret)
+ goto bail_max;
+ }
+
+ return 0;
+
+bail_max:
+ device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+ device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
+ sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+ sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+ return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+ sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+ if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+ sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+ device_remove_file(dev, &dev_attr_jint_idle_ticks);
+ device_remove_file(dev, &dev_attr_jint_max_packets);
+ }
+
+ sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+ device_remove_file(dev, &dev_attr_reset);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c
new file mode 100644
index 000000000000..22e60998f1a7
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_uc.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_uc_req(struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ struct ipath_swqe *wqe;
+ unsigned long flags;
+ u32 hwords;
+ u32 bth0;
+ u32 len;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 1 << 22; /* Set M bit */
+
+ /* Get the next send request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_wqe = NULL;
+ switch (qp->s_state) {
+ default:
+ if (!(ib_ipath_state_ops[qp->state] &
+ IPATH_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /* Check if send work queue is empty. */
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+ /*
+ * Start a new request.
+ */
+ qp->s_psn = wqe->psn = qp->s_next_psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state =
+ OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ break;
+
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(to_idev(qp->ibqp.device),
+ qp, ohdr, bth0 | (qp->s_state << 24),
+ qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ struct ib_reth *reth;
+ int header_in_data;
+
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+ goto done;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the
+ * core driver sets the eager header buffer
+ * size to 56 bytes so the last 4 bytes of
+ * the BTH header (PSN) is in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ psn = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ } else
+ psn = be32_to_cpu(ohdr->bth[2]);
+ }
+ /*
+ * The opcode is in the low byte when its in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+ memset(&wc, 0, sizeof wc);
+
+ /* Compare the PSN verses the expected PSN. */
+ if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+ inv:
+ qp->r_state = OP(SEND_LAST);
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ goto send_first;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ goto rdma_first;
+
+ default:
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ default:
+ if (opcode == OP(SEND_FIRST) ||
+ opcode == OP(SEND_ONLY) ||
+ opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_FIRST) ||
+ opcode == OP(RDMA_WRITE_ONLY) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ break;
+ goto inv;
+ }
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ send_first:
+ if (qp->r_flags & IPATH_R_REUSE_SGE) {
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ qp->r_sge = qp->s_rdma_read_sge;
+ } else if (!ipath_get_rwqe(qp, 0)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Save the WQE so we can reuse it in case of an error. */
+ qp->s_rdma_read_sge = qp->r_sge;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto send_last;
+ else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+ send_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST):
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ wc.opcode = IB_WC_RECV;
+ last_imm:
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+ rdma_first:
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey */
+ ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (opcode == OP(RDMA_WRITE_ONLY))
+ goto rdma_last;
+ else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ goto rdma_last_imm;
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ rdma_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (qp->r_flags & IPATH_R_REUSE_SGE)
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ else if (!ipath_get_rwqe(qp, 1)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ wc.byte_len = qp->r_len;
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ goto last_imm;
+
+ case OP(RDMA_WRITE_LAST):
+ rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+done:
+ return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c
new file mode 100644
index 000000000000..e8a2a915251e
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_ud.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling ipath_ud_rcv()
+ * while this is being called.
+ */
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
+{
+ struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+ struct ipath_qp *qp;
+ struct ib_ah_attr *ah_attr;
+ unsigned long flags;
+ struct ipath_rq *rq;
+ struct ipath_srq *srq;
+ struct ipath_sge_state rsge;
+ struct ipath_sge *sge;
+ struct ipath_rwq *wq;
+ struct ipath_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ struct ib_wc wc;
+ u32 tail;
+ u32 rlen;
+ u32 length;
+
+ qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+ if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+
+ /*
+ * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ if (unlikely(qp->ibqp.qp_num &&
+ ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+ sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
+ /* XXX OK to lose a count once in a while. */
+ dev->qkey_violations++;
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ length = swqe->length;
+ memset(&wc, 0, sizeof wc);
+ wc.byte_len = length + sizeof(struct ib_grh);
+
+ if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
+ }
+
+ /*
+ * This would be a lot simpler if we could call ipath_get_rwqe()
+ * but that uses state that the receive interrupt handler uses
+ * so we would need to lock out receive interrupts while doing
+ * local loopback.
+ */
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ * Note that it is safe to drop the lock after changing rq->tail
+ * since ipath_post_receive() won't fill the empty slot.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ if (unlikely(tail == wq->head)) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ wqe = get_rwqe_ptr(rq, tail);
+ rsge.sg_list = qp->r_ud_sg_list;
+ if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ /* Silently drop packets which are too big. */
+ if (wc.byte_len > rlen) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ if (++tail >= rq->size)
+ tail = 0;
+ wq->tail = tail;
+ wc.wr_id = wqe->wr_id;
+ if (handler) {
+ u32 n;
+
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ } else
+ spin_unlock_irqrestore(&rq->lock, flags);
+ } else
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+ sge = swqe->sg_list;
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ ipath_copy_sge(&rsge, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--swqe->wr.num_sge)
+ sge++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = sqp->ibqp.qp_num;
+ /* XXX do we know which pkey matched? Only needed for GSI. */
+ wc.pkey_index = 0;
+ wc.slid = dev->dd->ipath_lid |
+ (ah_attr->src_path_bits &
+ ((1 << dev->dd->ipath_lmc) - 1));
+ wc.sl = ah_attr->sl;
+ wc.dlid_path_bits =
+ ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+done:;
+}
+
+/**
+ * ipath_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_ud_req(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_other_headers *ohdr;
+ struct ib_ah_attr *ah_attr;
+ struct ipath_swqe *wqe;
+ unsigned long flags;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u16 lrh0;
+ u16 lid;
+ int ret = 0;
+ int next_cur;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ next_cur = qp->s_cur + 1;
+ if (next_cur >= qp->s_size)
+ next_cur = 0;
+
+ /* Construct the header. */
+ ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
+ if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
+ dev->n_multicast_xmit++;
+ else
+ dev->n_unicast_xmit++;
+ } else {
+ dev->n_unicast_xmit++;
+ lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
+ if (unlikely(lid == dev->dd->ipath_lid)) {
+ /*
+ * If DMAs are in progress, we can't generate
+ * a completion for the loopback packet since
+ * it would be out of order.
+ * XXX Instead of waiting, we could queue a
+ * zero length descriptor so we get a callback.
+ */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ qp->s_cur = next_cur;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ipath_ud_loopback(qp, wqe);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+ goto done;
+ }
+ }
+
+ qp->s_cur = next_cur;
+ extra_bytes = -wqe->length & 3;
+ nwords = (wqe->length + extra_bytes) >> 2;
+
+ /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+ qp->s_hdrwords = 7;
+ qp->s_cur_size = wqe->length;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_dmult = ah_attr->static_rate;
+ qp->s_wqe = wqe;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ /* Header size in 32-bit words. */
+ qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+ &ah_attr->grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = IPATH_LRH_GRH;
+ ohdr = &qp->s_hdr.u.l.oth;
+ /*
+ * Don't worry about sending to locally attached multicast
+ * QPs. It is unspecified by the spec. what happens.
+ */
+ } else {
+ /* Header size in 32-bit words. */
+ lrh0 = IPATH_LRH_BTH;
+ ohdr = &qp->s_hdr.u.oth;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_hdrwords++;
+ ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+ bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+ } else
+ bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+ lrh0 |= ah_attr->sl << 4;
+ if (qp->ibqp.qp_type == IB_QPT_SMI)
+ lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
+ qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+ SIZE_OF_CRC);
+ lid = dev->dd->ipath_lid;
+ if (lid) {
+ lid |= ah_attr->src_path_bits &
+ ((1 << dev->dd->ipath_lmc) - 1);
+ qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+ } else
+ qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth0 |= extra_bytes << 20;
+ bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
+ ipath_get_pkey(dev->dd, qp->s_pkey_index);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ /*
+ * Use the multicast QP if the destination LID is a multicast LID.
+ */
+ ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+ ah_attr->dlid != IPATH_PERMISSIVE_LID ?
+ cpu_to_be32(IPATH_MULTICAST_QPN) :
+ cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
+ /*
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+ qp->qkey : wqe->wr.wr.ud.remote_qkey);
+ ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 pad;
+ struct ib_wc wc;
+ u32 qkey;
+ u32 src_qp;
+ u16 dlid;
+ int header_in_data;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+ /*
+ * The header with GRH is 68 bytes and the core driver sets
+ * the eager header buffer size to 56 bytes so the last 12
+ * bytes of the IB header is in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ qkey = be32_to_cpu(((__be32 *) data)[1]);
+ src_qp = be32_to_cpu(((__be32 *) data)[2]);
+ data += 12;
+ } else {
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+ }
+ }
+ src_qp &= IPATH_QPN_MASK;
+
+ /*
+ * Check that the permissive LID is only used on QP0
+ * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+ */
+ if (qp->ibqp.qp_num) {
+ if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ if (unlikely(qkey != qp->qkey)) {
+ /* XXX OK to lose a count once in a while. */
+ dev->qkey_violations++;
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE) {
+ struct ib_smp *smp = (struct ib_smp *) data;
+
+ if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ }
+
+ /*
+ * The opcode is in the low byte when its in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (qp->ibqp.qp_num > 1 &&
+ opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ hdrsize += sizeof(u32);
+ } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+ } else {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ /* Drop incomplete packets. */
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ tlen -= hdrsize + pad + 4;
+
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (unlikely((qp->ibqp.qp_num == 0 &&
+ (tlen != 256 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+ (qp->ibqp.qp_num == 1 &&
+ (tlen != 256 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ wc.byte_len = tlen + sizeof(struct ib_grh);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & IPATH_R_REUSE_SGE)
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ else if (!ipath_get_rwqe(qp, 0)) {
+ /*
+ * Count VL15 packets dropped due to no receive buffer.
+ * Otherwise, count them as buffer overruns since usually,
+ * the HW will be able to receive packets even if there are
+ * no QPs with posted receive buffers.
+ */
+ if (qp->ibqp.qp_num == 0)
+ dev->n_vl15_dropped++;
+ else
+ dev->rcv_errors++;
+ goto bail;
+ }
+ /* Silently drop packets which are too big. */
+ if (wc.byte_len > qp->r_len) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ if (has_grh) {
+ ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+ sizeof(struct ib_grh));
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+ ipath_copy_sge(&qp->r_sge, data,
+ wc.byte_len - sizeof(struct ib_grh));
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ goto bail;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = src_qp;
+ /* XXX do we know which pkey matched? Only needed for GSI. */
+ wc.pkey_index = 0;
+ wc.slid = be16_to_cpu(hdr->lrh[3]);
+ wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+ dlid = be16_to_cpu(hdr->lrh[1]);
+ /*
+ * Save the LMC lower bits if the destination LID is a unicast LID.
+ */
+ wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
+ dlid & ((1 << dev->dd->ipath_lmc) - 1);
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c
new file mode 100644
index 000000000000..1da1252dcdb3
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_pages.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+ int dirty)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages; i++) {
+ ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+ (unsigned long) num_pages, p[i]);
+ if (dirty)
+ set_page_dirty_lock(p[i]);
+ put_page(p[i]);
+ }
+}
+
+/* call with current->mm->mmap_sem held */
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ unsigned long lock_limit;
+ size_t got;
+ int ret;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (num_pages > lock_limit) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+ (unsigned long) num_pages, start_page);
+
+ for (got = 0; got < num_pages; got += ret) {
+ ret = get_user_pages(current, current->mm,
+ start_page + got * PAGE_SIZE,
+ num_pages - got, 1, 1,
+ p + got, NULL);
+ if (ret < 0)
+ goto bail_release;
+ }
+
+ current->mm->pinned_vm += num_pages;
+
+ ret = 0;
+ goto bail;
+
+bail_release:
+ __ipath_release_user_pages(p, got, 0);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to insure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other iommu's, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+
+ if (phys == 0) {
+ pci_unmap_page(hwdev, phys, size, direction);
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+ /*
+ * FIXME: If we get 0 again, we should keep this page,
+ * map another, then free the 0 page.
+ */
+ }
+
+ return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+ int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_single(hwdev, ptr, size, direction);
+
+ if (phys == 0) {
+ pci_unmap_single(hwdev, phys, size, direction);
+ phys = pci_map_single(hwdev, ptr, size, direction);
+ /*
+ * FIXME: If we get 0 again, we should keep this page,
+ * map another, then free the 0 page.
+ */
+ }
+
+ return phys;
+}
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages. For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+
+ ret = __ipath_get_user_pages(start_page, num_pages, p);
+
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+ down_write(&current->mm->mmap_sem);
+
+ __ipath_release_user_pages(p, num_pages, 1);
+
+ current->mm->pinned_vm -= num_pages;
+
+ up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ unsigned long num_pages;
+};
+
+static void user_pages_account(struct work_struct *_work)
+{
+ struct ipath_user_pages_work *work =
+ container_of(_work, struct ipath_user_pages_work, work);
+
+ down_write(&work->mm->mmap_sem);
+ work->mm->pinned_vm -= work->num_pages;
+ up_write(&work->mm->mmap_sem);
+ mmput(work->mm);
+ kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+ struct ipath_user_pages_work *work;
+ struct mm_struct *mm;
+
+ __ipath_release_user_pages(p, num_pages, 1);
+
+ mm = get_task_mm(current);
+ if (!mm)
+ return;
+
+ work = kmalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ goto bail_mm;
+
+ INIT_WORK(&work->work, user_pages_account);
+ work->mm = mm;
+ work->num_pages = num_pages;
+
+ queue_work(ib_wq, &work->work);
+ return;
+
+bail_mm:
+ mmput(mm);
+ return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c
new file mode 100644
index 000000000000..cc04b7ba3488
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_user_sdma.h"
+
+/* minimum size of header */
+#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64
+/* expected size of headers (for dma_pool) */
+#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64
+/* length mask in PBC (lower 11 bits) */
+#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1)
+
+struct ipath_user_sdma_pkt {
+ u8 naddr; /* dimension of addr (1..3) ... */
+ u32 counter; /* sdma pkts queued counter for this entry */
+ u64 added; /* global descq number of entries */
+
+ struct {
+ u32 offset; /* offset for kvaddr, addr */
+ u32 length; /* length in page */
+ u8 put_page; /* should we put_page? */
+ u8 dma_mapped; /* is page dma_mapped? */
+ struct page *page; /* may be NULL (coherent mem) */
+ void *kvaddr; /* FIXME: only for pio hack */
+ dma_addr_t addr;
+ } addr[4]; /* max pages, any more and we coalesce */
+ struct list_head list; /* list element */
+};
+
+struct ipath_user_sdma_queue {
+ /*
+ * pkts sent to dma engine are queued on this
+ * list head. the type of the elements of this
+ * list are struct ipath_user_sdma_pkt...
+ */
+ struct list_head sent;
+
+ /* headers with expected length are allocated from here... */
+ char header_cache_name[64];
+ struct dma_pool *header_cache;
+
+ /* packets are allocated from the slab cache... */
+ char pkt_slab_name[64];
+ struct kmem_cache *pkt_slab;
+
+ /* as packets go on the queued queue, they are counted... */
+ u32 counter;
+ u32 sent_counter;
+
+ /* dma page table */
+ struct rb_root dma_pages_root;
+
+ /* protect everything above... */
+ struct mutex lock;
+};
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
+{
+ struct ipath_user_sdma_queue *pq =
+ kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
+
+ if (!pq)
+ goto done;
+
+ pq->counter = 0;
+ pq->sent_counter = 0;
+ INIT_LIST_HEAD(&pq->sent);
+
+ mutex_init(&pq->lock);
+
+ snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+ "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
+ pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+ sizeof(struct ipath_user_sdma_pkt),
+ 0, 0, NULL);
+
+ if (!pq->pkt_slab)
+ goto err_kfree;
+
+ snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+ "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
+ pq->header_cache = dma_pool_create(pq->header_cache_name,
+ dev,
+ IPATH_USER_SDMA_EXP_HEADER_LENGTH,
+ 4, 0);
+ if (!pq->header_cache)
+ goto err_slab;
+
+ pq->dma_pages_root = RB_ROOT;
+
+ goto done;
+
+err_slab:
+ kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+ kfree(pq);
+ pq = NULL;
+
+done:
+ return pq;
+}
+
+static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
+ int i, size_t offset, size_t len,
+ int put_page, int dma_mapped,
+ struct page *page,
+ void *kvaddr, dma_addr_t dma_addr)
+{
+ pkt->addr[i].offset = offset;
+ pkt->addr[i].length = len;
+ pkt->addr[i].put_page = put_page;
+ pkt->addr[i].dma_mapped = dma_mapped;
+ pkt->addr[i].page = page;
+ pkt->addr[i].kvaddr = kvaddr;
+ pkt->addr[i].addr = dma_addr;
+}
+
+static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
+ u32 counter, size_t offset,
+ size_t len, int dma_mapped,
+ struct page *page,
+ void *kvaddr, dma_addr_t dma_addr)
+{
+ pkt->naddr = 1;
+ pkt->counter = counter;
+ ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
+ kvaddr, dma_addr);
+}
+
+/* we've too many pages in the iovec, coalesce to a single page */
+static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov) {
+ int ret = 0;
+ struct page *page = alloc_page(GFP_KERNEL);
+ void *mpage_save;
+ char *mpage;
+ int i;
+ int len = 0;
+ dma_addr_t dma_addr;
+
+ if (!page) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ mpage = kmap(page);
+ mpage_save = mpage;
+ for (i = 0; i < niov; i++) {
+ int cfur;
+
+ cfur = copy_from_user(mpage,
+ iov[i].iov_base, iov[i].iov_len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_unmap;
+ }
+
+ mpage += iov[i].iov_len;
+ len += iov[i].iov_len;
+ }
+
+ dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto free_unmap;
+ }
+
+ ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
+ dma_addr);
+ pkt->naddr = 2;
+
+ goto done;
+
+free_unmap:
+ kunmap(page);
+ __free_page(page);
+done:
+ return ret;
+}
+
+/* how many pages in this iovec element? */
+static int ipath_user_sdma_num_pages(const struct iovec *iov)
+{
+ const unsigned long addr = (unsigned long) iov->iov_base;
+ const unsigned long len = iov->iov_len;
+ const unsigned long spage = addr & PAGE_MASK;
+ const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+ return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+/* truncate length to page boundary */
+static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
+{
+ const unsigned long offset = addr & ~PAGE_MASK;
+
+ return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
+}
+
+static void ipath_user_sdma_free_pkt_frag(struct device *dev,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ int frag)
+{
+ const int i = frag;
+
+ if (pkt->addr[i].page) {
+ if (pkt->addr[i].dma_mapped)
+ dma_unmap_page(dev,
+ pkt->addr[i].addr,
+ pkt->addr[i].length,
+ DMA_TO_DEVICE);
+
+ if (pkt->addr[i].kvaddr)
+ kunmap(pkt->addr[i].page);
+
+ if (pkt->addr[i].put_page)
+ put_page(pkt->addr[i].page);
+ else
+ __free_page(pkt->addr[i].page);
+ } else if (pkt->addr[i].kvaddr)
+ /* free coherent mem from cache... */
+ dma_pool_free(pq->header_cache,
+ pkt->addr[i].kvaddr, pkt->addr[i].addr);
+}
+
+/* return number of pages pinned... */
+static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt,
+ unsigned long addr, int tlen, int npages)
+{
+ struct page *pages[2];
+ int j;
+ int ret;
+
+ ret = get_user_pages_fast(addr, npages, 0, pages);
+ if (ret != npages) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ put_page(pages[i]);
+
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ for (j = 0; j < npages; j++) {
+ /* map the pages... */
+ const int flen =
+ ipath_user_sdma_page_length(addr, tlen);
+ dma_addr_t dma_addr =
+ dma_map_page(&dd->pcidev->dev,
+ pages[j], 0, flen, DMA_TO_DEVICE);
+ unsigned long fofs = addr & ~PAGE_MASK;
+
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
+ pages[j], kmap(pages[j]),
+ dma_addr);
+
+ pkt->naddr++;
+ addr += flen;
+ tlen -= flen;
+ }
+
+done:
+ return ret;
+}
+
+static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov)
+{
+ int ret = 0;
+ unsigned long idx;
+
+ for (idx = 0; idx < niov; idx++) {
+ const int npages = ipath_user_sdma_num_pages(iov + idx);
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+ ret = ipath_user_sdma_pin_pages(dd, pkt,
+ addr, iov[idx].iov_len,
+ npages);
+ if (ret < 0)
+ goto free_pkt;
+ }
+
+ goto done;
+
+free_pkt:
+ for (idx = 0; idx < pkt->naddr; idx++)
+ ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+done:
+ return ret;
+}
+
+static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov, int npages)
+{
+ int ret = 0;
+
+ if (npages >= ARRAY_SIZE(pkt->addr))
+ ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
+ else
+ ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+ return ret;
+}
+
+/* free a packet list -- return counter value of last packet */
+static void ipath_user_sdma_free_pkt_list(struct device *dev,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *list)
+{
+ struct ipath_user_sdma_pkt *pkt, *pkt_next;
+
+ list_for_each_entry_safe(pkt, pkt_next, list, list) {
+ int i;
+
+ for (i = 0; i < pkt->naddr; i++)
+ ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+ kmem_cache_free(pq->pkt_slab, pkt);
+ }
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets to list, returning the
+ * number of bytes total. list must be empty initially,
+ * as, if there is an error we clean it...
+ */
+static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *list,
+ const struct iovec *iov,
+ unsigned long niov,
+ int maxpkts)
+{
+ unsigned long idx = 0;
+ int ret = 0;
+ int npkts = 0;
+ struct page *page = NULL;
+ __le32 *pbc;
+ dma_addr_t dma_addr;
+ struct ipath_user_sdma_pkt *pkt = NULL;
+ size_t len;
+ size_t nw;
+ u32 counter = pq->counter;
+ int dma_mapped = 0;
+
+ while (idx < niov && npkts < maxpkts) {
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+ const unsigned long idx_save = idx;
+ unsigned pktnw;
+ unsigned pktnwc;
+ int nfrags = 0;
+ int npages = 0;
+ int cfur;
+
+ dma_mapped = 0;
+ len = iov[idx].iov_len;
+ nw = len >> 2;
+ page = NULL;
+
+ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+ if (!pkt) {
+ ret = -ENOMEM;
+ goto free_list;
+ }
+
+ if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
+ len > PAGE_SIZE || len & 3 || addr & 3) {
+ ret = -EINVAL;
+ goto free_pkt;
+ }
+
+ if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
+ pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+ &dma_addr);
+ else
+ pbc = NULL;
+
+ if (!pbc) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto free_pkt;
+ }
+ pbc = kmap(page);
+ }
+
+ cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_pbc;
+ }
+
+ /*
+ * this assignment is a bit strange. it's because the
+ * the pbc counts the number of 32 bit words in the full
+ * packet _except_ the first word of the pbc itself...
+ */
+ pktnwc = nw - 1;
+
+ /*
+ * pktnw computation yields the number of 32 bit words
+ * that the caller has indicated in the PBC. note that
+ * this is one less than the total number of words that
+ * goes to the send DMA engine as the first 32 bit word
+ * of the PBC itself is not counted. Armed with this count,
+ * we can verify that the packet is consistent with the
+ * iovec lengths.
+ */
+ pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
+ if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+
+ idx++;
+ while (pktnwc < pktnw && idx < niov) {
+ const size_t slen = iov[idx].iov_len;
+ const unsigned long faddr =
+ (unsigned long) iov[idx].iov_base;
+
+ if (slen & 3 || faddr & 3 || !slen ||
+ slen > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ npages++;
+ if ((faddr & PAGE_MASK) !=
+ ((faddr + slen - 1) & PAGE_MASK))
+ npages++;
+
+ pktnwc += slen >> 2;
+ idx++;
+ nfrags++;
+ }
+
+ if (pktnwc != pktnw) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ if (page) {
+ dma_addr = dma_map_page(&dd->pcidev->dev,
+ page, 0, len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto free_pbc;
+ }
+
+ dma_mapped = 1;
+ }
+
+ ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
+ page, pbc, dma_addr);
+
+ if (nfrags) {
+ ret = ipath_user_sdma_init_payload(dd, pq, pkt,
+ iov + idx_save + 1,
+ nfrags, npages);
+ if (ret < 0)
+ goto free_pbc_dma;
+ }
+
+ counter++;
+ npkts++;
+
+ list_add_tail(&pkt->list, list);
+ }
+
+ ret = idx;
+ goto done;
+
+free_pbc_dma:
+ if (dma_mapped)
+ dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pbc:
+ if (page) {
+ kunmap(page);
+ __free_page(page);
+ } else
+ dma_pool_free(pq->header_cache, pbc, dma_addr);
+free_pkt:
+ kmem_cache_free(pq->pkt_slab, pkt);
+free_list:
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+ return ret;
+}
+
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+ u32 c)
+{
+ pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ struct list_head free_list;
+ struct ipath_user_sdma_pkt *pkt;
+ struct ipath_user_sdma_pkt *pkt_prev;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&free_list);
+
+ list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+ s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
+
+ if (descd < 0)
+ break;
+
+ list_move_tail(&pkt->list, &free_list);
+
+ /* one more packet cleaned */
+ ret++;
+ }
+
+ if (!list_empty(&free_list)) {
+ u32 counter;
+
+ pkt = list_entry(free_list.prev,
+ struct ipath_user_sdma_pkt, list);
+ counter = pkt->counter;
+
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ ipath_user_sdma_set_complete_counter(pq, counter);
+ }
+
+ return ret;
+}
+
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
+{
+ if (!pq)
+ return;
+
+ kmem_cache_destroy(pq->pkt_slab);
+ dma_pool_destroy(pq->header_cache);
+ kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ ret = ipath_sdma_make_progress(dd);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ return ret;
+}
+
+/* we're in close, drain packets so that we can cleanup successfully... */
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ int i;
+
+ if (!pq)
+ return;
+
+ for (i = 0; i < 100; i++) {
+ mutex_lock(&pq->lock);
+ if (list_empty(&pq->sent)) {
+ mutex_unlock(&pq->lock);
+ break;
+ }
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ mutex_unlock(&pq->lock);
+ msleep(10);
+ }
+
+ if (!list_empty(&pq->sent)) {
+ struct list_head free_list;
+
+ printk(KERN_INFO "drain: lists not empty: forcing!\n");
+ INIT_LIST_HEAD(&free_list);
+ mutex_lock(&pq->lock);
+ list_splice_init(&pq->sent, &free_list);
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ mutex_unlock(&pq->lock);
+ }
+}
+
+static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
+ u64 addr, u64 dwlen, u64 dwoffset)
+{
+ return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+ ((addr & 0xfffffffcULL) << 32) |
+ /* SDmaGeneration[1:0] */
+ ((dd->ipath_sdma_generation & 3ULL) << 30) |
+ /* SDmaDwordCount[10:0] */
+ ((dwlen & 0x7ffULL) << 16) |
+ /* SDmaBufOffset[12:2] */
+ (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
+{
+ return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
+{
+ /* last */ /* dma head */
+ return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 ipath_sdma_make_desc1(u64 addr)
+{
+ /* SDmaPhyAddr[47:32] */
+ return cpu_to_le64(addr >> 32);
+}
+
+static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt, int idx,
+ unsigned ofs, u16 tail)
+{
+ const u64 addr = (u64) pkt->addr[idx].addr +
+ (u64) pkt->addr[idx].offset;
+ const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+ __le64 *descqp;
+ __le64 descq0;
+
+ descqp = &dd->ipath_sdma_descq[tail].qw[0];
+
+ descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
+ if (idx == 0)
+ descq0 = ipath_sdma_make_first_desc0(descq0);
+ if (idx == pkt->naddr - 1)
+ descq0 = ipath_sdma_make_last_desc0(descq0);
+
+ descqp[0] = descq0;
+ descqp[1] = ipath_sdma_make_desc1(addr);
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *pktlist)
+{
+ int ret = 0;
+ unsigned long flags;
+ u16 tail;
+
+ if (list_empty(pktlist))
+ return 0;
+
+ if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
+ return -ECOMM;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
+ ret = -ECOMM;
+ goto unlock;
+ }
+
+ tail = dd->ipath_sdma_descq_tail;
+ while (!list_empty(pktlist)) {
+ struct ipath_user_sdma_pkt *pkt =
+ list_entry(pktlist->next, struct ipath_user_sdma_pkt,
+ list);
+ int i;
+ unsigned ofs = 0;
+ u16 dtail = tail;
+
+ if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
+ goto unlock_check_tail;
+
+ for (i = 0; i < pkt->naddr; i++) {
+ ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
+ ofs += pkt->addr[i].length >> 2;
+
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ ++dd->ipath_sdma_generation;
+ }
+ }
+
+ if ((ofs<<2) > dd->ipath_ibmaxlen) {
+ ipath_dbg("packet size %X > ibmax %X, fail\n",
+ ofs<<2, dd->ipath_ibmaxlen);
+ ret = -EMSGSIZE;
+ goto unlock;
+ }
+
+ /*
+ * if the packet is >= 2KB mtu equivalent, we have to use
+ * the large buffers, and have to mark each descriptor as
+ * part of a large buffer packet.
+ */
+ if (ofs >= IPATH_SMALLBUF_DWORDS) {
+ for (i = 0; i < pkt->naddr; i++) {
+ dd->ipath_sdma_descq[dtail].qw[0] |=
+ cpu_to_le64(1ULL << 14);
+ if (++dtail == dd->ipath_sdma_descq_cnt)
+ dtail = 0;
+ }
+ }
+
+ dd->ipath_sdma_descq_added += pkt->naddr;
+ pkt->added = dd->ipath_sdma_descq_added;
+ list_move_tail(&pkt->list, &pq->sent);
+ ret++;
+ }
+
+unlock_check_tail:
+ /* advance the tail on the chip if necessary */
+ if (dd->ipath_sdma_descq_tail != tail) {
+ wmb();
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+ dd->ipath_sdma_descq_tail = tail;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ return ret;
+}
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim)
+{
+ int ret = 0;
+ struct list_head list;
+ int npkts = 0;
+
+ INIT_LIST_HEAD(&list);
+
+ mutex_lock(&pq->lock);
+
+ if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ }
+
+ while (dim) {
+ const int mxp = 8;
+
+ ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+ if (ret <= 0)
+ goto done_unlock;
+ else {
+ dim -= ret;
+ iov += ret;
+ }
+
+ /* force packets onto the sdma hw queue... */
+ if (!list_empty(&list)) {
+ /*
+ * lazily clean hw queue. the 4 is a guess of about
+ * how many sdma descriptors a packet will take (it
+ * doesn't have to be perfect).
+ */
+ if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ }
+
+ ret = ipath_user_sdma_push_pkts(dd, pq, &list);
+ if (ret < 0)
+ goto done_unlock;
+ else {
+ npkts += ret;
+ pq->counter += ret;
+
+ if (!list_empty(&list))
+ goto done_unlock;
+ }
+ }
+ }
+
+done_unlock:
+ if (!list_empty(&list))
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+ mutex_unlock(&pq->lock);
+
+ return (ret < 0) ? ret : npkts;
+}
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ int ret = 0;
+
+ mutex_lock(&pq->lock);
+ ipath_user_sdma_hwqueue_clean(dd);
+ ret = ipath_user_sdma_queue_clean(dd, pq);
+ mutex_unlock(&pq->lock);
+
+ return ret;
+}
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
+{
+ return pq->sent_counter;
+}
+
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
+{
+ return pq->counter;
+}
+
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h
new file mode 100644
index 000000000000..fc76316c4a58
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct ipath_user_sdma_queue;
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim);
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq);
+
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq);
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
new file mode 100644
index 000000000000..ed2bbc2f7eae
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs.c
@@ -0,0 +1,2365 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+static unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+ S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_ipath_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+ "Maximum number of protection domains to support");
+
+static unsigned int ib_ipath_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_ipath_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+ "Maximum number of completion queue entries to support");
+
+unsigned int ib_ipath_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_ipath_max_qps = 16384;
+module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_ipath_max_sges = 0x60;
+module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_ipath_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+ "Maximum number of multicast groups to support");
+
+unsigned int ib_ipath_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+ "Maximum number of attached QPs to support");
+
+unsigned int ib_ipath_max_srqs = 1024;
+module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_ipath_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static unsigned int ib_ipath_disable_sma;
+module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = IPATH_POST_RECV_OK,
+ [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+ IPATH_PROCESS_NEXT_SEND_OK,
+ [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+ [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+};
+
+struct ipath_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+ *ibucontext)
+{
+ return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+static __be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(sge->vaddr, data, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sg_list = ss->sg_list;
+ struct ipath_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 ndesc = 1; /* count the header */
+
+ while (length) {
+ u32 len = sge.length;
+
+ if (len > length)
+ len = length;
+ if (len > sge.sge_length)
+ len = sge.sge_length;
+ BUG_ON(len == 0);
+ if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+ (len != length && (len & (sizeof(u32) - 1)))) {
+ ndesc = 0;
+ break;
+ }
+ ndesc++;
+ sge.vaddr += len;
+ sge.length -= len;
+ sge.sge_length -= len;
+ if (sge.sge_length == 0) {
+ if (--num_sge)
+ sge = *sg_list++;
+ } else if (sge.length == 0 && sge.mr != NULL) {
+ if (++sge.n >= IPATH_SEGSZ) {
+ if (++sge.m >= sge.mr->mapsz)
+ break;
+ sge.n = 0;
+ }
+ sge.vaddr =
+ sge.mr->map[sge.m]->segs[sge.n].vaddr;
+ sge.length =
+ sge.mr->map[sge.m]->segs[sge.n].length;
+ }
+ length -= len;
+ }
+ return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+ u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(data, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+ struct ipath_swqe *wqe;
+ u32 next;
+ int i;
+ int j;
+ int acc;
+ int ret;
+ unsigned long flags;
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (qp->ibqp.qp_type != IB_QPT_SMI &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ ret = -ENETDOWN;
+ goto bail;
+ }
+
+ /* Check that state is OK to post send. */
+ if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+ goto bail_inval;
+
+ /* IB spec says that num_sge == 0 is OK. */
+ if (wr->num_sge > qp->s_max_sge)
+ goto bail_inval;
+
+ /*
+ * Don't allow RDMA reads or atomic operations on UC or
+ * undefined operations.
+ * Make sure buffer is large enough to hold the result for atomics.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_UC) {
+ if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+ goto bail_inval;
+ } else if (qp->ibqp.qp_type == IB_QPT_UD) {
+ /* Check UD opcode */
+ if (wr->opcode != IB_WR_SEND &&
+ wr->opcode != IB_WR_SEND_WITH_IMM)
+ goto bail_inval;
+ /* Check UD destination address PD */
+ if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+ goto bail_inval;
+ } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+ goto bail_inval;
+
+ next = qp->s_head + 1;
+ if (next >= qp->s_size)
+ next = 0;
+ if (next == qp->s_last) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_swqe_ptr(qp, qp->s_head);
+ wqe->wr = *wr;
+ wqe->length = 0;
+ if (wr->num_sge) {
+ acc = wr->opcode >= IB_WR_RDMA_READ ?
+ IB_ACCESS_LOCAL_WRITE : 0;
+ for (i = 0, j = 0; i < wr->num_sge; i++) {
+ u32 length = wr->sg_list[i].length;
+ int ok;
+
+ if (length == 0)
+ continue;
+ ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+ &wr->sg_list[i], acc);
+ if (!ok)
+ goto bail_inval;
+ wqe->length += length;
+ j++;
+ }
+ wqe->wr.num_sge = j;
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_RC) {
+ if (wqe->length > 0x80000000U)
+ goto bail_inval;
+ } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+ goto bail_inval;
+ wqe->ssn = qp->s_ssn++;
+ qp->s_head = next;
+
+ ret = 0;
+ goto bail;
+
+bail_inval:
+ ret = -EINVAL;
+bail:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ int err = 0;
+
+ for (; wr; wr = wr->next) {
+ err = ipath_post_one_send(qp, wr);
+ if (err) {
+ *bad_wr = wr;
+ goto bail;
+ }
+ }
+
+ /* Try to do the send work in the caller's context. */
+ ipath_do_send((unsigned long) qp);
+
+bail:
+ return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_rwq *wq = qp->r_rq.wq;
+ unsigned long flags;
+ int ret;
+
+ /* Check that state is OK to post receive. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (; wr; wr = wr->next) {
+ struct ipath_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ next = wq->head + 1;
+ if (next >= qp->r_rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_qp_rcv - processing an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+ struct ipath_ib_header *hdr, int has_grh,
+ void *data, u32 tlen, struct ipath_qp *qp)
+{
+ /* Check for valid receive state. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ return;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (ib_ipath_disable_sma)
+ break;
+ /* FALLTHROUGH */
+ case IB_QPT_UD:
+ ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_RC:
+ ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_UC:
+ ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @arg: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
+ u32 tlen)
+{
+ struct ipath_ib_header *hdr = rhdr;
+ struct ipath_other_headers *ohdr;
+ struct ipath_qp *qp;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+ u16 lid;
+
+ if (unlikely(dev == NULL))
+ goto bail;
+
+ if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */
+ dev->rcv_errors++;
+ goto bail;
+ }
+
+ /* Check for a valid destination LID (see ch. 7.11.1). */
+ lid = be16_to_cpu(hdr->lrh[1]);
+ if (lid < IPATH_MULTICAST_LID_BASE) {
+ lid &= ~((1 << dev->dd->ipath_lmc) - 1);
+ if (unlikely(lid != dev->dd->ipath_lid)) {
+ dev->rcv_errors++;
+ goto bail;
+ }
+ }
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == IPATH_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == IPATH_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else {
+ dev->rcv_errors++;
+ goto bail;
+ }
+
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+ dev->opstats[opcode].n_bytes += tlen;
+ dev->opstats[opcode].n_packets++;
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
+ if (qp_num == IPATH_MULTICAST_QPN) {
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *p;
+
+ if (lnh != IPATH_LRH_GRH) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+ if (mcast == NULL) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ dev->n_multicast_rcv++;
+ list_for_each_entry_rcu(p, &mcast->qp_list, list)
+ ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
+ /*
+ * Notify ipath_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else {
+ qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+ if (qp) {
+ dev->n_unicast_rcv++;
+ ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
+ tlen, qp);
+ /*
+ * Notify ipath_destroy_qp() if it is waiting
+ * for us to finish.
+ */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ } else
+ dev->n_pkt_drops++;
+ }
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @arg: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(struct ipath_ibdev *dev)
+{
+ struct ipath_qp *resend = NULL;
+ struct ipath_qp *rnr = NULL;
+ struct list_head *last;
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ return;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ /* Start filling the next pending queue. */
+ if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+ dev->pending_index = 0;
+ /* Save any requests still in the new queue, they have timed out. */
+ last = &dev->pending[dev->pending_index];
+ while (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ list_del_init(&qp->timerwait);
+ qp->timer_next = resend;
+ resend = qp;
+ atomic_inc(&qp->refcount);
+ }
+ last = &dev->rnrwait;
+ if (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ if (--qp->s_rnr_timeout == 0) {
+ do {
+ list_del_init(&qp->timerwait);
+ qp->timer_next = rnr;
+ rnr = qp;
+ atomic_inc(&qp->refcount);
+ if (list_empty(last))
+ break;
+ qp = list_entry(last->next, struct ipath_qp,
+ timerwait);
+ } while (qp->s_rnr_timeout == 0);
+ }
+ }
+ /*
+ * We should only be in the started state if pma_sample_start != 0
+ */
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+ --dev->pma_sample_start == 0) {
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+ ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
+ &dev->ipath_rword,
+ &dev->ipath_spkts,
+ &dev->ipath_rpkts,
+ &dev->ipath_xmit_wait);
+ }
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+ if (dev->pma_sample_interval == 0) {
+ u64 ta, tb, tc, td, te;
+
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ ipath_snapshot_counters(dev->dd, &ta, &tb,
+ &tc, &td, &te);
+
+ dev->ipath_sword = ta - dev->ipath_sword;
+ dev->ipath_rword = tb - dev->ipath_rword;
+ dev->ipath_spkts = tc - dev->ipath_spkts;
+ dev->ipath_rpkts = td - dev->ipath_rpkts;
+ dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+ }
+ else
+ dev->pma_sample_interval--;
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ /* XXX What if timer fires again while this is running? */
+ while (resend != NULL) {
+ qp = resend;
+ resend = qp->timer_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_last != qp->s_tail &&
+ ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+ dev->n_timeouts++;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ while (rnr != NULL) {
+ qp = rnr;
+ rnr = qp->timer_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#endif
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+ u32 length, unsigned flush_wc)
+{
+ u32 extra = 0;
+ u32 data = 0;
+ u32 last;
+
+ while (1) {
+ u32 len = ss->sge.length;
+ u32 off;
+
+ if (len > length)
+ len = length;
+ if (len > ss->sge.sge_length)
+ len = ss->sge.sge_length;
+ BUG_ON(len == 0);
+ /* If the source address is not aligned, try to align it. */
+ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+ if (off) {
+ u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+ ~(sizeof(u32) - 1));
+ u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+ u32 y;
+
+ y = sizeof(u32) - off;
+ if (len > y)
+ len = y;
+ if (len + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, extra *
+ BITS_PER_BYTE);
+ len = sizeof(u32) - extra;
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, len, extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += len;
+ }
+ } else if (extra) {
+ /* Source address is aligned. */
+ u32 *addr = (u32 *) ss->sge.vaddr;
+ int shift = extra * BITS_PER_BYTE;
+ int ushift = 32 - shift;
+ u32 l = len;
+
+ while (l >= sizeof(u32)) {
+ u32 v = *addr;
+
+ data |= set_upper_bits(v, shift);
+ __raw_writel(data, piobuf);
+ data = get_upper_bits(v, ushift);
+ piobuf++;
+ addr++;
+ l -= sizeof(u32);
+ }
+ /*
+ * We still have 'extra' number of bytes leftover.
+ */
+ if (l) {
+ u32 v = *addr;
+
+ if (l + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, shift);
+ len -= l + extra - sizeof(u32);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, l,
+ extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += l;
+ }
+ } else if (len == length) {
+ last = data;
+ break;
+ }
+ } else if (len == length) {
+ u32 w;
+
+ /*
+ * Need to round up for the last dword in the
+ * packet.
+ */
+ w = (len + 3) >> 2;
+ __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+ piobuf += w - 1;
+ last = ((u32 *) ss->sge.vaddr)[w - 1];
+ break;
+ } else {
+ u32 w = len >> 2;
+
+ __iowrite32_copy(piobuf, ss->sge.vaddr, w);
+ piobuf += w;
+
+ extra = len & (sizeof(u32) - 1);
+ if (extra) {
+ u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+ /* Clear unused upper bytes */
+ data = clear_upper_bytes(v, extra, 0);
+ }
+ }
+ update_sge(ss, len);
+ length -= len;
+ }
+ /* Update address before sending packet. */
+ update_sge(ss, length);
+ if (flush_wc) {
+ /* must flush early everything before trigger word */
+ ipath_flush_wc();
+ __raw_writel(last, piobuf);
+ /* be sure trigger word is written */
+ ipath_flush_wc();
+ } else
+ __raw_writel(last, piobuf);
+}
+
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 8;
+ case IB_RATE_5_GBPS: return 4;
+ case IB_RATE_10_GBPS: return 2;
+ case IB_RATE_20_GBPS: return 1;
+ default: return 0;
+ }
+}
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+ switch (mult) {
+ case 8: return IB_RATE_2_5_GBPS;
+ case 4: return IB_RATE_5_GBPS;
+ case 2: return IB_RATE_10_GBPS;
+ case 1: return IB_RATE_20_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+ struct ipath_verbs_txreq *tx = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+
+ list_del(l);
+ tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+ struct ipath_verbs_txreq *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ list_add(&tx->txreq.list, &dev->txreq_free);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+static void sdma_complete(void *cookie, int status)
+{
+ struct ipath_verbs_txreq *tx = cookie;
+ struct ipath_qp *qp = tx->qp;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (tx->wqe)
+ ipath_send_complete(qp, tx->wqe, ibs);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ } else if (tx->wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, tx->wqe, ibs);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+ kfree(tx->txreq.map_addr);
+ put_txreq(dev, tx);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+ unsigned long flags;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ }
+}
+
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+ return (rcv_mult > snd_mult) ?
+ (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
+
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+ struct ipath_ib_header *hdr, u32 hdrwords,
+ struct ipath_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_devdata *dd = dev->dd;
+ struct ipath_verbs_txreq *tx;
+ u32 *piobuf;
+ u32 control;
+ u32 ndesc;
+ int ret;
+
+ tx = qp->s_tx;
+ if (tx) {
+ qp->s_tx = NULL;
+ /* resend previously constructed packet */
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+ if (ret) {
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ tx = get_txreq(dev);
+ if (!tx) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ /*
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ tx->qp = qp;
+ atomic_inc(&qp->refcount);
+ tx->wqe = qp->s_wqe;
+ tx->txreq.callback = sdma_complete;
+ tx->txreq.callback_cookie = tx;
+ tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+ IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+ control |= 1ULL << 31;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+ }
+
+ if (len) {
+ /*
+ * Don't try to DMA if it takes more descriptors than
+ * the queue holds.
+ */
+ ndesc = ipath_count_sge(ss, len);
+ if (ndesc >= dd->ipath_sdma_descq_cnt)
+ ndesc = 0;
+ } else
+ ndesc = 1;
+ if (ndesc) {
+ tx->hdr.pbc[0] = cpu_to_le32(plen);
+ tx->hdr.pbc[1] = cpu_to_le32(control);
+ memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+ tx->txreq.sg_count = ndesc;
+ tx->map_len = (hdrwords + 2) << 2;
+ tx->txreq.map_addr = &tx->hdr;
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+ if (ret) {
+ /* save ss and length in dwords */
+ tx->ss = ss;
+ tx->len = dwords;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ /* Allocate a buffer and copy the header and payload to it. */
+ tx->map_len = (plen + 1) << 2;
+ piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+ if (unlikely(piobuf == NULL)) {
+ ret = -EBUSY;
+ goto err_tx;
+ }
+ tx->txreq.map_addr = piobuf;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+ tx->txreq.sg_count = 1;
+
+ *piobuf++ = (__force u32) cpu_to_le32(plen);
+ *piobuf++ = (__force u32) cpu_to_le32(control);
+ memcpy(piobuf, hdr, hdrwords << 2);
+ ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ if (ret) {
+ tx->ss = NULL;
+ tx->len = 0;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ dev->n_unaligned++;
+ goto bail;
+
+err_tx:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ put_txreq(dev, tx);
+bail:
+ return ret;
+}
+
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+ struct ipath_ib_header *ibhdr, u32 hdrwords,
+ struct ipath_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+ u32 *hdr = (u32 *) ibhdr;
+ u32 __iomem *piobuf;
+ unsigned flush_wc;
+ u32 control;
+ int ret;
+ unsigned long flags;
+
+ piobuf = ipath_getpiobuf(dd, plen, NULL);
+ if (unlikely(piobuf == NULL)) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ /*
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+ control |= 1ULL << 31;
+
+ /*
+ * Write the length to the control qword plus any needed flags.
+ * We have to flush after the PBC for correctness on some cpus
+ * or WC buffer can be written out of order.
+ */
+ writeq(((u64) control << 32) | plen, piobuf);
+ piobuf += 2;
+
+ flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
+ if (len == 0) {
+ /*
+ * If there is just the header portion, must flush before
+ * writing last word of header for correctness, and after
+ * the last header word (trigger word).
+ */
+ if (flush_wc) {
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf, hdr, hdrwords - 1);
+ ipath_flush_wc();
+ __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+ ipath_flush_wc();
+ } else
+ __iowrite32_copy(piobuf, hdr, hdrwords);
+ goto done;
+ }
+
+ if (flush_wc)
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf, hdr, hdrwords);
+ piobuf += hdrwords;
+
+ /* The common case is aligned and contained in one segment. */
+ if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+ !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+ u32 *addr = (u32 *) ss->sge.vaddr;
+
+ /* Update address before sending packet. */
+ update_sge(ss, len);
+ if (flush_wc) {
+ __iowrite32_copy(piobuf, addr, dwords - 1);
+ /* must flush early everything before trigger word */
+ ipath_flush_wc();
+ __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+ /* be sure trigger word is written */
+ ipath_flush_wc();
+ } else
+ __iowrite32_copy(piobuf, addr, dwords);
+ goto done;
+ }
+ copy_io(piobuf, ss, len, flush_wc);
+done:
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+ u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+ u32 plen;
+ int ret;
+ u32 dwords = (len + 3) >> 2;
+
+ /*
+ * Calculate the send buffer trigger address.
+ * The +1 counts for the pbc control dword following the pbc length.
+ */
+ plen = hdrwords + dwords + 1;
+
+ /*
+ * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+ * can defer SDMA restart until link goes ACTIVE without
+ * worrying about just how we got there.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+ ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+ else
+ ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+
+ return ret;
+}
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait)
+{
+ int ret;
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+ *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+ *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+ *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+ *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_get_counters(struct ipath_devdata *dd,
+ struct ipath_verbs_counters *cntrs)
+{
+ struct ipath_cregs const *crp = dd->ipath_cregs;
+ int ret;
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ cntrs->symbol_error_counter =
+ ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
+ cntrs->link_error_recovery_counter =
+ ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
+ /*
+ * The link downed counter counts when the other side downs the
+ * connection. We add in the number of times we downed the link
+ * due to local link integrity errors to compensate.
+ */
+ cntrs->link_downed_counter =
+ ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
+ cntrs->port_rcv_errors =
+ ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+ ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+ ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+ ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+ ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+ ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+ ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+ ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+ ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+ ipath_snap_cntr(dd, crp->cr_badformatcnt) +
+ dd->ipath_rxfc_unsupvl_errs;
+ if (crp->cr_rxotherlocalphyerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+ if (crp->cr_rxvlerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
+ cntrs->port_rcv_remphys_errors =
+ ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+ cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+ cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+ cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+ cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+ cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
+ cntrs->local_link_integrity_errors =
+ crp->cr_locallinkintegrityerrcnt ?
+ ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+ ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+ dd->ipath_lli_errs : dd->ipath_lli_errors);
+ cntrs->excessive_buffer_overrun_errors =
+ crp->cr_excessbufferovflcnt ?
+ ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+ dd->ipath_overrun_thresh_errs;
+ cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+ ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @arg: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error that no buffers were
+ * available. Return 1 if we consumed all the PIO buffers and we still have
+ * QPs waiting for buffers (for now, just restart the send tasklet and
+ * return zero).
+ */
+int ipath_ib_piobufavail(struct ipath_ibdev *dev)
+{
+ struct list_head *list;
+ struct ipath_qp *qplist;
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ goto bail;
+
+ list = &dev->piowait;
+ qplist = NULL;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ while (!list_empty(list)) {
+ qp = list_entry(list->next, struct ipath_qp, piowait);
+ list_del_init(&qp->piowait);
+ qp->pio_next = qplist;
+ qplist = qp;
+ atomic_inc(&qp->refcount);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ while (qplist != NULL) {
+ qp = qplist;
+ qplist = qp->pio_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+
+bail:
+ return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+
+ memset(props, 0, sizeof(*props));
+
+ props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+ props->page_size_cap = PAGE_SIZE;
+ props->vendor_id =
+ IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
+ props->vendor_part_id = dev->dd->ipath_deviceid;
+ props->hw_ver = dev->dd->ipath_pcirev;
+
+ props->sys_image_guid = dev->sys_image_guid;
+
+ props->max_mr_size = ~0ull;
+ props->max_qp = ib_ipath_max_qps;
+ props->max_qp_wr = ib_ipath_max_qp_wrs;
+ props->max_sge = ib_ipath_max_sges;
+ props->max_sge_rd = ib_ipath_max_sges;
+ props->max_cq = ib_ipath_max_cqs;
+ props->max_ah = ib_ipath_max_ahs;
+ props->max_cqe = ib_ipath_max_cqes;
+ props->max_mr = dev->lk_table.max;
+ props->max_fmr = dev->lk_table.max;
+ props->max_map_per_fmr = 32767;
+ props->max_pd = ib_ipath_max_pds;
+ props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
+ props->max_qp_init_rd_atom = 255;
+ /* props->max_res_rd_atom */
+ props->max_srq = ib_ipath_max_srqs;
+ props->max_srq_wr = ib_ipath_max_srq_wrs;
+ props->max_srq_sge = ib_ipath_max_srq_sges;
+ /* props->local_ca_ack_delay */
+ props->atomic_cap = IB_ATOMIC_GLOB;
+ props->max_pkeys = ipath_get_npkeys(dev->dd);
+ props->max_mcast_grp = ib_ipath_max_mcast_grps;
+ props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+
+ return 0;
+}
+
+const u8 ipath_cvt_physportstate[32] = {
+ [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+ [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+ [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+ [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+ [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+ [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+ [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
+{
+ return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_devdata *dd = dev->dd;
+ enum ib_mtu mtu;
+ u16 lid = dd->ipath_lid;
+ u64 ibcstat;
+
+ memset(props, 0, sizeof(*props));
+ props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+ props->lmc = dd->ipath_lmc;
+ props->sm_lid = dev->sm_lid;
+ props->sm_sl = dev->sm_sl;
+ ibcstat = dd->ipath_lastibcstat;
+ /* map LinkState to IB portinfo values. */
+ props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
+ /* See phys_state_show() */
+ props->phys_state = /* MEA: assumes shift == 0 */
+ ipath_cvt_physportstate[dd->ipath_lastibcstat &
+ dd->ibcs_lts_mask];
+ props->port_cap_flags = dev->port_cap_flags;
+ props->gid_tbl_len = 1;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = ipath_get_npkeys(dd);
+ props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
+ dev->z_pkey_violations;
+ props->qkey_viol_cntr = dev->qkey_violations;
+ props->active_width = dd->ipath_link_width_active;
+ /* See rate_show() */
+ props->active_speed = dd->ipath_link_speed_active;
+ props->max_vl_num = 1; /* VLCap = VL0 */
+ props->init_type_reply = 0;
+
+ props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+ switch (dd->ipath_ibmtu) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default:
+ mtu = IB_MTU_2048;
+ }
+ props->active_mtu = mtu;
+ props->subnet_timeout = dev->subnet_timeout;
+
+ return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ int ret;
+
+ if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ ret = -EOPNOTSUPP;
+ goto bail;
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+ memcpy(device->node_desc, device_modify->node_desc, 64);
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+ to_idev(device)->sys_image_guid =
+ cpu_to_be64(device_modify->sys_image_guid);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ dev->port_cap_flags |= props->set_port_cap_mask;
+ dev->port_cap_flags &= ~props->clr_port_cap_mask;
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ dev->qkey_violations = 0;
+ return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ if (index >= 1) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ gid->global.subnet_prefix = dev->gid_prefix;
+ gid->global.interface_id = dev->dd->ipath_guid;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_pd *pd;
+ struct ib_pd *ret;
+
+ /*
+ * This is actually totally arbitrary. Some correctness tests
+ * assume there's a maximum number of PDs that can be allocated.
+ * We don't actually have this limit, but we fail the test if
+ * we allow allocations of more than we report for this value.
+ */
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock(&dev->n_pds_lock);
+ if (dev->n_pds_allocated == ib_ipath_max_pds) {
+ spin_unlock(&dev->n_pds_lock);
+ kfree(pd);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_pds_allocated++;
+ spin_unlock(&dev->n_pds_lock);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = udata != NULL;
+
+ ret = &pd->ibpd;
+
+bail:
+ return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct ipath_pd *pd = to_ipd(ibpd);
+ struct ipath_ibdev *dev = to_idev(ibpd->device);
+
+ spin_lock(&dev->n_pds_lock);
+ dev->n_pds_allocated--;
+ spin_unlock(&dev->n_pds_lock);
+
+ kfree(pd);
+
+ return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ struct ipath_ah *ah;
+ struct ib_ah *ret;
+ struct ipath_ibdev *dev = to_idev(pd->device);
+ unsigned long flags;
+
+ /* A multicast address requires a GRH (see ch. 8.4.1). */
+ if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+ ah_attr->dlid != IPATH_PERMISSIVE_LID &&
+ !(ah_attr->ah_flags & IB_AH_GRH)) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (ah_attr->dlid == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (ah_attr->port_num < 1 ||
+ ah_attr->port_num > pd->device->phys_port_cnt) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+ kfree(ah);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_ahs_allocated++;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ /* ib_create_ah() will initialize ah->ibah. */
+ ah->attr = *ah_attr;
+ ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
+
+ ret = &ah->ibah;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+ struct ipath_ibdev *dev = to_idev(ibah->device);
+ struct ipath_ah *ah = to_iah(ibah);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ dev->n_ahs_allocated--;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ kfree(ah);
+
+ return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct ipath_ah *ah = to_iah(ibah);
+
+ *ah_attr = ah->attr;
+ ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
+
+ return 0;
+}
+
+/**
+ * ipath_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_get_npkeys(struct ipath_devdata *dd)
+{
+ return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+/**
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+ unsigned ret;
+
+ /* always a kernel port, no locking needed */
+ if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+ ret = 0;
+ else
+ ret = dd->ipath_pd[0]->port_pkeys[index];
+
+ return ret;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ if (index >= ipath_get_npkeys(dev->dd)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *pkey = ipath_get_pkey(dev->dd, index);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct ipath_ucontext *context;
+ struct ib_ucontext *ret;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ ret = &context->ibucontext;
+
+bail:
+ return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(to_iucontext(context));
+ return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+static void __verbs_timer(unsigned long arg)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+ /* Handle verbs layer timeouts. */
+ ipath_ib_timer(dd->verbs_dev);
+
+ mod_timer(&dd->verbs_timer, jiffies + 1);
+}
+
+static int enable_timer(struct ipath_devdata *dd)
+{
+ /*
+ * Early chips had a design flaw where the chip and kernel idea
+ * of the tail register don't always agree, and therefore we won't
+ * get an interrupt on the next packet received.
+ * If the board supports per packet receive interrupts, use it.
+ * Otherwise, the timer function periodically checks for packets
+ * to cover this case.
+ * Either way, the timer is needed for verbs layer related
+ * processing.
+ */
+ if (dd->ipath_flags & IPATH_GPIO_INTR) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+ 0x2074076542310ULL);
+ /* Enable GPIO bit 2 interrupt */
+ dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+
+ init_timer(&dd->verbs_timer);
+ dd->verbs_timer.function = __verbs_timer;
+ dd->verbs_timer.data = (unsigned long)dd;
+ dd->verbs_timer.expires = jiffies + 1;
+ add_timer(&dd->verbs_timer);
+
+ return 0;
+}
+
+static int disable_timer(struct ipath_devdata *dd)
+{
+ /* Disable GPIO bit 2 interrupt */
+ if (dd->ipath_flags & IPATH_GPIO_INTR) {
+ /* Disable GPIO bit 2 interrupt */
+ dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ /*
+ * We might want to undo changes to debugportselect,
+ * but how?
+ */
+ }
+
+ del_timer_sync(&dd->verbs_timer);
+
+ return 0;
+}
+
+static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct ib_port_attr attr;
+ int err;
+
+ err = ipath_query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+ return 0;
+}
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return the allocated ipath_ibdev pointer or NULL on error.
+ */
+int ipath_register_ib_device(struct ipath_devdata *dd)
+{
+ struct ipath_verbs_counters cntrs;
+ struct ipath_ibdev *idev;
+ struct ib_device *dev;
+ struct ipath_verbs_txreq *tx;
+ unsigned i;
+ int ret;
+
+ idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+ if (idev == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dev = &idev->ibdev;
+
+ if (dd->ipath_sdma_descq_cnt) {
+ tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+ GFP_KERNEL);
+ if (tx == NULL) {
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ } else
+ tx = NULL;
+ idev->txreq_bufs = tx;
+
+ /* Only need to initialize non-zero fields. */
+ spin_lock_init(&idev->n_pds_lock);
+ spin_lock_init(&idev->n_ahs_lock);
+ spin_lock_init(&idev->n_cqs_lock);
+ spin_lock_init(&idev->n_qps_lock);
+ spin_lock_init(&idev->n_srqs_lock);
+ spin_lock_init(&idev->n_mcast_grps_lock);
+
+ spin_lock_init(&idev->qp_table.lock);
+ spin_lock_init(&idev->lk_table.lock);
+ idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
+ /* Set the prefix to the default value (see ch. 4.1.1) */
+ idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
+
+ ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+ if (ret)
+ goto err_qp;
+
+ /*
+ * The top ib_ipath_lkey_table_size bits are used to index the
+ * table. The lower 8 bits can be owned by the user (copied from
+ * the LKEY). The remaining bits act as a generation number or tag.
+ */
+ idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+ idev->lk_table.table = kzalloc(idev->lk_table.max *
+ sizeof(*idev->lk_table.table),
+ GFP_KERNEL);
+ if (idev->lk_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_lk;
+ }
+ INIT_LIST_HEAD(&idev->pending_mmaps);
+ spin_lock_init(&idev->pending_lock);
+ idev->mmap_offset = PAGE_SIZE;
+ spin_lock_init(&idev->mmap_offset_lock);
+ INIT_LIST_HEAD(&idev->pending[0]);
+ INIT_LIST_HEAD(&idev->pending[1]);
+ INIT_LIST_HEAD(&idev->pending[2]);
+ INIT_LIST_HEAD(&idev->piowait);
+ INIT_LIST_HEAD(&idev->rnrwait);
+ INIT_LIST_HEAD(&idev->txreq_free);
+ idev->pending_index = 0;
+ idev->port_cap_flags =
+ IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+ if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+ idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+ idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+ /* Snapshot current HW counters to "clear" them. */
+ ipath_get_counters(dd, &cntrs);
+ idev->z_symbol_error_counter = cntrs.symbol_error_counter;
+ idev->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+ idev->z_link_downed_counter = cntrs.link_downed_counter;
+ idev->z_port_rcv_errors = cntrs.port_rcv_errors;
+ idev->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+ idev->z_port_xmit_discards = cntrs.port_xmit_discards;
+ idev->z_port_xmit_data = cntrs.port_xmit_data;
+ idev->z_port_rcv_data = cntrs.port_rcv_data;
+ idev->z_port_xmit_packets = cntrs.port_xmit_packets;
+ idev->z_port_rcv_packets = cntrs.port_rcv_packets;
+ idev->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+ idev->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+ idev->z_vl15_dropped = cntrs.vl15_dropped;
+
+ for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+ list_add(&tx->txreq.list, &idev->txreq_free);
+
+ /*
+ * The system image GUID is supposed to be the same for all
+ * IB HCAs in a single system but since there can be other
+ * device types in the system, we can't be sure this is unique.
+ */
+ if (!sys_image_guid)
+ sys_image_guid = dd->ipath_guid;
+ idev->sys_image_guid = sys_image_guid;
+ idev->ib_unit = dd->ipath_unit;
+ idev->dd = dd;
+
+ strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+ dev->owner = THIS_MODULE;
+ dev->node_guid = dd->ipath_guid;
+ dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+ dev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ dev->node_type = RDMA_NODE_IB_CA;
+ dev->phys_port_cnt = 1;
+ dev->num_comp_vectors = 1;
+ dev->dma_device = &dd->pcidev->dev;
+ dev->query_device = ipath_query_device;
+ dev->modify_device = ipath_modify_device;
+ dev->query_port = ipath_query_port;
+ dev->modify_port = ipath_modify_port;
+ dev->query_pkey = ipath_query_pkey;
+ dev->query_gid = ipath_query_gid;
+ dev->alloc_ucontext = ipath_alloc_ucontext;
+ dev->dealloc_ucontext = ipath_dealloc_ucontext;
+ dev->alloc_pd = ipath_alloc_pd;
+ dev->dealloc_pd = ipath_dealloc_pd;
+ dev->create_ah = ipath_create_ah;
+ dev->destroy_ah = ipath_destroy_ah;
+ dev->query_ah = ipath_query_ah;
+ dev->create_srq = ipath_create_srq;
+ dev->modify_srq = ipath_modify_srq;
+ dev->query_srq = ipath_query_srq;
+ dev->destroy_srq = ipath_destroy_srq;
+ dev->create_qp = ipath_create_qp;
+ dev->modify_qp = ipath_modify_qp;
+ dev->query_qp = ipath_query_qp;
+ dev->destroy_qp = ipath_destroy_qp;
+ dev->post_send = ipath_post_send;
+ dev->post_recv = ipath_post_receive;
+ dev->post_srq_recv = ipath_post_srq_receive;
+ dev->create_cq = ipath_create_cq;
+ dev->destroy_cq = ipath_destroy_cq;
+ dev->resize_cq = ipath_resize_cq;
+ dev->poll_cq = ipath_poll_cq;
+ dev->req_notify_cq = ipath_req_notify_cq;
+ dev->get_dma_mr = ipath_get_dma_mr;
+ dev->reg_phys_mr = ipath_reg_phys_mr;
+ dev->reg_user_mr = ipath_reg_user_mr;
+ dev->dereg_mr = ipath_dereg_mr;
+ dev->alloc_fmr = ipath_alloc_fmr;
+ dev->map_phys_fmr = ipath_map_phys_fmr;
+ dev->unmap_fmr = ipath_unmap_fmr;
+ dev->dealloc_fmr = ipath_dealloc_fmr;
+ dev->attach_mcast = ipath_multicast_attach;
+ dev->detach_mcast = ipath_multicast_detach;
+ dev->process_mad = ipath_process_mad;
+ dev->mmap = ipath_mmap;
+ dev->dma_ops = &ipath_dma_mapping_ops;
+ dev->get_port_immutable = ipath_port_immutable;
+
+ snprintf(dev->node_desc, sizeof(dev->node_desc),
+ IPATH_IDSTR " %s", init_utsname()->nodename);
+
+ ret = ib_register_device(dev, NULL);
+ if (ret)
+ goto err_reg;
+
+ ret = ipath_verbs_register_sysfs(dev);
+ if (ret)
+ goto err_class;
+
+ enable_timer(dd);
+
+ goto bail;
+
+err_class:
+ ib_unregister_device(dev);
+err_reg:
+ kfree(idev->lk_table.table);
+err_lk:
+ kfree(idev->qp_table.table);
+err_qp:
+ kfree(idev->txreq_bufs);
+err_tx:
+ ib_dealloc_device(dev);
+ ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+ idev = NULL;
+
+bail:
+ dd->verbs_dev = idev;
+ return ret;
+}
+
+void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+ struct ib_device *ibdev = &dev->ibdev;
+ u32 qps_inuse;
+
+ ib_unregister_device(ibdev);
+
+ disable_timer(dev->dd);
+
+ if (!list_empty(&dev->pending[0]) ||
+ !list_empty(&dev->pending[1]) ||
+ !list_empty(&dev->pending[2]))
+ ipath_dev_err(dev->dd, "pending list not empty!\n");
+ if (!list_empty(&dev->piowait))
+ ipath_dev_err(dev->dd, "piowait list not empty!\n");
+ if (!list_empty(&dev->rnrwait))
+ ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+ if (!ipath_mcast_tree_empty())
+ ipath_dev_err(dev->dd, "multicast table memory leak!\n");
+ /*
+ * Note that ipath_unregister_ib_device() can be called before all
+ * the QPs are destroyed!
+ */
+ qps_inuse = ipath_free_all_qps(&dev->qp_table);
+ if (qps_inuse)
+ ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+ qps_inuse);
+ kfree(dev->qp_table.table);
+ kfree(dev->lk_table.table);
+ kfree(dev->txreq_bufs);
+ ib_dealloc_device(ibdev);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+
+ return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+ int ret;
+
+ ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
+ if (ret < 0)
+ goto bail;
+ strcat(buf, "\n");
+ ret = strlen(buf);
+
+bail:
+ return ret;
+}
+
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+ int i;
+ int len;
+
+ len = sprintf(buf,
+ "RC resends %d\n"
+ "RC no QACK %d\n"
+ "RC ACKs %d\n"
+ "RC SEQ NAKs %d\n"
+ "RC RDMA seq %d\n"
+ "RC RNR NAKs %d\n"
+ "RC OTH NAKs %d\n"
+ "RC timeouts %d\n"
+ "RC RDMA dup %d\n"
+ "piobuf wait %d\n"
+ "unaligned %d\n"
+ "PKT drops %d\n"
+ "WQE errs %d\n",
+ dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+ dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+ dev->n_other_naks, dev->n_timeouts,
+ dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+ dev->n_pkt_drops, dev->n_wqe_errs);
+ for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+ const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+ if (!si->n_packets && !si->n_bytes)
+ continue;
+ len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+ (unsigned long long) si->n_packets,
+ (unsigned long long) si->n_bytes);
+ }
+ return len;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct device_attribute *ipath_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+ ret = device_create_file(&dev->dev,
+ ipath_class_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+ return 0;
+bail:
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+ device_remove_file(&dev->dev, ipath_class_attributes[i]);
+ return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
new file mode 100644
index 000000000000..ec167e545e15
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs.h
@@ -0,0 +1,939 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC 4
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK 0x01
+#define IPATH_POST_RECV_OK 0x02
+#define IPATH_PROCESS_RECV_OK 0x04
+#define IPATH_PROCESS_SEND_OK 0x08
+#define IPATH_PROCESS_NEXT_SEND_OK 0x10
+#define IPATH_FLUSH_SEND 0x20
+#define IPATH_FLUSH_RECV 0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+ (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+struct ib_reth {
+ __be64 vaddr;
+ __be32 rkey;
+ __be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+ __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+ __be32 rkey;
+ __be64 swap_data;
+ __be64 compare_data;
+} __attribute__ ((packed));
+
+struct ipath_other_headers {
+ __be32 bth[3];
+ union {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be32 atomic_ack_eth[2];
+ } at;
+ __be32 imm_data;
+ __be32 aeth;
+ struct ib_atomic_eth atomic_eth;
+ } u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct ipath_other_headers oth;
+ } l;
+ struct ipath_other_headers oth;
+ } u;
+} __attribute__ ((packed));
+
+struct ipath_pio_header {
+ __le32 pbc[2];
+ struct ipath_ib_header hdr;
+} __attribute__ ((packed));
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+ struct list_head list;
+ struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+ struct rb_node rb_node;
+ union ib_gid mgid;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/* Protection domain */
+struct ipath_pd {
+ struct ib_pd ibpd;
+ int user; /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+ struct ib_ah ibah;
+ struct ib_ah_attr attr;
+};
+
+/*
+ * This structure is used by ipath_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct ipath_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct ipath_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ union {
+ /* these are actually size ibcq.cqe + 1 */
+ struct ib_uverbs_wc uqueue[0];
+ struct ib_wc kqueue[0];
+ };
+};
+
+/*
+ * The completion queue structure.
+ */
+struct ipath_cq {
+ struct ib_cq ibcq;
+ struct tasklet_struct comptask;
+ spinlock_t lock;
+ u8 notify;
+ u8 triggered;
+ struct ipath_cq_wc *queue;
+ struct ipath_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+ struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of ipath_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ struct ipath_segarray *map[0]; /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+ struct ipath_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct ipath_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct ipath_mregion mr; /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct ipath_rwqe {
+ u64 wr_id;
+ u8 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct ipath_rwq {
+ u32 head; /* new work requests posted to the head */
+ u32 tail; /* receives pull requests from here. */
+ struct ipath_rwqe wq[0];
+};
+
+struct ipath_rq {
+ struct ipath_rwq *wq;
+ spinlock_t lock;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+};
+
+struct ipath_srq {
+ struct ib_srq ibsrq;
+ struct ipath_rq rq;
+ struct ipath_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+struct ipath_sge_state {
+ struct ipath_sge *sg_list; /* next SGE to be used if any */
+ struct ipath_sge sge; /* progress state for the current SGE */
+ u8 num_sge;
+ u8 static_rate;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct ipath_ack_entry {
+ u8 opcode;
+ u8 sent;
+ u32 psn;
+ union {
+ struct ipath_sge_state rdma_sge;
+ u64 atomic_data;
+ };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct ipath_qp {
+ struct ib_qp ibqp;
+ struct ipath_qp *next; /* link list for QPN hash table */
+ struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */
+ struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */
+ struct list_head piowait; /* link for wait PIO buf */
+ struct list_head timerwait; /* link for waiting for timeouts */
+ struct ib_ah_attr remote_ah_attr;
+ struct ipath_ib_header s_hdr; /* next packet header to send */
+ atomic_t refcount;
+ wait_queue_head_t wait;
+ wait_queue_head_t wait_dma;
+ struct tasklet_struct s_task;
+ struct ipath_mmap_info *ip;
+ struct ipath_sge_state *s_cur_sge;
+ struct ipath_verbs_txreq *s_tx;
+ struct ipath_sge_state s_sge; /* current send request data */
+ struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
+ struct ipath_sge_state s_ack_rdma_sge;
+ struct ipath_sge_state s_rdma_read_sge;
+ struct ipath_sge_state r_sge; /* current receive data */
+ spinlock_t s_lock;
+ atomic_t s_dma_busy;
+ u16 s_pkt_delay;
+ u16 s_hdrwords; /* size of s_hdr in 32 bit words */
+ u32 s_cur_size; /* size of send packet in bytes */
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u64 r_wr_id; /* ID for current receive WQE */
+ unsigned long r_aflags;
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_psn; /* expected rcv packet sequence number */
+ u32 r_msn; /* message sequence number */
+ u8 state; /* QP state */
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_state; /* opcode of last packet received */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 r_flags;
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+ u8 qp_access_flags;
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+ u8 s_flags;
+ u8 s_dmult;
+ u8 s_draining;
+ u8 timeout; /* Timeout for this QP */
+ enum ib_mtu path_mtu;
+ u32 remote_qpn;
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+ u32 s_head; /* new entries added here */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_last; /* last un-ACK'ed entry */
+ u32 s_ssn; /* SSN of tail entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ struct ipath_swqe *s_wq; /* send work queue */
+ struct ipath_swqe *s_wqe;
+ struct ipath_sge *r_ud_sg_list;
+ struct ipath_rq r_rq; /* receive work queue */
+ struct ipath_sge r_sg_list[0]; /* verified SGEs */
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID 0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE 0x01
+#define IPATH_R_RDMAR_SEQ 0x02
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA.
+ */
+#define IPATH_S_SIGNAL_REQ_WR 0x01
+#define IPATH_S_FENCE_PENDING 0x02
+#define IPATH_S_RDMAR_PENDING 0x04
+#define IPATH_S_ACK_PENDING 0x08
+#define IPATH_S_BUSY 0x10
+#define IPATH_S_WAITING 0x20
+#define IPATH_S_WAIT_SSN_CREDIT 0x40
+#define IPATH_S_WAIT_DMA 0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+ IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
+
+#define IPATH_PSN_CREDIT 512
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq. This function does the array index computation.
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+ unsigned n)
+{
+ return (struct ipath_swqe *)((char *)qp->s_wq +
+ (sizeof(struct ipath_swqe) +
+ qp->s_max_sge *
+ sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rwq.wq. This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+ unsigned n)
+{
+ return (struct ipath_rwqe *)
+ ((char *) rq->wq->wq +
+ (sizeof(struct ipath_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+ atomic_t n_free;
+ void *page;
+};
+
+struct ipath_qp_table {
+ spinlock_t lock;
+ u32 last; /* last QP number allocated */
+ u32 max; /* size of the hash table */
+ u32 nmaps; /* size of the map table */
+ struct ipath_qp **table;
+ /* bit map of free numbers */
+ struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+ spinlock_t lock;
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+ u32 max; /* size of the table */
+ struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+ u64 n_packets; /* number of packets */
+ u64 n_bytes; /* total number of bytes */
+};
+
+struct ipath_ibdev {
+ struct ib_device ibdev;
+ struct ipath_devdata *dd;
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock;
+ u32 mmap_offset;
+ int ib_unit; /* This is the device number */
+ u16 sm_lid; /* in host order */
+ u8 sm_sl;
+ u8 mkeyprot;
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+
+ /* The following fields are really per port. */
+ struct ipath_qp_table qp_table;
+ struct ipath_lkey_table lk_table;
+ struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */
+ struct list_head piowait; /* list for wait PIO buf */
+ struct list_head txreq_free;
+ void *txreq_bufs;
+ /* list of QPs waiting for RNR timer */
+ struct list_head rnrwait;
+ spinlock_t pending_lock;
+ __be64 sys_image_guid; /* in network order */
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+
+ u32 n_pds_allocated; /* number of PDs allocated for device */
+ spinlock_t n_pds_lock;
+ u32 n_ahs_allocated; /* number of AHs allocated for device */
+ spinlock_t n_ahs_lock;
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ spinlock_t n_qps_lock;
+ u32 n_srqs_allocated; /* number of SRQs allocated for device */
+ spinlock_t n_srqs_lock;
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+
+ u64 ipath_sword; /* total dwords sent (sample result) */
+ u64 ipath_rword; /* total dwords received (sample result) */
+ u64 ipath_spkts; /* total packets sent (sample result) */
+ u64 ipath_rpkts; /* total packets received (sample result) */
+ /* # of ticks no data sent (sample result) */
+ u64 ipath_xmit_wait;
+ u64 rcv_errors; /* # of packets with SW detected rcv errs */
+ u64 n_unicast_xmit; /* total unicast packets sent */
+ u64 n_unicast_rcv; /* total unicast packets received */
+ u64 n_multicast_xmit; /* total multicast packets sent */
+ u64 n_multicast_rcv; /* total multicast packets received */
+ u64 z_symbol_error_counter; /* starting count for PMA */
+ u64 z_link_error_recovery_counter; /* starting count for PMA */
+ u64 z_link_downed_counter; /* starting count for PMA */
+ u64 z_port_rcv_errors; /* starting count for PMA */
+ u64 z_port_rcv_remphys_errors; /* starting count for PMA */
+ u64 z_port_xmit_discards; /* starting count for PMA */
+ u64 z_port_xmit_data; /* starting count for PMA */
+ u64 z_port_rcv_data; /* starting count for PMA */
+ u64 z_port_xmit_packets; /* starting count for PMA */
+ u64 z_port_rcv_packets; /* starting count for PMA */
+ u32 z_pkey_violations; /* starting count for PMA */
+ u32 z_local_link_integrity_errors; /* starting count for PMA */
+ u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */
+ u32 z_vl15_dropped; /* starting count for PMA */
+ u32 n_rc_resends;
+ u32 n_rc_acks;
+ u32 n_rc_qacks;
+ u32 n_seq_naks;
+ u32 n_rdma_seq;
+ u32 n_rnr_naks;
+ u32 n_other_naks;
+ u32 n_timeouts;
+ u32 n_pkt_drops;
+ u32 n_vl15_dropped;
+ u32 n_wqe_errs;
+ u32 n_rdma_dup_busy;
+ u32 n_piowait;
+ u32 n_unaligned;
+ u32 port_cap_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 qkey_violations;
+ u16 mkey_violations;
+ u16 mkey_lease_period;
+ u16 pending_index; /* which pending queue is active */
+ u8 pma_sample_status;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+ struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_verbs_counters {
+ u64 symbol_error_counter;
+ u64 link_error_recovery_counter;
+ u64 link_downed_counter;
+ u64 port_rcv_errors;
+ u64 port_rcv_remphys_errors;
+ u64 port_xmit_discards;
+ u64 port_xmit_data;
+ u64 port_rcv_data;
+ u64 port_xmit_packets;
+ u64 port_rcv_packets;
+ u32 local_link_integrity_errors;
+ u32 excessive_buffer_overrun_errors;
+ u32 vl15_dropped;
+};
+
+struct ipath_verbs_txreq {
+ struct ipath_qp *qp;
+ struct ipath_swqe *wqe;
+ u32 map_len;
+ u32 len;
+ struct ipath_sge_state *ss;
+ struct ipath_pio_header hdr;
+ struct ipath_sdma_txreq txreq;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+ if (qp->s_flags & IPATH_S_ANY_WAIT)
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ if (!(qp->s_flags & IPATH_S_BUSY))
+ tasklet_hi_schedule(&qp->s_task);
+}
+
+int ipath_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << 8;
+}
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait);
+
+int ipath_get_counters(struct ipath_devdata *dd,
+ struct ipath_verbs_counters *cntrs);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
+
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+ u32 hdrwords, struct ipath_sge_state *ss, u32 len);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+ struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+ struct ib_sge *sge, int acc);
+
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+ int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_release_mmap_info(struct kref *ref);
+
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj);
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+ struct ipath_mmap_info *ip,
+ u32 size, void *obj);
+
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+ u32 *lengthp, struct ipath_sge_state *ss);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr,
+ u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+ enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
+
+int ipath_make_uc_req(struct ipath_qp *qp);
+
+int ipath_make_ud_req(struct ipath_qp *qp);
+
+int ipath_register_ib_device(struct ipath_devdata *);
+
+void ipath_unregister_ib_device(struct ipath_ibdev *);
+
+void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
+
+int ipath_ib_piobufavail(struct ipath_ibdev *);
+
+unsigned ipath_get_npkeys(struct ipath_devdata *);
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *);
+
+unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
+extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern unsigned int ib_ipath_max_cqes;
+
+extern unsigned int ib_ipath_max_cqs;
+
+extern unsigned int ib_ipath_max_qp_wrs;
+
+extern unsigned int ib_ipath_max_qps;
+
+extern unsigned int ib_ipath_max_sges;
+
+extern unsigned int ib_ipath_max_mcast_grps;
+
+extern unsigned int ib_ipath_max_mcast_qp_attached;
+
+extern unsigned int ib_ipath_max_srqs;
+
+extern unsigned int ib_ipath_max_srq_sges;
+
+extern unsigned int ib_ipath_max_srq_wrs;
+
+extern const u32 ib_ipath_rnr_table[];
+
+extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
+
+#endif /* IPATH_VERBS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
new file mode 100644
index 000000000000..6216ea923853
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table of GID to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+ struct ipath_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+ if (!mqp)
+ goto bail;
+
+ mqp->qp = qp;
+ atomic_inc(&qp->refcount);
+
+bail:
+ return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+ struct ipath_qp *qp = mqp->qp;
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+
+ kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+ struct ipath_mcast *mcast;
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast)
+ goto bail;
+
+ mcast->mgid = *mgid;
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+ mcast->n_attached = 0;
+
+bail:
+ return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+ struct ipath_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ ipath_mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+ struct ipath_mcast *mcast;
+
+ spin_lock_irqsave(&mcast_lock, flags);
+ n = mcast_tree.rb_node;
+ while (n) {
+ int ret;
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else {
+ atomic_inc(&mcast->refcount);
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ goto bail;
+ }
+ }
+ spin_unlock_irqrestore(&mcast_lock, flags);
+
+ mcast = NULL;
+
+bail:
+ return mcast;
+}
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added. Return EEXIST if the GID was already in
+ * the table but the QP was added. Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int ipath_mcast_add(struct ipath_ibdev *dev,
+ struct ipath_mcast *mcast,
+ struct ipath_mcast_qp *mqp)
+{
+ struct rb_node **n = &mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ int ret;
+
+ spin_lock_irq(&mcast_lock);
+
+ while (*n) {
+ struct ipath_mcast *tmcast;
+ struct ipath_mcast_qp *p;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ ret = ESRCH;
+ goto bail;
+ }
+ }
+ if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ tmcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ ret = EEXIST;
+ goto bail;
+ }
+
+ spin_lock(&dev->n_mcast_grps_lock);
+ if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
+ spin_unlock(&dev->n_mcast_grps_lock);
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ dev->n_mcast_grps_allocated++;
+ spin_unlock(&dev->n_mcast_grps_lock);
+
+ mcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+ ret = 0;
+
+bail:
+ spin_unlock_irq(&mcast_lock);
+
+ return ret;
+}
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *mqp;
+ int ret;
+
+ /*
+ * Allocate data structures since its better to do this outside of
+ * spin locks and it will most likely be needed.
+ */
+ mcast = ipath_mcast_alloc(gid);
+ if (mcast == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ mqp = ipath_mcast_qp_alloc(qp);
+ if (mqp == NULL) {
+ ipath_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ switch (ipath_mcast_add(dev, mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: can't attach the same QP twice. */
+ ipath_mcast_qp_free(mqp);
+ ipath_mcast_free(mcast);
+ ret = -EINVAL;
+ goto bail;
+ case EEXIST: /* The mcast wasn't used */
+ ipath_mcast_free(mcast);
+ break;
+ case ENOMEM:
+ /* Exceeded the maximum number of mcast groups. */
+ ipath_mcast_qp_free(mqp);
+ ipath_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ default:
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_mcast *mcast = NULL;
+ struct ipath_mcast_qp *p, *tmp;
+ struct rb_node *n;
+ int last = 0;
+ int ret;
+
+ spin_lock_irq(&mcast_lock);
+
+ /* Find the GID in the mcast table. */
+ n = mcast_tree.rb_node;
+ while (1) {
+ if (n == NULL) {
+ spin_unlock_irq(&mcast_lock);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward
+ * link until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&mcast_lock);
+
+ if (p) {
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ ipath_mcast_qp_free(p);
+ }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ ipath_mcast_free(mcast);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
+ dev->n_mcast_grps_allocated--;
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+ return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
new file mode 100644
index 000000000000..1a7e20a75149
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+ return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
new file mode 100644
index 000000000000..7b6e4c843e19
--- /dev/null
+++ b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+ int ret = 0;
+ u64 pioaddr, piolen;
+ unsigned bits;
+ const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+ const size_t len = pci_resource_len(dd->pcidev, 0);
+
+ /*
+ * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+ * chip. Linux (possibly the hardware) requires it to be on a power
+ * of 2 address matching the length (which has to be a power of 2).
+ * For rev1, that means the base address, for rev2, it will be just
+ * the PIO buffers themselves.
+ * For chips with two sets of buffers, the calculations are
+ * somewhat more complicated; we need to sum, and the piobufbase
+ * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+ * The buffers are still packed, so a single range covers both.
+ */
+ if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
+ unsigned long pio2kbase, pio4kbase;
+ pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
+ pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
+ if (pio2kbase < pio4kbase) { /* all, for now */
+ pioaddr = addr + pio2kbase;
+ piolen = pio4kbase - pio2kbase +
+ dd->ipath_piobcnt4k * dd->ipath_4kalign;
+ } else {
+ pioaddr = addr + pio4kbase;
+ piolen = pio2kbase - pio4kbase +
+ dd->ipath_piobcnt2k * dd->ipath_palign;
+ }
+ } else { /* single buffer size (2K, currently) */
+ pioaddr = addr + dd->ipath_piobufbase;
+ piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
+ dd->ipath_piobcnt4k * dd->ipath_4kalign;
+ }
+
+ for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+ /* do nothing */ ;
+
+ if (piolen != (1ULL << bits)) {
+ piolen >>= bits;
+ while (piolen >>= 1)
+ bits++;
+ piolen = 1ULL << (bits + 1);
+ }
+ if (pioaddr & (piolen - 1)) {
+ u64 atmp;
+ ipath_dbg("pioaddr %llx not on right boundary for size "
+ "%llx, fixing\n",
+ (unsigned long long) pioaddr,
+ (unsigned long long) piolen);
+ atmp = pioaddr & ~(piolen - 1);
+ if (atmp < addr || (atmp + piolen) > (addr + len)) {
+ ipath_dev_err(dd, "No way to align address/size "
+ "(%llx/%llx), no WC mtrr\n",
+ (unsigned long long) atmp,
+ (unsigned long long) piolen << 1);
+ ret = -ENODEV;
+ } else {
+ ipath_dbg("changing WC base from %llx to %llx, "
+ "len from %llx to %llx\n",
+ (unsigned long long) pioaddr,
+ (unsigned long long) atmp,
+ (unsigned long long) piolen,
+ (unsigned long long) piolen << 1);
+ pioaddr = atmp;
+ piolen <<= 1;
+ }
+ }
+
+ if (!ret) {
+ dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
+ if (dd->wc_cookie < 0) {
+ ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n");
+ ret = -ENODEV;
+ } else if (dd->wc_cookie == 0)
+ ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
+ else
+ ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
+ }
+
+ return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+ arch_phys_wc_del(dd->wc_cookie);
+}