aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/staging/lustre/lnet
diff options
context:
space:
mode:
authorPeng Tao <bergwolf@gmail.com>2013-05-02 16:46:55 +0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2013-05-14 13:54:50 -0400
commitd7e09d0397e84eefbabfd9cb353221f3c6448d83 (patch)
treecaeeaffa2967acfe0988dc65cf04089c0a27c7aa /drivers/staging/lustre/lnet
parentstaging: vt6656: remove unused definitions (diff)
downloadlinux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.tar.xz
linux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.zip
staging: add Lustre file system client support
Lustre is the most deployed distributed file system in the HPC (High Performance Computing) world. The patch adds its client side support. The code is not very clean and needs to live in drivers/staging for some time for continuing cleanup work. See drivers/staging/lustre/TODO for details. The code is based on Lustre master commit faefbfc04 commit faefbfc0460bc00f2ee4c1c1c86aa1e39b9eea49 Author: Alex Zhuravlev <alexey.zhuravlev@intel.com> Date: Tue Apr 30 23:05:21 2013 +0400 LU-3244 utils: tunefs.lustre should preserve virgin label Plus a few under-review patches on Whamcloud gerrit: 3.8 kernel support: http://review.whamcloud.com/#change,5973 http://review.whamcloud.com/#change,5974 http://review.whamcloud.com/#change,5768 http://review.whamcloud.com/#change,5781 http://review.whamcloud.com/#change,5763 http://review.whamcloud.com/#change,5613 http://review.whamcloud.com/#change,5655 3.9 kernel support: http://review.whamcloud.com/#change,5898 http://review.whamcloud.com/#change,5899 Kconfig/Kbuild: http://review.whamcloud.com/#change,4646 http://review.whamcloud.com/#change,4644 libcfs cleanup: http://review.whamcloud.com/#change,2831 http://review.whamcloud.com/#change,4775 http://review.whamcloud.com/#change,4776 http://review.whamcloud.com/#change,4777 http://review.whamcloud.com/#change,4778 http://review.whamcloud.com/#change,4779 http://review.whamcloud.com/#change,4780 All starting/trailing whitespaces are removed, to match kernel coding style. Also ran scripts/cleanfile on all lustre source files. [maked the Kconfig depend on BROKEN as the recent procfs changes causes this to fail - gregkh] Signed-off-by: Peng Tao <tao.peng@emc.com> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/staging/lustre/lnet')
-rw-r--r--drivers/staging/lustre/lnet/Kconfig40
-rw-r--r--drivers/staging/lustre/lnet/Makefile1
-rw-r--r--drivers/staging/lustre/lnet/klnds/Makefile1
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile5
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c3256
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h1057
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c3529
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c493
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/Makefile7
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c2902
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h602
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c2664
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c1088
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h91
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c198
-rw-r--r--drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c797
-rw-r--r--drivers/staging/lustre/lnet/lnet/Makefile8
-rw-r--r--drivers/staging/lustre/lnet/lnet/acceptor.c527
-rw-r--r--drivers/staging/lustre/lnet/lnet/api-errno.c39
-rw-r--r--drivers/staging/lustre/lnet/lnet/api-ni.c1941
-rw-r--r--drivers/staging/lustre/lnet/lnet/config.c1264
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-eq.c447
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-md.c451
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-me.c297
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-move.c2441
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-msg.c650
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-ptl.c938
-rw-r--r--drivers/staging/lustre/lnet/lnet/lo.c120
-rw-r--r--drivers/staging/lustre/lnet/lnet/module.c154
-rw-r--r--drivers/staging/lustre/lnet/lnet/peer.c337
-rw-r--r--drivers/staging/lustre/lnet/lnet/router.c1693
-rw-r--r--drivers/staging/lustre/lnet/lnet/router_proc.c950
-rw-r--r--drivers/staging/lustre/lnet/selftest/Makefile6
-rw-r--r--drivers/staging/lustre/lnet/selftest/brw_test.c499
-rw-r--r--drivers/staging/lustre/lnet/selftest/conctl.c931
-rw-r--r--drivers/staging/lustre/lnet/selftest/conrpc.c1397
-rw-r--r--drivers/staging/lustre/lnet/selftest/conrpc.h146
-rw-r--r--drivers/staging/lustre/lnet/selftest/console.c2071
-rw-r--r--drivers/staging/lustre/lnet/selftest/console.h232
-rw-r--r--drivers/staging/lustre/lnet/selftest/framework.c1814
-rw-r--r--drivers/staging/lustre/lnet/selftest/module.c169
-rw-r--r--drivers/staging/lustre/lnet/selftest/ping_test.c229
-rw-r--r--drivers/staging/lustre/lnet/selftest/rpc.c1665
-rw-r--r--drivers/staging/lustre/lnet/selftest/rpc.h302
-rw-r--r--drivers/staging/lustre/lnet/selftest/selftest.h611
-rw-r--r--drivers/staging/lustre/lnet/selftest/timer.c253
-rw-r--r--drivers/staging/lustre/lnet/selftest/timer.h53
47 files changed, 39366 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644
index 000000000000..00850eeb6a8c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Kconfig
@@ -0,0 +1,40 @@
+config LNET
+ tristate "Lustre networking subsystem"
+ depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+ int "Lustre lnet max transfer payload (default 2MB)"
+ depends on LUSTRE_FS
+ default "1048576"
+ help
+ This option defines the maximum size of payload in bytes that lnet
+ can put into its transport.
+
+ If unsure, use default.
+
+config LNET_SELFTEST
+ tristate "Lustre networking self testing"
+ depends on LNET
+ help
+ Choose Y here if you want to do lnet self testing. To compile this
+ as a module, choose M here: the module will be called lnet_selftest.
+
+ To compile this as a kernel modules, choose M here and it will be
+ called lnet_selftest.
+
+ If unsure, say N.
+
+ See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+ tristate "LNET infiniband support"
+ depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+ default LNET && INFINIBAND
+ help
+ This option allows the LNET users to use infiniband as an
+ RDMA-enabled transport.
+
+ To compile this as a kernel module, choose M here and it will be
+ called ko2iblnd.
+
+ If unsure, say N.
diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644
index 000000000000..374212b1555a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644
index 000000000000..c23e4f67f837
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/ socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644
index 000000000000..71b7d8418357
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644
index 000000000000..f4b958bbe5a3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -0,0 +1,3256 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+lnd_t the_o2iblnd = {
+ .lnd_type = O2IBLND,
+ .lnd_startup = kiblnd_startup,
+ .lnd_shutdown = kiblnd_shutdown,
+ .lnd_ctl = kiblnd_ctl,
+ .lnd_query = kiblnd_query,
+ .lnd_send = kiblnd_send,
+ .lnd_recv = kiblnd_recv,
+};
+
+kib_data_t kiblnd_data;
+
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
+
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ /* ensure I don't return 0 (== no checksum) */
+ return (sum == 0) ? 1 : sum;
+}
+
+static char *
+kiblnd_msgtype2str(int type)
+{
+ switch (type) {
+ case IBLND_MSG_CONNREQ:
+ return "CONNREQ";
+
+ case IBLND_MSG_CONNACK:
+ return "CONNACK";
+
+ case IBLND_MSG_NOOP:
+ return "NOOP";
+
+ case IBLND_MSG_IMMEDIATE:
+ return "IMMEDIATE";
+
+ case IBLND_MSG_PUT_REQ:
+ return "PUT_REQ";
+
+ case IBLND_MSG_PUT_NAK:
+ return "PUT_NAK";
+
+ case IBLND_MSG_PUT_ACK:
+ return "PUT_ACK";
+
+ case IBLND_MSG_PUT_DONE:
+ return "PUT_DONE";
+
+ case IBLND_MSG_GET_REQ:
+ return "GET_REQ";
+
+ case IBLND_MSG_GET_DONE:
+ return "GET_DONE";
+
+ default:
+ return "???";
+ }
+}
+
+static int
+kiblnd_msgtype2size(int type)
+{
+ const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+ switch (type) {
+ case IBLND_MSG_CONNREQ:
+ case IBLND_MSG_CONNACK:
+ return hdr_size + sizeof(kib_connparams_t);
+
+ case IBLND_MSG_NOOP:
+ return hdr_size;
+
+ case IBLND_MSG_IMMEDIATE:
+ return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+ case IBLND_MSG_PUT_REQ:
+ return hdr_size + sizeof(kib_putreq_msg_t);
+
+ case IBLND_MSG_PUT_ACK:
+ return hdr_size + sizeof(kib_putack_msg_t);
+
+ case IBLND_MSG_GET_REQ:
+ return hdr_size + sizeof(kib_get_msg_t);
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ return hdr_size + sizeof(kib_completion_msg_t);
+ default:
+ return -1;
+ }
+}
+
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+ kib_rdma_desc_t *rd;
+ int nob;
+ int n;
+ int i;
+
+ LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+ msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+ rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
+ &msg->ibm_u.get.ibgm_rd :
+ &msg->ibm_u.putack.ibpam_rd;
+
+ if (flip) {
+ __swab32s(&rd->rd_key);
+ __swab32s(&rd->rd_nfrags);
+ }
+
+ n = rd->rd_nfrags;
+
+ if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+ CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+ n, IBLND_MAX_RDMA_FRAGS);
+ return 1;
+ }
+
+ nob = offsetof (kib_msg_t, ibm_u) +
+ kiblnd_rd_msg_size(rd, msg->ibm_type, n);
+
+ if (msg->ibm_nob < nob) {
+ CERROR("Short %s: %d(%d)\n",
+ kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
+ return 1;
+ }
+
+ if (!flip)
+ return 0;
+
+ for (i = 0; i < n; i++) {
+ __swab32s(&rd->rd_frags[i].rf_nob);
+ __swab64s(&rd->rd_frags[i].rf_addr);
+ }
+
+ return 0;
+}
+
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+ int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+ kib_net_t *net = ni->ni_data;
+
+ /* CAVEAT EMPTOR! all message fields not set here should have been
+ * initialised previously. */
+ msg->ibm_magic = IBLND_MSG_MAGIC;
+ msg->ibm_version = version;
+ /* ibm_type */
+ msg->ibm_credits = credits;
+ /* ibm_nob */
+ msg->ibm_cksum = 0;
+ msg->ibm_srcnid = ni->ni_nid;
+ msg->ibm_srcstamp = net->ibn_incarnation;
+ msg->ibm_dstnid = dstnid;
+ msg->ibm_dststamp = dststamp;
+
+ if (*kiblnd_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+ }
+}
+
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+ const int hdr_size = offsetof(kib_msg_t, ibm_u);
+ __u32 msg_cksum;
+ __u16 version;
+ int msg_nob;
+ int flip;
+
+ /* 6 bytes are enough to have received magic + version */
+ if (nob < 6) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+ flip = 0;
+ } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+ flip = 1;
+ } else {
+ CERROR("Bad magic: %08x\n", msg->ibm_magic);
+ return -EPROTO;
+ }
+
+ version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+ if (version != IBLND_MSG_VERSION &&
+ version != IBLND_MSG_VERSION_1) {
+ CERROR("Bad version: %x\n", version);
+ return -EPROTO;
+ }
+
+ if (nob < hdr_size) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+ if (msg_nob > nob) {
+ CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+ return -EPROTO;
+ }
+
+ /* checksum must be computed with ibm_cksum zero and BEFORE anything
+ * gets flipped */
+ msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+ msg->ibm_cksum = 0;
+ if (msg_cksum != 0 &&
+ msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+ CERROR("Bad checksum\n");
+ return -EPROTO;
+ }
+
+ msg->ibm_cksum = msg_cksum;
+
+ if (flip) {
+ /* leave magic unflipped as a clue to peer endianness */
+ msg->ibm_version = version;
+ CLASSERT (sizeof(msg->ibm_type) == 1);
+ CLASSERT (sizeof(msg->ibm_credits) == 1);
+ msg->ibm_nob = msg_nob;
+ __swab64s(&msg->ibm_srcnid);
+ __swab64s(&msg->ibm_srcstamp);
+ __swab64s(&msg->ibm_dstnid);
+ __swab64s(&msg->ibm_dststamp);
+ }
+
+ if (msg->ibm_srcnid == LNET_NID_ANY) {
+ CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+ return -EPROTO;
+ }
+
+ if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+ CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+ msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+ return -EPROTO;
+ }
+
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Unknown message type %x\n", msg->ibm_type);
+ return -EPROTO;
+
+ case IBLND_MSG_NOOP:
+ case IBLND_MSG_IMMEDIATE:
+ case IBLND_MSG_PUT_REQ:
+ break;
+
+ case IBLND_MSG_PUT_ACK:
+ case IBLND_MSG_GET_REQ:
+ if (kiblnd_unpack_rd(msg, flip))
+ return -EPROTO;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ if (flip)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+ break;
+
+ case IBLND_MSG_CONNREQ:
+ case IBLND_MSG_CONNACK:
+ if (flip) {
+ __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+ __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+ __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+ }
+ break;
+ }
+ return 0;
+}
+
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ kib_net_t *net = ni->ni_data;
+ int cpt = lnet_cpt_of_nid(nid);
+ unsigned long flags;
+
+ LASSERT(net != NULL);
+ LASSERT(nid != LNET_NID_ANY);
+
+ LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+ if (peer == NULL) {
+ CERROR("Cannot allocate peer\n");
+ return -ENOMEM;
+ }
+
+ memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+ peer->ibp_ni = ni;
+ peer->ibp_nid = nid;
+ peer->ibp_error = 0;
+ peer->ibp_last_alive = 0;
+ atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
+
+ INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
+ INIT_LIST_HEAD(&peer->ibp_conns);
+ INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT (net->ibn_shutdown == 0);
+
+ /* npeers only grows with the global lock held */
+ atomic_inc(&net->ibn_npeers);
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ *peerp = peer;
+ return 0;
+}
+
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+ kib_net_t *net = peer->ibp_ni->ni_data;
+
+ LASSERT (net != NULL);
+ LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+ LASSERT (!kiblnd_peer_active(peer));
+ LASSERT (peer->ibp_connecting == 0);
+ LASSERT (peer->ibp_accepting == 0);
+ LASSERT (list_empty(&peer->ibp_conns));
+ LASSERT (list_empty(&peer->ibp_tx_queue));
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+
+ /* NB a peer's connections keep a reference on their peer until
+ * they are destroyed, so we can be assured that _all_ state to do
+ * with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ atomic_dec(&net->ibn_npeers);
+}
+
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+ /* the caller is responsible for accounting the additional reference
+ * that this creates */
+ struct list_head *peer_list = kiblnd_nid2peerlist(nid);
+ struct list_head *tmp;
+ kib_peer_t *peer;
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+ LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns)); /* active conn */
+
+ if (peer->ibp_nid != nid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+ peer, libcfs_nid2str(nid),
+ atomic_read(&peer->ibp_refcount),
+ peer->ibp_version);
+ return peer;
+ }
+ return NULL;
+}
+
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+ LASSERT (list_empty(&peer->ibp_conns));
+
+ LASSERT (kiblnd_peer_active(peer));
+ list_del_init(&peer->ibp_list);
+ /* lose peerlist's ref */
+ kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+ lnet_nid_t *nidp, int *count)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ int i;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+
+ list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (index-- > 0)
+ continue;
+
+ *nidp = peer->ibp_nid;
+ *count = atomic_read(&peer->ibp_refcount);
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ return 0;
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ return -ENOENT;
+}
+
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ kib_conn_t *conn;
+
+ if (list_empty(&peer->ibp_conns)) {
+ kiblnd_unlink_peer_locked(peer);
+ } else {
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ kiblnd_close_conn_locked(conn, 0);
+ }
+ /* NB closing peer's last conn unlinked it. */
+ }
+ /* NB peer now unlinked; might even be freed if the peer table had the
+ * last ref on it. */
+}
+
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+ LIST_HEAD (zombies);
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ kib_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ unsigned long flags;
+ int rc = -ENOENT;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (nid != LNET_NID_ANY) {
+ lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+ } else {
+ lo = 0;
+ hi = kiblnd_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+ continue;
+
+ if (!list_empty(&peer->ibp_tx_queue)) {
+ LASSERT (list_empty(&peer->ibp_conns));
+
+ list_splice_init(&peer->ibp_tx_queue,
+ &zombies);
+ }
+
+ kiblnd_del_peer_locked(peer);
+ rc = 0; /* matched something */
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_txlist_done(ni, &zombies, -EIO);
+
+ return rc;
+}
+
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+ list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry(ctmp, kib_conn_t,
+ ibc_list);
+ kiblnd_conn_addref(conn);
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ return conn;
+ }
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ return NULL;
+}
+
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+ CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n",
+ rx, rx->rx_status, rx->rx_msg->ibm_type,
+ rx->rx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+ CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx "
+ "cookie "LPX64" msg %s%s type %x cred %d\n",
+ tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+ tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+ tx->tx_lntmsg[0] == NULL ? "-" : "!",
+ tx->tx_lntmsg[1] == NULL ? "-" : "!",
+ tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+ struct list_head *tmp;
+ int i;
+
+ spin_lock(&conn->ibc_lock);
+
+ CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+ atomic_read(&conn->ibc_refcount), conn,
+ conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+ conn->ibc_state, conn->ibc_noops_posted,
+ conn->ibc_nsends_posted, conn->ibc_credits,
+ conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+ CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error);
+
+ CDEBUG(D_CONSOLE, " early_rxs:\n");
+ list_for_each(tmp, &conn->ibc_early_rxs)
+ kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+ CDEBUG(D_CONSOLE, " tx_noops:\n");
+ list_for_each(tmp, &conn->ibc_tx_noops)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_nocred:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " tx_queue:\n");
+ list_for_each(tmp, &conn->ibc_tx_queue)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " active_txs:\n");
+ list_for_each(tmp, &conn->ibc_active_txs)
+ kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+ CDEBUG(D_CONSOLE, " rxs:\n");
+ for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
+ kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+ spin_unlock(&conn->ibc_lock);
+}
+
+int
+kiblnd_translate_mtu(int value)
+{
+ switch (value) {
+ default:
+ return -1;
+ case 0:
+ return 0;
+ case 256:
+ return IB_MTU_256;
+ case 512:
+ return IB_MTU_512;
+ case 1024:
+ return IB_MTU_1024;
+ case 2048:
+ return IB_MTU_2048;
+ case 4096:
+ return IB_MTU_4096;
+ }
+}
+
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+ int mtu;
+
+ /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+ if (cmid->route.path_rec == NULL)
+ return;
+
+ mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+ LASSERT (mtu >= 0);
+ if (mtu != 0)
+ cmid->route.path_rec->mtu = mtu;
+}
+
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+ cpumask_t *mask;
+ int vectors;
+ int off;
+ int i;
+
+ vectors = conn->ibc_cmid->device->num_comp_vectors;
+ if (vectors <= 1)
+ return 0;
+
+ mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+ /* hash NID to CPU id in this partition... */
+ off = conn->ibc_peer->ibp_nid % cpus_weight(*mask);
+ for_each_cpu_mask(i, *mask) {
+ if (off-- == 0)
+ return i % vectors;
+ }
+
+ LBUG();
+ return 1;
+}
+
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+ int state, int version)
+{
+ /* CAVEAT EMPTOR:
+ * If the new conn is created successfully it takes over the caller's
+ * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
+ * is destroyed. On failure, the caller's ref on 'peer' remains and
+ * she must dispose of 'cmid'. (Actually I'd block forever if I tried
+ * to destroy 'cmid' here since I'm called from the CM which still has
+ * its ref on 'cmid'). */
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_net_t *net = peer->ibp_ni->ni_data;
+ kib_dev_t *dev;
+ struct ib_qp_init_attr *init_qp_attr;
+ struct kib_sched_info *sched;
+ kib_conn_t *conn;
+ struct ib_cq *cq;
+ unsigned long flags;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(net != NULL);
+ LASSERT(!in_interrupt());
+
+ dev = net->ibn_dev;
+
+ cpt = lnet_cpt_of_nid(peer->ibp_nid);
+ sched = kiblnd_data.kib_scheds[cpt];
+
+ LASSERT(sched->ibs_nthreads > 0);
+
+ LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+ sizeof(*init_qp_attr));
+ if (init_qp_attr == NULL) {
+ CERROR("Can't allocate qp_attr for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ goto failed_0;
+ }
+
+ LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+ if (conn == NULL) {
+ CERROR("Can't allocate connection for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ goto failed_1;
+ }
+
+ conn->ibc_state = IBLND_CONN_INIT;
+ conn->ibc_version = version;
+ conn->ibc_peer = peer; /* I take the caller's ref */
+ cmid->context = conn; /* for future CM callbacks */
+ conn->ibc_cmid = cmid;
+
+ INIT_LIST_HEAD(&conn->ibc_early_rxs);
+ INIT_LIST_HEAD(&conn->ibc_tx_noops);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+ INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+ INIT_LIST_HEAD(&conn->ibc_active_txs);
+ spin_lock_init(&conn->ibc_lock);
+
+ LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+ sizeof(*conn->ibc_connvars));
+ if (conn->ibc_connvars == NULL) {
+ CERROR("Can't allocate in-progress connection state\n");
+ goto failed_2;
+ }
+
+ write_lock_irqsave(glock, flags);
+ if (dev->ibd_failover) {
+ write_unlock_irqrestore(glock, flags);
+ CERROR("%s: failover in progress\n", dev->ibd_ifname);
+ goto failed_2;
+ }
+
+ if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+ /* wakeup failover thread and teardown connection */
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ wake_up(&kiblnd_data.kib_failover_waitq);
+ }
+
+ write_unlock_irqrestore(glock, flags);
+ CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+ cmid->device->name, dev->ibd_ifname);
+ goto failed_2;
+ }
+
+ kiblnd_hdev_addref_locked(dev->ibd_hdev);
+ conn->ibc_hdev = dev->ibd_hdev;
+
+ kiblnd_setup_mtu_locked(cmid);
+
+ write_unlock_irqrestore(glock, flags);
+
+ LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+ IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+ if (conn->ibc_rxs == NULL) {
+ CERROR("Cannot allocate RX buffers\n");
+ goto failed_2;
+ }
+
+ rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+ IBLND_RX_MSG_PAGES(version));
+ if (rc != 0)
+ goto failed_2;
+
+ kiblnd_map_rx_descs(conn);
+
+ cq = ib_create_cq(cmid->device,
+ kiblnd_cq_completion, kiblnd_cq_event, conn,
+ IBLND_CQ_ENTRIES(version),
+ kiblnd_get_completion_vector(conn, cpt));
+ if (IS_ERR(cq)) {
+ CERROR("Can't create CQ: %ld, cqe: %d\n",
+ PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+ goto failed_2;
+ }
+
+ conn->ibc_cq = cq;
+
+ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (rc != 0) {
+ CERROR("Can't request completion notificiation: %d\n", rc);
+ goto failed_2;
+ }
+
+ init_qp_attr->event_handler = kiblnd_qp_event;
+ init_qp_attr->qp_context = conn;
+ init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+ init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+ init_qp_attr->cap.max_send_sge = 1;
+ init_qp_attr->cap.max_recv_sge = 1;
+ init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ init_qp_attr->qp_type = IB_QPT_RC;
+ init_qp_attr->send_cq = cq;
+ init_qp_attr->recv_cq = cq;
+
+ conn->ibc_sched = sched;
+
+ rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+ if (rc != 0) {
+ CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+ rc, init_qp_attr->cap.max_send_wr,
+ init_qp_attr->cap.max_recv_wr);
+ goto failed_2;
+ }
+
+ LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+ /* 1 ref for caller and each rxmsg */
+ atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+ conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+ /* post receives */
+ for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+ rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+ IBLND_POSTRX_NO_CREDIT);
+ if (rc != 0) {
+ CERROR("Can't post rxmsg: %d\n", rc);
+
+ /* Make posted receives complete */
+ kiblnd_abort_receives(conn);
+
+ /* correct # of posted buffers
+ * NB locking needed now I'm racing with completion */
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ /* cmid will be destroyed by CM(ofed) after cm_callback
+ * returned, so we can't refer it anymore
+ * (by kiblnd_connd()->kiblnd_destroy_conn) */
+ rdma_destroy_qp(conn->ibc_cmid);
+ conn->ibc_cmid = NULL;
+
+ /* Drop my own and unused rxbuffer refcounts */
+ while (i++ <= IBLND_RX_MSGS(version))
+ kiblnd_conn_decref(conn);
+
+ return NULL;
+ }
+ }
+
+ /* Init successful! */
+ LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+ state == IBLND_CONN_PASSIVE_WAIT);
+ conn->ibc_state = state;
+
+ /* 1 more conn */
+ atomic_inc(&net->ibn_nconns);
+ return conn;
+
+ failed_2:
+ kiblnd_destroy_conn(conn);
+ failed_1:
+ LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+ return NULL;
+}
+
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+ struct rdma_cm_id *cmid = conn->ibc_cmid;
+ kib_peer_t *peer = conn->ibc_peer;
+ int rc;
+
+ LASSERT (!in_interrupt());
+ LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+ LASSERT (list_empty(&conn->ibc_early_rxs));
+ LASSERT (list_empty(&conn->ibc_tx_noops));
+ LASSERT (list_empty(&conn->ibc_tx_queue));
+ LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+ LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+ LASSERT (list_empty(&conn->ibc_active_txs));
+ LASSERT (conn->ibc_noops_posted == 0);
+ LASSERT (conn->ibc_nsends_posted == 0);
+
+ switch (conn->ibc_state) {
+ default:
+ /* conn must be completely disengaged from the network */
+ LBUG();
+
+ case IBLND_CONN_DISCONNECTED:
+ /* connvars should have been freed already */
+ LASSERT (conn->ibc_connvars == NULL);
+ break;
+
+ case IBLND_CONN_INIT:
+ break;
+ }
+
+ /* conn->ibc_cmid might be destroyed by CM already */
+ if (cmid != NULL && cmid->qp != NULL)
+ rdma_destroy_qp(cmid);
+
+ if (conn->ibc_cq != NULL) {
+ rc = ib_destroy_cq(conn->ibc_cq);
+ if (rc != 0)
+ CWARN("Error destroying CQ: %d\n", rc);
+ }
+
+ if (conn->ibc_rx_pages != NULL)
+ kiblnd_unmap_rx_descs(conn);
+
+ if (conn->ibc_rxs != NULL) {
+ LIBCFS_FREE(conn->ibc_rxs,
+ IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+ }
+
+ if (conn->ibc_connvars != NULL)
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+ if (conn->ibc_hdev != NULL)
+ kiblnd_hdev_decref(conn->ibc_hdev);
+
+ /* See CAVEAT EMPTOR above in kiblnd_create_conn */
+ if (conn->ibc_state != IBLND_CONN_INIT) {
+ kib_net_t *net = peer->ibp_ni->ni_data;
+
+ kiblnd_peer_decref(peer);
+ rdma_destroy_id(cmid);
+ atomic_dec(&net->ibn_nconns);
+ }
+
+ LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ CDEBUG(D_NET, "Closing conn -> %s, "
+ "version: %x, reason: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_version, why);
+
+ kiblnd_close_conn_locked(conn, why);
+ count++;
+ }
+
+ return count;
+}
+
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+ int version, __u64 incarnation)
+{
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ if (conn->ibc_version == version &&
+ conn->ibc_incarnation == incarnation)
+ continue;
+
+ CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+ "incarnation:"LPX64"(%x, "LPX64")\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_version, conn->ibc_incarnation,
+ version, incarnation);
+
+ kiblnd_close_conn_locked(conn, -ESTALE);
+ count++;
+ }
+
+ return count;
+}
+
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ unsigned long flags;
+ int count = 0;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (nid != LNET_NID_ANY)
+ lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+ else {
+ lo = 0;
+ hi = kiblnd_data.kib_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+
+ peer = list_entry(ptmp, kib_peer_t, ibp_list);
+ LASSERT (peer->ibp_connecting > 0 ||
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns));
+
+ if (peer->ibp_ni != ni)
+ continue;
+
+ if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+ continue;
+
+ count += kiblnd_close_peer_conns_locked(peer, 0);
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* wildcards always succeed */
+ if (nid == LNET_NID_ANY)
+ return 0;
+
+ return (count == 0) ? -ENOENT : 0;
+}
+
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ int rc = -EINVAL;
+
+ switch(cmd) {
+ case IOC_LIBCFS_GET_PEER: {
+ lnet_nid_t nid = 0;
+ int count = 0;
+
+ rc = kiblnd_get_peer_info(ni, data->ioc_count,
+ &nid, &count);
+ data->ioc_nid = nid;
+ data->ioc_count = count;
+ break;
+ }
+
+ case IOC_LIBCFS_DEL_PEER: {
+ rc = kiblnd_del_peer(ni, data->ioc_nid);
+ break;
+ }
+ case IOC_LIBCFS_GET_CONN: {
+ kib_conn_t *conn;
+
+ rc = 0;
+ conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+ if (conn == NULL) {
+ rc = -ENOENT;
+ break;
+ }
+
+ LASSERT (conn->ibc_cmid != NULL);
+ data->ioc_nid = conn->ibc_peer->ibp_nid;
+ if (conn->ibc_cmid->route.path_rec == NULL)
+ data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+ else
+ data->ioc_u32[0] =
+ ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+ kiblnd_conn_decref(conn);
+ break;
+ }
+ case IOC_LIBCFS_CLOSE_CONNECTION: {
+ rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return rc;
+}
+
+void
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+ cfs_time_t last_alive = 0;
+ cfs_time_t now = cfs_time_current();
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_peer_t *peer;
+ unsigned long flags;
+
+ read_lock_irqsave(glock, flags);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL) {
+ LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+ peer->ibp_accepting > 0 ||
+ !list_empty(&peer->ibp_conns)); /* active conn */
+ last_alive = peer->ibp_last_alive;
+ }
+
+ read_unlock_irqrestore(glock, flags);
+
+ if (last_alive != 0)
+ *when = last_alive;
+
+ /* peer is not persistent in hash, trigger peer creation
+ * and connection establishment with a NULL tx */
+ if (peer == NULL)
+ kiblnd_launch_tx(ni, NULL, nid);
+
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1);
+ return;
+}
+
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+ int npages = p->ibp_npages;
+ int i;
+
+ for (i = 0; i < npages; i++) {
+ if (p->ibp_pages[i] != NULL)
+ __free_page(p->ibp_pages[i]);
+ }
+
+ LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+ kib_pages_t *p;
+ int i;
+
+ LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+ offsetof(kib_pages_t, ibp_pages[npages]));
+ if (p == NULL) {
+ CERROR("Can't allocate descriptor for %d pages\n", npages);
+ return -ENOMEM;
+ }
+
+ memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+ p->ibp_npages = npages;
+
+ for (i = 0; i < npages; i++) {
+ p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+ __GFP_IO);
+ if (p->ibp_pages[i] == NULL) {
+ CERROR("Can't allocate page %d of %d\n", i, npages);
+ kiblnd_free_pages(p);
+ return -ENOMEM;
+ }
+ }
+
+ *pp = p;
+ return 0;
+}
+
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+ kib_rx_t *rx;
+ int i;
+
+ LASSERT (conn->ibc_rxs != NULL);
+ LASSERT (conn->ibc_hdev != NULL);
+
+ for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+ rx = &conn->ibc_rxs[i];
+
+ LASSERT (rx->rx_nob >= 0); /* not posted */
+
+ kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+ KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+ rx->rx_msgaddr),
+ IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+ }
+
+ kiblnd_free_pages(conn->ibc_rx_pages);
+
+ conn->ibc_rx_pages = NULL;
+}
+
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+ kib_rx_t *rx;
+ struct page *pg;
+ int pg_off;
+ int ipg;
+ int i;
+
+ for (pg_off = ipg = i = 0;
+ i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+ pg = conn->ibc_rx_pages->ibp_pages[ipg];
+ rx = &conn->ibc_rxs[i];
+
+ rx->rx_conn = conn;
+ rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+ rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+ rx->rx_msg, IBLND_MSG_SIZE,
+ DMA_FROM_DEVICE);
+ LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+ rx->rx_msgaddr));
+ KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+ CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
+ i, rx->rx_msg, rx->rx_msgaddr,
+ lnet_page2phys(pg) + pg_off);
+
+ pg_off += IBLND_MSG_SIZE;
+ LASSERT (pg_off <= PAGE_SIZE);
+
+ if (pg_off == PAGE_SIZE) {
+ pg_off = 0;
+ ipg++;
+ LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+ }
+ }
+}
+
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+ kib_hca_dev_t *hdev = tpo->tpo_hdev;
+ kib_tx_t *tx;
+ int i;
+
+ LASSERT (tpo->tpo_pool.po_allocated == 0);
+
+ if (hdev == NULL)
+ return;
+
+ for (i = 0; i < tpo->tpo_pool.po_size; i++) {
+ tx = &tpo->tpo_tx_descs[i];
+ kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+ KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+ tx->tx_msgaddr),
+ IBLND_MSG_SIZE, DMA_TO_DEVICE);
+ }
+
+ kiblnd_hdev_decref(hdev);
+ tpo->tpo_hdev = NULL;
+}
+
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+ kib_hca_dev_t *hdev;
+ unsigned long flags;
+ int i = 0;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ while (dev->ibd_failover) {
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ if (i++ % 50 == 0)
+ CDEBUG(D_NET, "%s: Wait for failover\n",
+ dev->ibd_ifname);
+ schedule_timeout(cfs_time_seconds(1) / 100);
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ }
+
+ kiblnd_hdev_addref_locked(dev->ibd_hdev);
+ hdev = dev->ibd_hdev;
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ return hdev;
+}
+
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+ kib_pages_t *txpgs = tpo->tpo_tx_pages;
+ kib_pool_t *pool = &tpo->tpo_pool;
+ kib_net_t *net = pool->po_owner->ps_net;
+ kib_dev_t *dev;
+ struct page *page;
+ kib_tx_t *tx;
+ int page_offset;
+ int ipage;
+ int i;
+
+ LASSERT (net != NULL);
+
+ dev = net->ibn_dev;
+
+ /* pre-mapped messages are not bigger than 1 page */
+ CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+ /* No fancy arithmetic when we do the buffer calculations */
+ CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+ tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+ for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+ page = txpgs->ibp_pages[ipage];
+ tx = &tpo->tpo_tx_descs[i];
+
+ tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+ page_offset);
+
+ tx->tx_msgaddr = kiblnd_dma_map_single(
+ tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+ IBLND_MSG_SIZE, DMA_TO_DEVICE);
+ LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+ tx->tx_msgaddr));
+ KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+ list_add(&tx->tx_list, &pool->po_free_list);
+
+ page_offset += IBLND_MSG_SIZE;
+ LASSERT (page_offset <= PAGE_SIZE);
+
+ if (page_offset == PAGE_SIZE) {
+ page_offset = 0;
+ ipage++;
+ LASSERT (ipage <= txpgs->ibp_npages);
+ }
+ }
+}
+
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+ __u64 index;
+
+ LASSERT (hdev->ibh_mrs[0] != NULL);
+
+ if (hdev->ibh_nmrs == 1)
+ return hdev->ibh_mrs[0];
+
+ index = addr >> hdev->ibh_mr_shift;
+
+ if (index < hdev->ibh_nmrs &&
+ index == ((addr + size - 1) >> hdev->ibh_mr_shift))
+ return hdev->ibh_mrs[index];
+
+ return NULL;
+}
+
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+ struct ib_mr *prev_mr;
+ struct ib_mr *mr;
+ int i;
+
+ LASSERT (hdev->ibh_mrs[0] != NULL);
+
+ if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+ *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+ return NULL;
+
+ if (hdev->ibh_nmrs == 1)
+ return hdev->ibh_mrs[0];
+
+ for (i = 0, mr = prev_mr = NULL;
+ i < rd->rd_nfrags; i++) {
+ mr = kiblnd_find_dma_mr(hdev,
+ rd->rd_frags[i].rf_addr,
+ rd->rd_frags[i].rf_nob);
+ if (prev_mr == NULL)
+ prev_mr = mr;
+
+ if (mr == NULL || prev_mr != mr) {
+ /* Can't covered by one single MR */
+ mr = NULL;
+ break;
+ }
+ }
+
+ return mr;
+}
+
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+ LASSERT (pool->fpo_map_count == 0);
+
+ if (pool->fpo_fmr_pool != NULL)
+ ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+ if (pool->fpo_hdev != NULL)
+ kiblnd_hdev_decref(pool->fpo_hdev);
+
+ LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+ kib_fmr_pool_t *pool;
+
+ while (!list_empty(head)) {
+ pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+ list_del(&pool->fpo_list);
+ kiblnd_destroy_fmr_pool(pool);
+ }
+}
+
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+ return max(IBLND_FMR_POOL, size);
+}
+
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+ return max(IBLND_FMR_POOL_FLUSH, size);
+}
+
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+ /* FMR pool for RDMA */
+ kib_dev_t *dev = fps->fps_net->ibn_dev;
+ kib_fmr_pool_t *fpo;
+ struct ib_fmr_pool_param param = {
+ .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+ .page_shift = PAGE_SHIFT,
+ .access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE),
+ .pool_size = fps->fps_pool_size,
+ .dirty_watermark = fps->fps_flush_trigger,
+ .flush_function = NULL,
+ .flush_arg = NULL,
+ .cache = !!*kiblnd_tunables.kib_fmr_cache};
+ int rc;
+
+ LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+ if (fpo == NULL)
+ return -ENOMEM;
+
+ fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+ fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+ if (IS_ERR(fpo->fpo_fmr_pool)) {
+ rc = PTR_ERR(fpo->fpo_fmr_pool);
+ CERROR("Failed to create FMR pool: %d\n", rc);
+
+ kiblnd_hdev_decref(fpo->fpo_hdev);
+ LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+ return rc;
+ }
+
+ fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ fpo->fpo_owner = fps;
+ *pp_fpo = fpo;
+
+ return 0;
+}
+
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+ if (fps->fps_net == NULL) /* intialized? */
+ return;
+
+ spin_lock(&fps->fps_lock);
+
+ while (!list_empty(&fps->fps_pool_list)) {
+ kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
+ kib_fmr_pool_t, fpo_list);
+ fpo->fpo_failed = 1;
+ list_del(&fpo->fpo_list);
+ if (fpo->fpo_map_count == 0)
+ list_add(&fpo->fpo_list, zombies);
+ else
+ list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+ }
+
+ spin_unlock(&fps->fps_lock);
+}
+
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+ if (fps->fps_net != NULL) { /* initialized? */
+ kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+ kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+ }
+}
+
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+ int pool_size, int flush_trigger)
+{
+ kib_fmr_pool_t *fpo;
+ int rc;
+
+ memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+ fps->fps_net = net;
+ fps->fps_cpt = cpt;
+ fps->fps_pool_size = pool_size;
+ fps->fps_flush_trigger = flush_trigger;
+ spin_lock_init(&fps->fps_lock);
+ INIT_LIST_HEAD(&fps->fps_pool_list);
+ INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+ rc = kiblnd_create_fmr_pool(fps, &fpo);
+ if (rc == 0)
+ list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+
+ return rc;
+}
+
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+ if (fpo->fpo_map_count != 0) /* still in use */
+ return 0;
+ if (fpo->fpo_failed)
+ return 1;
+ return cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+ LIST_HEAD (zombies);
+ kib_fmr_pool_t *fpo = fmr->fmr_pool;
+ kib_fmr_poolset_t *fps = fpo->fpo_owner;
+ cfs_time_t now = cfs_time_current();
+ kib_fmr_pool_t *tmp;
+ int rc;
+
+ rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+ LASSERT (rc == 0);
+
+ if (status != 0) {
+ rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+ LASSERT (rc == 0);
+ }
+
+ fmr->fmr_pool = NULL;
+ fmr->fmr_pfmr = NULL;
+
+ spin_lock(&fps->fps_lock);
+ fpo->fpo_map_count --; /* decref the pool */
+
+ list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+ /* the first pool is persistent */
+ if (fps->fps_pool_list.next == &fpo->fpo_list)
+ continue;
+
+ if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+ list_move(&fpo->fpo_list, &zombies);
+ fps->fps_version ++;
+ }
+ }
+ spin_unlock(&fps->fps_lock);
+
+ if (!list_empty(&zombies))
+ kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+ __u64 iov, kib_fmr_t *fmr)
+{
+ struct ib_pool_fmr *pfmr;
+ kib_fmr_pool_t *fpo;
+ __u64 version;
+ int rc;
+
+ again:
+ spin_lock(&fps->fps_lock);
+ version = fps->fps_version;
+ list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+ fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ fpo->fpo_map_count++;
+ spin_unlock(&fps->fps_lock);
+
+ pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+ pages, npages, iov);
+ if (likely(!IS_ERR(pfmr))) {
+ fmr->fmr_pool = fpo;
+ fmr->fmr_pfmr = pfmr;
+ return 0;
+ }
+
+ spin_lock(&fps->fps_lock);
+ fpo->fpo_map_count--;
+ if (PTR_ERR(pfmr) != -EAGAIN) {
+ spin_unlock(&fps->fps_lock);
+ return PTR_ERR(pfmr);
+ }
+
+ /* EAGAIN and ... */
+ if (version != fps->fps_version) {
+ spin_unlock(&fps->fps_lock);
+ goto again;
+ }
+ }
+
+ if (fps->fps_increasing) {
+ spin_unlock(&fps->fps_lock);
+ CDEBUG(D_NET, "Another thread is allocating new "
+ "FMR pool, waiting for her to complete\n");
+ schedule();
+ goto again;
+
+ }
+
+ if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+ /* someone failed recently */
+ spin_unlock(&fps->fps_lock);
+ return -EAGAIN;
+ }
+
+ fps->fps_increasing = 1;
+ spin_unlock(&fps->fps_lock);
+
+ CDEBUG(D_NET, "Allocate new FMR pool\n");
+ rc = kiblnd_create_fmr_pool(fps, &fpo);
+ spin_lock(&fps->fps_lock);
+ fps->fps_increasing = 0;
+ if (rc == 0) {
+ fps->fps_version++;
+ list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+ } else {
+ fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+ }
+ spin_unlock(&fps->fps_lock);
+
+ goto again;
+}
+
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+ LASSERT (list_empty(&pool->po_free_list));
+ LASSERT (pool->po_allocated == 0);
+
+ CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+ CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+ memset(pool, 0, sizeof(kib_pool_t));
+ INIT_LIST_HEAD(&pool->po_free_list);
+ pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ pool->po_owner = ps;
+ pool->po_size = size;
+}
+
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+ kib_pool_t *pool;
+
+ while (!list_empty(head)) {
+ pool = list_entry(head->next, kib_pool_t, po_list);
+ list_del(&pool->po_list);
+
+ LASSERT (pool->po_owner != NULL);
+ pool->po_owner->ps_pool_destroy(pool);
+ }
+}
+
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+ if (ps->ps_net == NULL) /* intialized? */
+ return;
+
+ spin_lock(&ps->ps_lock);
+ while (!list_empty(&ps->ps_pool_list)) {
+ kib_pool_t *po = list_entry(ps->ps_pool_list.next,
+ kib_pool_t, po_list);
+ po->po_failed = 1;
+ list_del(&po->po_list);
+ if (po->po_allocated == 0)
+ list_add(&po->po_list, zombies);
+ else
+ list_add(&po->po_list, &ps->ps_failed_pool_list);
+ }
+ spin_unlock(&ps->ps_lock);
+}
+
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+ if (ps->ps_net != NULL) { /* initialized? */
+ kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+ kiblnd_destroy_pool_list(&ps->ps_pool_list);
+ }
+}
+
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+ kib_net_t *net, char *name, int size,
+ kib_ps_pool_create_t po_create,
+ kib_ps_pool_destroy_t po_destroy,
+ kib_ps_node_init_t nd_init,
+ kib_ps_node_fini_t nd_fini)
+{
+ kib_pool_t *pool;
+ int rc;
+
+ memset(ps, 0, sizeof(kib_poolset_t));
+
+ ps->ps_cpt = cpt;
+ ps->ps_net = net;
+ ps->ps_pool_create = po_create;
+ ps->ps_pool_destroy = po_destroy;
+ ps->ps_node_init = nd_init;
+ ps->ps_node_fini = nd_fini;
+ ps->ps_pool_size = size;
+ if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+ >= sizeof(ps->ps_name))
+ return -E2BIG;
+ spin_lock_init(&ps->ps_lock);
+ INIT_LIST_HEAD(&ps->ps_pool_list);
+ INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+ rc = ps->ps_pool_create(ps, size, &pool);
+ if (rc == 0)
+ list_add(&pool->po_list, &ps->ps_pool_list);
+ else
+ CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+ return rc;
+}
+
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+ if (pool->po_allocated != 0) /* still in use */
+ return 0;
+ if (pool->po_failed)
+ return 1;
+ return cfs_time_aftereq(now, pool->po_deadline);
+}
+
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+ LIST_HEAD (zombies);
+ kib_poolset_t *ps = pool->po_owner;
+ kib_pool_t *tmp;
+ cfs_time_t now = cfs_time_current();
+
+ spin_lock(&ps->ps_lock);
+
+ if (ps->ps_node_fini != NULL)
+ ps->ps_node_fini(pool, node);
+
+ LASSERT (pool->po_allocated > 0);
+ list_add(node, &pool->po_free_list);
+ pool->po_allocated --;
+
+ list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+ /* the first pool is persistent */
+ if (ps->ps_pool_list.next == &pool->po_list)
+ continue;
+
+ if (kiblnd_pool_is_idle(pool, now))
+ list_move(&pool->po_list, &zombies);
+ }
+ spin_unlock(&ps->ps_lock);
+
+ if (!list_empty(&zombies))
+ kiblnd_destroy_pool_list(&zombies);
+}
+
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+ struct list_head *node;
+ kib_pool_t *pool;
+ int rc;
+
+ again:
+ spin_lock(&ps->ps_lock);
+ list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+ if (list_empty(&pool->po_free_list))
+ continue;
+
+ pool->po_allocated ++;
+ pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+ node = pool->po_free_list.next;
+ list_del(node);
+
+ if (ps->ps_node_init != NULL) {
+ /* still hold the lock */
+ ps->ps_node_init(pool, node);
+ }
+ spin_unlock(&ps->ps_lock);
+ return node;
+ }
+
+ /* no available tx pool and ... */
+ if (ps->ps_increasing) {
+ /* another thread is allocating a new pool */
+ spin_unlock(&ps->ps_lock);
+ CDEBUG(D_NET, "Another thread is allocating new "
+ "%s pool, waiting for her to complete\n",
+ ps->ps_name);
+ schedule();
+ goto again;
+ }
+
+ if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+ /* someone failed recently */
+ spin_unlock(&ps->ps_lock);
+ return NULL;
+ }
+
+ ps->ps_increasing = 1;
+ spin_unlock(&ps->ps_lock);
+
+ CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+ rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+ spin_lock(&ps->ps_lock);
+ ps->ps_increasing = 0;
+ if (rc == 0) {
+ list_add_tail(&pool->po_list, &ps->ps_pool_list);
+ } else {
+ ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+ CERROR("Can't allocate new %s pool because out of memory\n",
+ ps->ps_name);
+ }
+ spin_unlock(&ps->ps_lock);
+
+ goto again;
+}
+
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+ kib_pmr_pool_t *ppo = pmr->pmr_pool;
+ struct ib_mr *mr = pmr->pmr_mr;
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+ if (mr != NULL)
+ ib_dereg_mr(mr);
+}
+
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+ kib_phys_mr_t *pmr;
+ struct list_head *node;
+ int rc;
+ int i;
+
+ node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+ if (node == NULL) {
+ CERROR("Failed to allocate PMR descriptor\n");
+ return -ENOMEM;
+ }
+
+ pmr = container_of(node, kib_phys_mr_t, pmr_list);
+ if (pmr->pmr_pool->ppo_hdev != hdev) {
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+ return -EAGAIN;
+ }
+
+ for (i = 0; i < rd->rd_nfrags; i ++) {
+ pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+ pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+ }
+
+ pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+ pmr->pmr_ipb, rd->rd_nfrags,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE,
+ iova);
+ if (!IS_ERR(pmr->pmr_mr)) {
+ pmr->pmr_iova = *iova;
+ *pp_pmr = pmr;
+ return 0;
+ }
+
+ rc = PTR_ERR(pmr->pmr_mr);
+ CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+ pmr->pmr_mr = NULL;
+ kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+ return rc;
+}
+
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+ kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+ kib_phys_mr_t *pmr;
+
+ LASSERT (pool->po_allocated == 0);
+
+ while (!list_empty(&pool->po_free_list)) {
+ pmr = list_entry(pool->po_free_list.next,
+ kib_phys_mr_t, pmr_list);
+
+ LASSERT (pmr->pmr_mr == NULL);
+ list_del(&pmr->pmr_list);
+
+ if (pmr->pmr_ipb != NULL) {
+ LIBCFS_FREE(pmr->pmr_ipb,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(struct ib_phys_buf));
+ }
+
+ LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+ }
+
+ kiblnd_fini_pool(pool);
+ if (ppo->ppo_hdev != NULL)
+ kiblnd_hdev_decref(ppo->ppo_hdev);
+
+ LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+ int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+ return max(IBLND_PMR_POOL, size);
+}
+
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+ struct kib_pmr_pool *ppo;
+ struct kib_pool *pool;
+ kib_phys_mr_t *pmr;
+ int i;
+
+ LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+ ps->ps_cpt, sizeof(kib_pmr_pool_t));
+ if (ppo == NULL) {
+ CERROR("Failed to allocate PMR pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &ppo->ppo_pool;
+ kiblnd_init_pool(ps, pool, size);
+
+ for (i = 0; i < size; i++) {
+ LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+ ps->ps_cpt, sizeof(kib_phys_mr_t));
+ if (pmr == NULL)
+ break;
+
+ pmr->pmr_pool = ppo;
+ LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+ IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+ if (pmr->pmr_ipb == NULL)
+ break;
+
+ list_add(&pmr->pmr_list, &pool->po_free_list);
+ }
+
+ if (i < size) {
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+ *pp_po = pool;
+ return 0;
+}
+
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+ kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+ int i;
+
+ LASSERT (pool->po_allocated == 0);
+
+ if (tpo->tpo_tx_pages != NULL) {
+ kiblnd_unmap_tx_pool(tpo);
+ kiblnd_free_pages(tpo->tpo_tx_pages);
+ }
+
+ if (tpo->tpo_tx_descs == NULL)
+ goto out;
+
+ for (i = 0; i < pool->po_size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ list_del(&tx->tx_list);
+ if (tx->tx_pages != NULL)
+ LIBCFS_FREE(tx->tx_pages,
+ LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_frags != NULL)
+ LIBCFS_FREE(tx->tx_frags,
+ IBLND_MAX_RDMA_FRAGS *
+ sizeof(*tx->tx_frags));
+ if (tx->tx_wrq != NULL)
+ LIBCFS_FREE(tx->tx_wrq,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_sge != NULL)
+ LIBCFS_FREE(tx->tx_sge,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_rd != NULL)
+ LIBCFS_FREE(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ }
+
+ LIBCFS_FREE(tpo->tpo_tx_descs,
+ pool->po_size * sizeof(kib_tx_t));
+out:
+ kiblnd_fini_pool(pool);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+static int kiblnd_tx_pool_size(int ncpts)
+{
+ int ntx = *kiblnd_tunables.kib_ntx / ncpts;
+
+ return max(IBLND_TX_POOL, ntx);
+}
+
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+ int i;
+ int npg;
+ kib_pool_t *pool;
+ kib_tx_pool_t *tpo;
+
+ LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+ if (tpo == NULL) {
+ CERROR("Failed to allocate TX pool\n");
+ return -ENOMEM;
+ }
+
+ pool = &tpo->tpo_pool;
+ kiblnd_init_pool(ps, pool, size);
+ tpo->tpo_tx_descs = NULL;
+ tpo->tpo_tx_pages = NULL;
+
+ npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+ if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+ CERROR("Can't allocate tx pages: %d\n", npg);
+ LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+ return -ENOMEM;
+ }
+
+ LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+ size * sizeof(kib_tx_t));
+ if (tpo->tpo_tx_descs == NULL) {
+ CERROR("Can't allocate %d tx descriptors\n", size);
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+ }
+
+ memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+ for (i = 0; i < size; i++) {
+ kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+ tx->tx_pool = tpo;
+ if (ps->ps_net->ibn_fmr_ps != NULL) {
+ LIBCFS_CPT_ALLOC(tx->tx_pages,
+ lnet_cpt_table(), ps->ps_cpt,
+ LNET_MAX_IOV * sizeof(*tx->tx_pages));
+ if (tx->tx_pages == NULL)
+ break;
+ }
+
+ LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+ IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+ if (tx->tx_frags == NULL)
+ break;
+
+ sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+ LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_wrq == NULL)
+ break;
+
+ LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+ (1 + IBLND_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_sge));
+ if (tx->tx_sge == NULL)
+ break;
+
+ LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBLND_MAX_RDMA_FRAGS]));
+ if (tx->tx_rd == NULL)
+ break;
+ }
+
+ if (i == size) {
+ kiblnd_map_tx_pool(tpo);
+ *pp_po = pool;
+ return 0;
+ }
+
+ ps->ps_pool_destroy(pool);
+ return -ENOMEM;
+}
+
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+ kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+ tps_poolset);
+ kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list);
+
+ tx->tx_cookie = tps->tps_next_tx_cookie ++;
+}
+
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+ int i;
+
+ cfs_cpt_for_each(i, lnet_cpt_table()) {
+ kib_tx_poolset_t *tps;
+ kib_fmr_poolset_t *fps;
+ kib_pmr_poolset_t *pps;
+
+ if (net->ibn_tx_ps != NULL) {
+ tps = net->ibn_tx_ps[i];
+ kiblnd_fini_poolset(&tps->tps_poolset);
+ }
+
+ if (net->ibn_fmr_ps != NULL) {
+ fps = net->ibn_fmr_ps[i];
+ kiblnd_fini_fmr_poolset(fps);
+ }
+
+ if (net->ibn_pmr_ps != NULL) {
+ pps = net->ibn_pmr_ps[i];
+ kiblnd_fini_poolset(&pps->pps_poolset);
+ }
+ }
+
+ if (net->ibn_tx_ps != NULL) {
+ cfs_percpt_free(net->ibn_tx_ps);
+ net->ibn_tx_ps = NULL;
+ }
+
+ if (net->ibn_fmr_ps != NULL) {
+ cfs_percpt_free(net->ibn_fmr_ps);
+ net->ibn_fmr_ps = NULL;
+ }
+
+ if (net->ibn_pmr_ps != NULL) {
+ cfs_percpt_free(net->ibn_pmr_ps);
+ net->ibn_pmr_ps = NULL;
+ }
+}
+
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+ unsigned long flags;
+ int cpt;
+ int rc;
+ int i;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+ net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ goto create_tx_pool;
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (*kiblnd_tunables.kib_fmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_fmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ /* TX pool must be created later than FMR/PMR, see LU-2268
+ * for details */
+ LASSERT(net->ibn_tx_ps == NULL);
+
+ /* premapping can fail if ibd_nmr > 1, so we always create
+ * FMR/PMR pool and map-on-demand if premapping failed */
+
+ net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_fmr_poolset_t));
+ if (net->ibn_fmr_ps == NULL) {
+ CERROR("Failed to allocate FMR pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+ kiblnd_fmr_pool_size(ncpts),
+ kiblnd_fmr_flush_trigger(ncpts));
+ if (rc == -ENOSYS && i == 0) /* no FMR */
+ break; /* create PMR pool */
+
+ if (rc != 0) { /* a real error */
+ CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ if (i > 0) {
+ LASSERT(i == ncpts);
+ goto create_tx_pool;
+ }
+
+ cfs_percpt_free(net->ibn_fmr_ps);
+ net->ibn_fmr_ps = NULL;
+
+ CWARN("Device does not support FMR, failing back to PMR\n");
+
+ if (*kiblnd_tunables.kib_pmr_pool_size <
+ *kiblnd_tunables.kib_ntx / 4) {
+ CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+ *kiblnd_tunables.kib_pmr_pool_size,
+ *kiblnd_tunables.kib_ntx / 4);
+ rc = -EINVAL;
+ goto failed;
+ }
+
+ net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_pmr_poolset_t));
+ if (net->ibn_pmr_ps == NULL) {
+ CERROR("Failed to allocate PMR pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+ cpt, net, "PMR",
+ kiblnd_pmr_pool_size(ncpts),
+ kiblnd_create_pmr_pool,
+ kiblnd_destroy_pmr_pool, NULL, NULL);
+ if (rc != 0) {
+ CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ create_tx_pool:
+ net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(kib_tx_poolset_t));
+ if (net->ibn_tx_ps == NULL) {
+ CERROR("Failed to allocate tx pool array\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ for (i = 0; i < ncpts; i++) {
+ cpt = (cpts == NULL) ? i : cpts[i];
+ rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+ cpt, net, "TX",
+ kiblnd_tx_pool_size(ncpts),
+ kiblnd_create_tx_pool,
+ kiblnd_destroy_tx_pool,
+ kiblnd_tx_init, NULL);
+ if (rc != 0) {
+ CERROR("Can't initialize TX pool for CPT %d: %d\n",
+ cpt, rc);
+ goto failed;
+ }
+ }
+
+ return 0;
+ failed:
+ kiblnd_net_fini_pools(net);
+ LASSERT(rc != 0);
+ return rc;
+}
+
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+ struct ib_device_attr *attr;
+ int rc;
+
+ /* It's safe to assume a HCA can handle a page size
+ * matching that of the native system */
+ hdev->ibh_page_shift = PAGE_SHIFT;
+ hdev->ibh_page_size = 1 << PAGE_SHIFT;
+ hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
+
+ LIBCFS_ALLOC(attr, sizeof(*attr));
+ if (attr == NULL) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ rc = ib_query_device(hdev->ibh_ibdev, attr);
+ if (rc == 0)
+ hdev->ibh_mr_size = attr->max_mr_size;
+
+ LIBCFS_FREE(attr, sizeof(*attr));
+
+ if (rc != 0) {
+ CERROR("Failed to query IB device: %d\n", rc);
+ return rc;
+ }
+
+ if (hdev->ibh_mr_size == ~0ULL) {
+ hdev->ibh_mr_shift = 64;
+ return 0;
+ }
+
+ for (hdev->ibh_mr_shift = 0;
+ hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift ++) {
+ if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+ hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+ return 0;
+ }
+
+ CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+ return -EINVAL;
+}
+
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+ int i;
+
+ if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+ return;
+
+ for (i = 0; i < hdev->ibh_nmrs; i++) {
+ if (hdev->ibh_mrs[i] == NULL)
+ break;
+
+ ib_dereg_mr(hdev->ibh_mrs[i]);
+ }
+
+ LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+ hdev->ibh_mrs = NULL;
+ hdev->ibh_nmrs = 0;
+}
+
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+ kiblnd_hdev_cleanup_mrs(hdev);
+
+ if (hdev->ibh_pd != NULL)
+ ib_dealloc_pd(hdev->ibh_pd);
+
+ if (hdev->ibh_cmid != NULL)
+ rdma_destroy_id(hdev->ibh_cmid);
+
+ LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+ struct ib_mr *mr;
+ int i;
+ int rc;
+ __u64 mm_size;
+ __u64 mr_size;
+ int acflags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE;
+
+ rc = kiblnd_hdev_get_attr(hdev);
+ if (rc != 0)
+ return rc;
+
+ if (hdev->ibh_mr_shift == 64) {
+ LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+ if (hdev->ibh_mrs == NULL) {
+ CERROR("Failed to allocate MRs table\n");
+ return -ENOMEM;
+ }
+
+ hdev->ibh_mrs[0] = NULL;
+ hdev->ibh_nmrs = 1;
+
+ mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+ if (IS_ERR(mr)) {
+ CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+ kiblnd_hdev_cleanup_mrs(hdev);
+ return PTR_ERR(mr);
+ }
+
+ hdev->ibh_mrs[0] = mr;
+
+ goto out;
+ }
+
+ mr_size = (1ULL << hdev->ibh_mr_shift);
+ mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+ hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+ if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+ /* it's 4T..., assume we will re-code at that time */
+ CERROR("Can't support memory size: x"LPX64
+ " with MR size: x"LPX64"\n", mm_size, mr_size);
+ return -EINVAL;
+ }
+
+ /* create an array of MRs to cover all memory */
+ LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+ if (hdev->ibh_mrs == NULL) {
+ CERROR("Failed to allocate MRs' table\n");
+ return -ENOMEM;
+ }
+
+ memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+ for (i = 0; i < hdev->ibh_nmrs; i++) {
+ struct ib_phys_buf ipb;
+ __u64 iova;
+
+ ipb.size = hdev->ibh_mr_size;
+ ipb.addr = i * mr_size;
+ iova = ipb.addr;
+
+ mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+ if (IS_ERR(mr)) {
+ CERROR("Failed ib_reg_phys_mr addr "LPX64
+ " size "LPX64" : %ld\n",
+ ipb.addr, ipb.size, PTR_ERR(mr));
+ kiblnd_hdev_cleanup_mrs(hdev);
+ return PTR_ERR(mr);
+ }
+
+ LASSERT (iova == ipb.addr);
+
+ hdev->ibh_mrs[i] = mr;
+ }
+
+out:
+ if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+ LCONSOLE_INFO("Register global MR array, MR size: "
+ LPX64", array size: %d\n",
+ hdev->ibh_mr_size, hdev->ibh_nmrs);
+ return 0;
+}
+
+static int
+kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{ /* DUMMY */
+ return 0;
+}
+
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+ struct rdma_cm_id *cmid;
+ struct sockaddr_in srcaddr;
+ struct sockaddr_in dstaddr;
+ int rc;
+
+ if (dev->ibd_hdev == NULL || /* initializing */
+ dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+ *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+ return 1;
+
+ /* XXX: it's UGLY, but I don't have better way to find
+ * ib-bonding HCA failover because:
+ *
+ * a. no reliable CM event for HCA failover...
+ * b. no OFED API to get ib_device for current net_device...
+ *
+ * We have only two choices at this point:
+ *
+ * a. rdma_bind_addr(), it will conflict with listener cmid
+ * b. rdma_resolve_addr() to zero addr */
+ cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ return rc;
+ }
+
+ memset(&srcaddr, 0, sizeof(srcaddr));
+ srcaddr.sin_family = AF_INET;
+ srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+ memset(&dstaddr, 0, sizeof(dstaddr));
+ dstaddr.sin_family = AF_INET;
+ rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr, 1);
+ if (rc != 0 || cmid->device == NULL) {
+ CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+ dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+ cmid->device, rc);
+ rdma_destroy_id(cmid);
+ return rc;
+ }
+
+ if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
+ /* don't need device failover */
+ rdma_destroy_id(cmid);
+ return 0;
+ }
+
+ return 1;
+}
+
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+ LIST_HEAD (zombie_tpo);
+ LIST_HEAD (zombie_ppo);
+ LIST_HEAD (zombie_fpo);
+ struct rdma_cm_id *cmid = NULL;
+ kib_hca_dev_t *hdev = NULL;
+ kib_hca_dev_t *old;
+ struct ib_pd *pd;
+ kib_net_t *net;
+ struct sockaddr_in addr;
+ unsigned long flags;
+ int rc = 0;
+ int i;
+
+ LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+ dev->ibd_can_failover ||
+ dev->ibd_hdev == NULL);
+
+ rc = kiblnd_dev_need_failover(dev);
+ if (rc <= 0)
+ goto out;
+
+ if (dev->ibd_hdev != NULL &&
+ dev->ibd_hdev->ibh_cmid != NULL) {
+ /* XXX it's not good to close old listener at here,
+ * because we can fail to create new listener.
+ * But we have to close it now, otherwise rdma_bind_addr
+ * will return EADDRINUSE... How crap! */
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ cmid = dev->ibd_hdev->ibh_cmid;
+ /* make next schedule of kiblnd_dev_need_failover()
+ * return 1 for me */
+ dev->ibd_hdev->ibh_cmid = NULL;
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ rdma_destroy_id(cmid);
+ }
+
+ cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+ IB_QPT_RC);
+ if (IS_ERR(cmid)) {
+ rc = PTR_ERR(cmid);
+ CERROR("Failed to create cmid for failover: %d\n", rc);
+ goto out;
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+ addr.sin_port = htons(*kiblnd_tunables.kib_service);
+
+ /* Bind to failover device or port */
+ rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+ if (rc != 0 || cmid->device == NULL) {
+ CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+ dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+ cmid->device, rc);
+ rdma_destroy_id(cmid);
+ goto out;
+ }
+
+ LIBCFS_ALLOC(hdev, sizeof(*hdev));
+ if (hdev == NULL) {
+ CERROR("Failed to allocate kib_hca_dev\n");
+ rdma_destroy_id(cmid);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ atomic_set(&hdev->ibh_ref, 1);
+ hdev->ibh_dev = dev;
+ hdev->ibh_cmid = cmid;
+ hdev->ibh_ibdev = cmid->device;
+
+ pd = ib_alloc_pd(cmid->device);
+ if (IS_ERR(pd)) {
+ rc = PTR_ERR(pd);
+ CERROR("Can't allocate PD: %d\n", rc);
+ goto out;
+ }
+
+ hdev->ibh_pd = pd;
+
+ rc = rdma_listen(cmid, 0);
+ if (rc != 0) {
+ CERROR("Can't start new listener: %d\n", rc);
+ goto out;
+ }
+
+ rc = kiblnd_hdev_setup_mrs(hdev);
+ if (rc != 0) {
+ CERROR("Can't setup device: %d\n", rc);
+ goto out;
+ }
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ old = dev->ibd_hdev;
+ dev->ibd_hdev = hdev; /* take over the refcount */
+ hdev = old;
+
+ list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+ cfs_cpt_for_each(i, lnet_cpt_table()) {
+ kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+ &zombie_tpo);
+
+ if (net->ibn_fmr_ps != NULL) {
+ kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+ &zombie_fpo);
+
+ } else if (net->ibn_pmr_ps != NULL) {
+ kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+ pps_poolset, &zombie_ppo);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+ if (!list_empty(&zombie_tpo))
+ kiblnd_destroy_pool_list(&zombie_tpo);
+ if (!list_empty(&zombie_ppo))
+ kiblnd_destroy_pool_list(&zombie_ppo);
+ if (!list_empty(&zombie_fpo))
+ kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+ if (hdev != NULL)
+ kiblnd_hdev_decref(hdev);
+
+ if (rc != 0)
+ dev->ibd_failed_failover++;
+ else
+ dev->ibd_failed_failover = 0;
+
+ return rc;
+}
+
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+ LASSERT (dev->ibd_nnets == 0);
+ LASSERT (list_empty(&dev->ibd_nets));
+
+ list_del(&dev->ibd_fail_list);
+ list_del(&dev->ibd_list);
+
+ if (dev->ibd_hdev != NULL)
+ kiblnd_hdev_decref(dev->ibd_hdev);
+
+ LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+kib_dev_t *
+kiblnd_create_dev(char *ifname)
+{
+ struct net_device *netdev;
+ kib_dev_t *dev;
+ __u32 netmask;
+ __u32 ip;
+ int up;
+ int rc;
+
+ rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Can't query IPoIB interface %s: %d\n",
+ ifname, rc);
+ return NULL;
+ }
+
+ if (!up) {
+ CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(dev, sizeof(*dev));
+ if (dev == NULL)
+ return NULL;
+
+ memset(dev, 0, sizeof(*dev));
+ netdev = dev_get_by_name(&init_net, ifname);
+ if (netdev == NULL) {
+ dev->ibd_can_failover = 0;
+ } else {
+ dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+ dev_put(netdev);
+ }
+
+ INIT_LIST_HEAD(&dev->ibd_nets);
+ INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+ INIT_LIST_HEAD(&dev->ibd_fail_list);
+ dev->ibd_ifip = ip;
+ strcpy(&dev->ibd_ifname[0], ifname);
+
+ /* initialize the device */
+ rc = kiblnd_dev_failover(dev);
+ if (rc != 0) {
+ CERROR("Can't initialize device: %d\n", rc);
+ LIBCFS_FREE(dev, sizeof(*dev));
+ return NULL;
+ }
+
+ list_add_tail(&dev->ibd_list,
+ &kiblnd_data.kib_devs);
+ return dev;
+}
+
+void
+kiblnd_base_shutdown(void)
+{
+ struct kib_sched_info *sched;
+ int i;
+
+ LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+ CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ switch (kiblnd_data.kib_init) {
+ default:
+ LBUG();
+
+ case IBLND_INIT_ALL:
+ case IBLND_INIT_DATA:
+ LASSERT (kiblnd_data.kib_peers != NULL);
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+ LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+ }
+ LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+ LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+ /* flag threads to terminate; wake and wait for them to die */
+ kiblnd_data.kib_shutdown = 1;
+
+ /* NB: we really want to stop scheduler threads net by net
+ * instead of the whole module, this should be improved
+ * with dynamic configuration LNet */
+ cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+ wake_up_all(&sched->ibs_waitq);
+
+ wake_up_all(&kiblnd_data.kib_connd_waitq);
+ wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+ i = 2;
+ while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "Waiting for %d threads to terminate\n",
+ atomic_read(&kiblnd_data.kib_nthreads));
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ /* fall through */
+
+ case IBLND_INIT_NOTHING:
+ break;
+ }
+
+ if (kiblnd_data.kib_peers != NULL) {
+ LIBCFS_FREE(kiblnd_data.kib_peers,
+ sizeof(struct list_head) *
+ kiblnd_data.kib_peer_hash_size);
+ }
+
+ if (kiblnd_data.kib_scheds != NULL)
+ cfs_percpt_free(kiblnd_data.kib_scheds);
+
+ CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+ module_put(THIS_MODULE);
+}
+
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+ kib_net_t *net = ni->ni_data;
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ int i;
+ unsigned long flags;
+
+ LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+ if (net == NULL)
+ goto out;
+
+ CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ write_lock_irqsave(g_lock, flags);
+ net->ibn_shutdown = 1;
+ write_unlock_irqrestore(g_lock, flags);
+
+ switch (net->ibn_init) {
+ default:
+ LBUG();
+
+ case IBLND_INIT_ALL:
+ /* nuke all existing peers within this net */
+ kiblnd_del_peer(ni, LNET_NID_ANY);
+
+ /* Wait for all peer state to clean up */
+ i = 2;
+ while (atomic_read(&net->ibn_npeers) != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+ "%s: waiting for %d peers to disconnect\n",
+ libcfs_nid2str(ni->ni_nid),
+ atomic_read(&net->ibn_npeers));
+ cfs_pause(cfs_time_seconds(1));
+ }
+
+ kiblnd_net_fini_pools(net);
+
+ write_lock_irqsave(g_lock, flags);
+ LASSERT(net->ibn_dev->ibd_nnets > 0);
+ net->ibn_dev->ibd_nnets--;
+ list_del(&net->ibn_list);
+ write_unlock_irqrestore(g_lock, flags);
+
+ /* fall through */
+
+ case IBLND_INIT_NOTHING:
+ LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+ if (net->ibn_dev != NULL &&
+ net->ibn_dev->ibd_nnets == 0)
+ kiblnd_destroy_dev(net->ibn_dev);
+
+ break;
+ }
+
+ CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+ atomic_read(&libcfs_kmemory));
+
+ net->ibn_init = IBLND_INIT_NOTHING;
+ ni->ni_data = NULL;
+
+ LIBCFS_FREE(net, sizeof(*net));
+
+out:
+ if (list_empty(&kiblnd_data.kib_devs))
+ kiblnd_base_shutdown();
+ return;
+}
+
+int
+kiblnd_base_startup(void)
+{
+ struct kib_sched_info *sched;
+ int rc;
+ int i;
+
+ LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+ try_module_get(THIS_MODULE);
+ memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+ rwlock_init(&kiblnd_data.kib_global_lock);
+
+ INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+ INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+ kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+ LIBCFS_ALLOC(kiblnd_data.kib_peers,
+ sizeof(struct list_head) *
+ kiblnd_data.kib_peer_hash_size);
+ if (kiblnd_data.kib_peers == NULL) {
+ goto failed;
+ }
+ for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+ INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+ spin_lock_init(&kiblnd_data.kib_connd_lock);
+ INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+ INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+ init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+ init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+ kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*sched));
+ if (kiblnd_data.kib_scheds == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+ int nthrs;
+
+ spin_lock_init(&sched->ibs_lock);
+ INIT_LIST_HEAD(&sched->ibs_conns);
+ init_waitqueue_head(&sched->ibs_waitq);
+
+ nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+ if (*kiblnd_tunables.kib_nscheds > 0) {
+ nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+ } else {
+ /* max to half of CPUs, another half is reserved for
+ * upper layer modules */
+ nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+ }
+
+ sched->ibs_nthreads_max = nthrs;
+ sched->ibs_cpt = i;
+ }
+
+ kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+ /* lists/ptrs/locks initialised */
+ kiblnd_data.kib_init = IBLND_INIT_DATA;
+ /*****************************************************/
+
+ rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+ if (rc != 0) {
+ CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+ goto failed;
+ }
+
+ if (*kiblnd_tunables.kib_dev_failover != 0)
+ rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+ "kiblnd_failover");
+
+ if (rc != 0) {
+ CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ kiblnd_data.kib_init = IBLND_INIT_ALL;
+ /*****************************************************/
+
+ return 0;
+
+ failed:
+ kiblnd_base_shutdown();
+ return -ENETDOWN;
+}
+
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+ int rc = 0;
+ int nthrs;
+ int i;
+
+ if (sched->ibs_nthreads == 0) {
+ if (*kiblnd_tunables.kib_nscheds > 0) {
+ nthrs = sched->ibs_nthreads_max;
+ } else {
+ nthrs = cfs_cpt_weight(lnet_cpt_table(),
+ sched->ibs_cpt);
+ nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+ nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+ }
+ } else {
+ LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+ /* increase one thread if there is new interface */
+ nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+ }
+
+ for (i = 0; i < nthrs; i++) {
+ long id;
+ char name[20];
+ id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+ snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+ KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+ rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+ if (rc == 0)
+ continue;
+
+ CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+ sched->ibs_cpt, sched->ibs_nthreads + i, rc);
+ break;
+ }
+
+ sched->ibs_nthreads += i;
+ return rc;
+}
+
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+ int cpt;
+ int rc;
+ int i;
+
+ for (i = 0; i < ncpts; i++) {
+ struct kib_sched_info *sched;
+
+ cpt = (cpts == NULL) ? i : cpts[i];
+ sched = kiblnd_data.kib_scheds[cpt];
+
+ if (!newdev && sched->ibs_nthreads > 0)
+ continue;
+
+ rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
+ if (rc != 0) {
+ CERROR("Failed to start scheduler threads for %s\n",
+ dev->ibd_ifname);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+kib_dev_t *
+kiblnd_dev_search(char *ifname)
+{
+ kib_dev_t *alias = NULL;
+ kib_dev_t *dev;
+ char *colon;
+ char *colon2;
+
+ colon = strchr(ifname, ':');
+ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ return dev;
+
+ if (alias != NULL)
+ continue;
+
+ colon2 = strchr(dev->ibd_ifname, ':');
+ if (colon != NULL)
+ *colon = 0;
+ if (colon2 != NULL)
+ *colon2 = 0;
+
+ if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+ alias = dev;
+
+ if (colon != NULL)
+ *colon = ':';
+ if (colon2 != NULL)
+ *colon2 = ':';
+ }
+ return alias;
+}
+
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+ char *ifname;
+ kib_dev_t *ibdev = NULL;
+ kib_net_t *net;
+ struct timeval tv;
+ unsigned long flags;
+ int rc;
+ int newdev;
+
+ LASSERT (ni->ni_lnd == &the_o2iblnd);
+
+ if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+ rc = kiblnd_base_startup();
+ if (rc != 0)
+ return rc;
+ }
+
+ LIBCFS_ALLOC(net, sizeof(*net));
+ ni->ni_data = net;
+ if (net == NULL)
+ goto failed;
+
+ memset(net, 0, sizeof(*net));
+
+ do_gettimeofday(&tv);
+ net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+ ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout;
+ ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
+ ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits;
+ ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+ if (ni->ni_interfaces[0] != NULL) {
+ /* Use the IPoIB interface specified in 'networks=' */
+
+ CLASSERT (LNET_MAX_INTERFACES > 1);
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Multiple interfaces not supported\n");
+ goto failed;
+ }
+
+ ifname = ni->ni_interfaces[0];
+ } else {
+ ifname = *kiblnd_tunables.kib_default_ipif;
+ }
+
+ if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+ CERROR("IPoIB interface name too long: %s\n", ifname);
+ goto failed;
+ }
+
+ ibdev = kiblnd_dev_search(ifname);
+
+ newdev = ibdev == NULL;
+ /* hmm...create kib_dev even for alias */
+ if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+ ibdev = kiblnd_create_dev(ifname);
+
+ if (ibdev == NULL)
+ goto failed;
+
+ net->ibn_dev = ibdev;
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+ rc = kiblnd_dev_start_threads(ibdev, newdev,
+ ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0)
+ goto failed;
+
+ rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0) {
+ CERROR("Failed to initialize NI pools: %d\n", rc);
+ goto failed;
+ }
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ ibdev->ibd_nnets++;
+ list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ net->ibn_init = IBLND_INIT_ALL;
+
+ return 0;
+
+failed:
+ if (net->ibn_dev == NULL && ibdev != NULL)
+ kiblnd_destroy_dev(ibdev);
+
+ kiblnd_shutdown(ni);
+
+ CDEBUG(D_NET, "kiblnd_startup failed\n");
+ return -ENETDOWN;
+}
+
+void __exit
+kiblnd_module_fini (void)
+{
+ lnet_unregister_lnd(&the_o2iblnd);
+ kiblnd_tunables_fini();
+}
+
+int __init
+kiblnd_module_init (void)
+{
+ int rc;
+
+ CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+ CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+ <= IBLND_MSG_SIZE);
+ CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+ <= IBLND_MSG_SIZE);
+
+ rc = kiblnd_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&the_o2iblnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644
index 000000000000..e4626bf82fc7
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED 100
+
+#define IBLND_N_SCHED 2
+#define IBLND_N_SCHED_HIGH 4
+
+typedef struct
+{
+ int *kib_dev_failover; /* HCA failover */
+ unsigned int *kib_service; /* IB service number */
+ int *kib_min_reconnect_interval; /* first failed connection retry... */
+ int *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+ int *kib_cksum; /* checksum kib_msg_t? */
+ int *kib_timeout; /* comms timeout (seconds) */
+ int *kib_keepalive; /* keepalive timeout (seconds) */
+ int *kib_ntx; /* # tx descs */
+ int *kib_credits; /* # concurrent sends */
+ int *kib_peertxcredits; /* # concurrent sends to 1 peer */
+ int *kib_peerrtrcredits; /* # per-peer router buffer credits */
+ int *kib_peercredits_hiw; /* # when eagerly to return credits */
+ int *kib_peertimeout; /* seconds to consider peer dead */
+ char **kib_default_ipif; /* default IPoIB interface */
+ int *kib_retry_count;
+ int *kib_rnr_retry_count;
+ int *kib_concurrent_sends; /* send work queue sizing */
+ int *kib_ib_mtu; /* IB MTU */
+ int *kib_map_on_demand; /* map-on-demand if RD has more fragments
+ * than this value, 0 disable map-on-demand */
+ int *kib_pmr_pool_size; /* # physical MR in pool */
+ int *kib_fmr_pool_size; /* # FMRs in pool */
+ int *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+ int *kib_fmr_cache; /* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+ ctl_table_header_t *kib_sysctl; /* sysctl interface */
+#endif
+ int *kib_require_priv_port;/* accept only privileged ports */
+ int *kib_use_priv_port; /* use privileged port for active connect */
+ /* # threads on each CPT */
+ int *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */
+#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_MSG_QUEUE_SIZE_V1 : \
+ *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_CREDIT_HIGHWATER_V1 : \
+ *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+ if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+ return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ kiblnd_concurrent_sends_v1() : \
+ *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+ *kiblnd_tunables.kib_map_on_demand : \
+ IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so don't need give a very large value */
+#define IBLND_TX_POOL 256
+#define IBLND_PMR_POOL 256
+#define IBLND_FMR_POOL 256
+#define IBLND_FMR_POOL_FLUSH 192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE IFALIASZ
+#else
+#define KIB_IFNAME_SIZE 256
+#endif
+
+typedef struct
+{
+ struct list_head ibd_list; /* chain on kib_devs */
+ struct list_head ibd_fail_list; /* chain on kib_failed_devs */
+ __u32 ibd_ifip; /* IPoIB interface IP */
+ /** IPoIB interface name */
+ char ibd_ifname[KIB_IFNAME_SIZE];
+ int ibd_nnets; /* # nets extant */
+
+ cfs_time_t ibd_next_failover;
+ int ibd_failed_failover; /* # failover failures */
+ unsigned int ibd_failover; /* failover in progress */
+ unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */
+ struct list_head ibd_nets;
+ struct kib_hca_dev *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+ struct rdma_cm_id *ibh_cmid; /* listener cmid */
+ struct ib_device *ibh_ibdev; /* IB device */
+ int ibh_page_shift; /* page shift of current HCA */
+ int ibh_page_size; /* page size of current HCA */
+ __u64 ibh_page_mask; /* page mask of current HCA */
+ int ibh_mr_shift; /* bits shift of max MR size */
+ __u64 ibh_mr_size; /* size of MR */
+ int ibh_nmrs; /* # of global MRs */
+ struct ib_mr **ibh_mrs; /* global MR */
+ struct ib_pd *ibh_pd; /* PD */
+ kib_dev_t *ibh_dev; /* owner */
+ atomic_t ibh_ref; /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE 300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY 1
+
+typedef struct
+{
+ int ibp_npages; /* # pages */
+ struct page *ibp_pages[0]; /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+ struct list_head pmr_list; /* chain node */
+ struct ib_phys_buf *pmr_ipb; /* physical buffer */
+ struct ib_mr *pmr_mr; /* IB MR */
+ struct kib_pmr_pool *pmr_pool; /* owner of this MR */
+ __u64 pmr_iova; /* Virtual I/O address */
+ int pmr_refcount; /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+ int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN 32
+
+typedef struct kib_poolset
+{
+ spinlock_t ps_lock; /* serialize */
+ struct kib_net *ps_net; /* network it belongs to */
+ char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+ struct list_head ps_pool_list; /* list of pools */
+ struct list_head ps_failed_pool_list; /* failed pool list */
+ cfs_time_t ps_next_retry; /* time stamp for retry if failed to allocate */
+ int ps_increasing; /* is allocating new pool */
+ int ps_pool_size; /* new pool size */
+ int ps_cpt; /* CPT id */
+
+ kib_ps_pool_create_t ps_pool_create; /* create a new pool */
+ kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */
+ kib_ps_node_init_t ps_node_init; /* initialize new allocated node */
+ kib_ps_node_fini_t ps_node_fini; /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+ struct list_head po_list; /* chain on pool list */
+ struct list_head po_free_list; /* pre-allocated node */
+ kib_poolset_t *po_owner; /* pool_set of this pool */
+ cfs_time_t po_deadline; /* deadline of this pool */
+ int po_allocated; /* # of elements in use */
+ int po_failed; /* pool is created on failed HCA */
+ int po_size; /* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+ kib_poolset_t tps_poolset; /* pool-set */
+ __u64 tps_next_tx_cookie; /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+ kib_pool_t tpo_pool; /* pool */
+ struct kib_hca_dev *tpo_hdev; /* device for this pool */
+ struct kib_tx *tpo_tx_descs; /* all the tx descriptors */
+ kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+ kib_poolset_t pps_poolset; /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+ struct kib_hca_dev *ppo_hdev; /* device for this pool */
+ kib_pool_t ppo_pool; /* pool */
+} kib_pmr_pool_t;
+
+typedef struct
+{
+ spinlock_t fps_lock; /* serialize */
+ struct kib_net *fps_net; /* IB network */
+ struct list_head fps_pool_list; /* FMR pool list */
+ struct list_head fps_failed_pool_list; /* FMR pool list */
+ __u64 fps_version; /* validity stamp */
+ int fps_cpt; /* CPT id */
+ int fps_pool_size;
+ int fps_flush_trigger;
+ /* is allocating new pool */
+ int fps_increasing;
+ /* time stamp for retry if failed to allocate */
+ cfs_time_t fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct
+{
+ struct list_head fpo_list; /* chain on pool list */
+ struct kib_hca_dev *fpo_hdev; /* device for this pool */
+ kib_fmr_poolset_t *fpo_owner; /* owner of this pool */
+ struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ cfs_time_t fpo_deadline; /* deadline of this pool */
+ int fpo_failed; /* fmr pool is failed */
+ int fpo_map_count; /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+ struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
+ kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+ struct list_head ibn_list; /* chain on kib_dev_t::ibd_nets */
+ __u64 ibn_incarnation; /* my epoch */
+ int ibn_init; /* initialisation state */
+ int ibn_shutdown; /* shutting down? */
+
+ atomic_t ibn_npeers; /* # peers extant */
+ atomic_t ibn_nconns; /* # connections extant */
+
+ kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */
+ kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */
+ kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */
+
+ kib_dev_t *ibn_dev; /* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT 16
+#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
+
+struct kib_sched_info {
+ /* serialise */
+ spinlock_t ibs_lock;
+ /* schedulers sleep here */
+ wait_queue_head_t ibs_waitq;
+ /* conns to check for rx completions */
+ struct list_head ibs_conns;
+ /* number of scheduler threads */
+ int ibs_nthreads;
+ /* max allowed scheduler threads */
+ int ibs_nthreads_max;
+ int ibs_cpt; /* CPT id */
+};
+
+typedef struct
+{
+ int kib_init; /* initialisation state */
+ int kib_shutdown; /* shut down? */
+ struct list_head kib_devs; /* IB devices extant */
+ /* list head of failed devices */
+ struct list_head kib_failed_devs;
+ /* schedulers sleep here */
+ wait_queue_head_t kib_failover_waitq;
+ atomic_t kib_nthreads; /* # live threads */
+ /* stabilize net/dev/peer/conn ops */
+ rwlock_t kib_global_lock;
+ /* hash table of all my known peers */
+ struct list_head *kib_peers;
+ /* size of kib_peers */
+ int kib_peer_hash_size;
+ /* the connd task (serialisation assertions) */
+ void *kib_connd;
+ /* connections to setup/teardown */
+ struct list_head kib_connd_conns;
+ /* connections with zero refcount */
+ struct list_head kib_connd_zombies;
+ /* connection daemon sleeps here */
+ wait_queue_head_t kib_connd_waitq;
+ spinlock_t kib_connd_lock; /* serialise */
+ struct ib_qp_attr kib_error_qpa; /* QP->ERROR */
+ /* percpt data for schedulers */
+ struct kib_sched_info **kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING 0
+#define IBLND_INIT_DATA 1
+#define IBLND_INIT_ALL 2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams
+{
+ __u16 ibcp_queue_depth;
+ __u16 ibcp_max_frags;
+ __u32 ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+ lnet_hdr_t ibim_hdr; /* portals header */
+ char ibim_payload[0]; /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+ __u32 rf_nob; /* # bytes this frag */
+ __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+ __u32 rd_key; /* local/remote key */
+ __u32 rd_nfrags; /* # fragments */
+ kib_rdma_frag_t rd_frags[0]; /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct
+{
+ lnet_hdr_t ibprm_hdr; /* portals header */
+ __u64 ibprm_cookie; /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+ __u64 ibpam_src_cookie; /* reflected completion cookie */
+ __u64 ibpam_dst_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+ lnet_hdr_t ibgm_hdr; /* portals header */
+ __u64 ibgm_cookie; /* opaque completion cookie */
+ kib_rdma_desc_t ibgm_rd; /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+ __u64 ibcm_cookie; /* opaque completion cookie */
+ __s32 ibcm_status; /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+ /* First 2 fields fixed FOR ALL TIME */
+ __u32 ibm_magic; /* I'm an ibnal message */
+ __u16 ibm_version; /* this is my version number */
+
+ __u8 ibm_type; /* msg type */
+ __u8 ibm_credits; /* returned credits */
+ __u32 ibm_nob; /* # bytes in whole message */
+ __u32 ibm_cksum; /* checksum (0 == no checksum) */
+ __u64 ibm_srcnid; /* sender's NID */
+ __u64 ibm_srcstamp; /* sender's incarnation */
+ __u64 ibm_dstnid; /* destination's NID */
+ __u64 ibm_dststamp; /* destination's incarnation */
+
+ union {
+ kib_connparams_t connparams;
+ kib_immediate_msg_t immediate;
+ kib_putreq_msg_t putreq;
+ kib_putack_msg_t putack;
+ kib_get_msg_t get;
+ kib_completion_msg_t completion;
+ } WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */
+
+#define IBLND_MSG_VERSION_1 0x11
+#define IBLND_MSG_VERSION_2 0x12
+#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ 0xc0 /* connection request */
+#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */
+#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */
+#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
+#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
+
+typedef struct {
+ __u32 ibr_magic; /* sender's magic */
+ __u16 ibr_version; /* sender's version */
+ __u8 ibr_why; /* reject reason */
+ __u8 ibr_padding; /* padding */
+ __u64 ibr_incarnation; /* incarnation of peer */
+ kib_connparams_t ibr_cp; /* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL 3 /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE 5 /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx /* receive message */
+{
+ struct list_head rx_list; /* queue for attention */
+ struct kib_conn *rx_conn; /* owning conn */
+ int rx_nob; /* # bytes received (-1 while posted) */
+ enum ib_wc_status rx_status; /* completion status */
+ kib_msg_t *rx_msg; /* message buffer (host vaddr) */
+ __u64 rx_msgaddr; /* message buffer (I/O addr) */
+ DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */
+ struct ib_recv_wr rx_wrq; /* receive work item... */
+ struct ib_sge rx_sge; /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST 0 /* don't post */
+#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx /* transmit message */
+{
+ struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */
+ kib_tx_pool_t *tx_pool; /* pool I'm from */
+ struct kib_conn *tx_conn; /* owning conn */
+ short tx_sending; /* # tx callbacks outstanding */
+ short tx_queued; /* queued for sending */
+ short tx_waiting; /* waiting for peer */
+ int tx_status; /* LNET completion status */
+ unsigned long tx_deadline; /* completion deadline */
+ __u64 tx_cookie; /* completion cookie */
+ lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+ kib_msg_t *tx_msg; /* message buffer (host vaddr) */
+ __u64 tx_msgaddr; /* message buffer (I/O addr) */
+ DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */
+ int tx_nwrq; /* # send work items */
+ struct ib_send_wr *tx_wrq; /* send work items... */
+ struct ib_sge *tx_sge; /* ...and their memory */
+ kib_rdma_desc_t *tx_rd; /* rdma descriptor */
+ int tx_nfrags; /* # entries in... */
+ struct scatterlist *tx_frags; /* dma_map_sg descriptor */
+ __u64 *tx_pages; /* rdma phys page addrs */
+ union {
+ kib_phys_mr_t *pmr; /* MR for physical buffer */
+ kib_fmr_t fmr; /* FMR */
+ } tx_u;
+ int tx_dmadir; /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+ /* connection-in-progress variables */
+ kib_msg_t cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+ struct kib_sched_info *ibc_sched; /* scheduler information */
+ struct kib_peer *ibc_peer; /* owning peer */
+ kib_hca_dev_t *ibc_hdev; /* HCA bound on */
+ struct list_head ibc_list; /* stash on peer's conn list */
+ struct list_head ibc_sched_list; /* schedule for attention */
+ __u16 ibc_version; /* version of connection */
+ __u64 ibc_incarnation; /* which instance of the peer */
+ atomic_t ibc_refcount; /* # users */
+ int ibc_state; /* what's happening */
+ int ibc_nsends_posted; /* # uncompleted sends */
+ int ibc_noops_posted; /* # uncompleted NOOPs */
+ int ibc_credits; /* # credits I have */
+ int ibc_outstanding_credits; /* # credits to return */
+ int ibc_reserved_credits;/* # ACK/DONE msg credits */
+ int ibc_comms_error; /* set on comms error */
+ unsigned int ibc_nrx:16; /* receive buffers owned */
+ unsigned int ibc_scheduled:1; /* scheduled for attention */
+ unsigned int ibc_ready:1; /* CQ callback fired */
+ /* time of last send */
+ unsigned long ibc_last_send;
+ /** link chain for kiblnd_check_conns only */
+ struct list_head ibc_connd_list;
+ /** rxs completed before ESTABLISHED */
+ struct list_head ibc_early_rxs;
+ /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+ struct list_head ibc_tx_noops;
+ struct list_head ibc_tx_queue; /* sends that need a credit */
+ struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */
+ struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+ struct list_head ibc_active_txs; /* active tx awaiting completion */
+ spinlock_t ibc_lock; /* serialise */
+ kib_rx_t *ibc_rxs; /* the rx descs */
+ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */
+
+ struct rdma_cm_id *ibc_cmid; /* CM id */
+ struct ib_cq *ibc_cq; /* completion queue */
+
+ kib_connvars_t *ibc_connvars; /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT 0 /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED 3 /* connection established */
+#define IBLND_CONN_CLOSING 4 /* being closed */
+#define IBLND_CONN_DISCONNECTED 5 /* disconnected */
+
+typedef struct kib_peer
+{
+ struct list_head ibp_list; /* stash on global peer list */
+ lnet_nid_t ibp_nid; /* who's on the other end(s) */
+ lnet_ni_t *ibp_ni; /* LNet interface */
+ atomic_t ibp_refcount; /* # users */
+ struct list_head ibp_conns; /* all active connections */
+ struct list_head ibp_tx_queue; /* msgs waiting for a conn */
+ __u16 ibp_version; /* version of peer */
+ __u64 ibp_incarnation; /* incarnation of peer */
+ int ibp_connecting; /* current active connection attempts */
+ int ibp_accepting; /* current passive connection attempts */
+ int ibp_error; /* errno on closing this peer */
+ cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+ LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+ atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+ LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+ if (atomic_dec_and_test(&hdev->ibh_ref))
+ kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+ if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+ return 0;
+
+ if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+ return 0;
+
+ if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+ return 1;
+
+ return dev->ibd_can_failover;
+}
+
+#define kiblnd_conn_addref(conn) \
+do { \
+ CDEBUG(D_NET, "conn[%p] (%d)++\n", \
+ (conn), atomic_read(&(conn)->ibc_refcount)); \
+ atomic_inc(&(conn)->ibc_refcount); \
+} while (0)
+
+#define kiblnd_conn_decref(conn) \
+do { \
+ unsigned long flags; \
+ \
+ CDEBUG(D_NET, "conn[%p] (%d)--\n", \
+ (conn), atomic_read(&(conn)->ibc_refcount)); \
+ LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \
+ if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \
+ list_add_tail(&(conn)->ibc_list, \
+ &kiblnd_data.kib_connd_zombies); \
+ wake_up(&kiblnd_data.kib_connd_waitq); \
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+ } \
+} while (0)
+
+#define kiblnd_peer_addref(peer) \
+do { \
+ CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \
+ (peer), libcfs_nid2str((peer)->ibp_nid), \
+ atomic_read (&(peer)->ibp_refcount)); \
+ atomic_inc(&(peer)->ibp_refcount); \
+} while (0)
+
+#define kiblnd_peer_decref(peer) \
+do { \
+ CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \
+ (peer), libcfs_nid2str((peer)->ibp_nid), \
+ atomic_read (&(peer)->ibp_refcount)); \
+ LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \
+ if (atomic_dec_and_test(&(peer)->ibp_refcount)) \
+ kiblnd_destroy_peer(peer); \
+} while (0)
+
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+ unsigned int hash =
+ ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+
+ return (&kiblnd_data.kib_peers [hash]);
+}
+
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+ /* Am I in the peer hash table? */
+ return (!list_empty(&peer->ibp_list));
+}
+
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+ LASSERT (!list_empty(&peer->ibp_conns));
+
+ /* just return the first connection */
+ return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+}
+
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+ return (*kiblnd_tunables.kib_keepalive > 0) &&
+ cfs_time_after(jiffies, conn->ibc_last_send +
+ *kiblnd_tunables.kib_keepalive*HZ);
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+ LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ if (conn->ibc_outstanding_credits <
+ IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+ !kiblnd_send_keepalive(conn))
+ return 0; /* No need to send NOOP */
+
+ if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+ if (!list_empty(&conn->ibc_tx_queue_nocred))
+ return 0; /* NOOP can be piggybacked */
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&conn->ibc_tx_queue) ||
+ conn->ibc_credits == 0);
+ }
+
+ if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+ !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+ conn->ibc_credits == 0) /* no credit */
+ return 0;
+
+ if (conn->ibc_credits == 1 && /* last credit reserved for */
+ conn->ibc_outstanding_credits == 0) /* giving back credits */
+ return 0;
+
+ /* No tx to piggyback NOOP onto or no credit to send a tx */
+ return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+ ib_modify_qp(conn->ibc_cmid->qp,
+ &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+ if (q == &conn->ibc_tx_queue)
+ return "tx_queue";
+
+ if (q == &conn->ibc_tx_queue_rsrvd)
+ return "tx_queue_rsrvd";
+
+ if (q == &conn->ibc_tx_queue_nocred)
+ return "tx_queue_nocred";
+
+ if (q == &conn->ibc_active_txs)
+ return "active_txs";
+
+ LBUG();
+ return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX 0
+#define IBLND_WID_RDMA 1
+#define IBLND_WID_RX 2
+#define IBLND_WID_MASK 3UL
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+ unsigned long lptr = (unsigned long)ptr;
+
+ LASSERT ((lptr & IBLND_WID_MASK) == 0);
+ LASSERT ((type & ~IBLND_WID_MASK) == 0);
+ return (__u64)(lptr | type);
+}
+
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+ return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
+}
+
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+ return (wreqid & IBLND_WID_MASK);
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+ conn->ibc_state = state;
+ mb();
+}
+
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+ msg->ibm_type = type;
+ msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+ int i;
+ int size;
+
+ for (i = size = 0; i < rd->rd_nfrags; i++)
+ size += rd->rd_frags[i].rf_nob;
+
+ return size;
+}
+
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+ return rd->rd_key;
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+ if (nob < rd->rd_frags[index].rf_nob) {
+ rd->rd_frags[index].rf_addr += nob;
+ rd->rd_frags[index].rf_nob -= nob;
+ } else {
+ index ++;
+ }
+
+ return index;
+}
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+ LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+ msgtype == IBLND_MSG_PUT_ACK);
+
+ return msgtype == IBLND_MSG_GET_REQ ?
+ offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+ offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+ void *msg, size_t size,
+ enum dma_data_direction direction)
+{
+ return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+ __u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a) (a)
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly
+ * right because OFED1.2 defines it as const, to use it we have to add
+ * (void *) cast to overcome "const" */
+
+#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+ __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+ int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+ kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int kiblnd_connd (void *arg);
+int kiblnd_scheduler(void *arg);
+int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int kiblnd_failover_thread (void *arg);
+
+int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int kiblnd_cm_callback(struct rdma_cm_id *cmid,
+ struct rdma_cm_event *event);
+int kiblnd_translate_mtu(int value);
+
+int kiblnd_dev_failover(kib_dev_t *dev);
+int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+ int version, __u64 incarnation);
+int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+ int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+ int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+ int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+ int credits, lnet_nid_t dstnid, __u64 dststamp);
+int kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644
index 000000000000..cc6232126dd0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+ lnet_msg_t *lntmsg[2];
+ kib_net_t *net = ni->ni_data;
+ int rc;
+ int i;
+
+ LASSERT (net != NULL);
+ LASSERT (!in_interrupt());
+ LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
+ LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
+ LASSERT (tx->tx_pool != NULL);
+
+ kiblnd_unmap_tx(ni, tx);
+
+ /* tx may have up to 2 lnet msgs to finalise */
+ lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+ lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+ rc = tx->tx_status;
+
+ if (tx->tx_conn != NULL) {
+ LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+ kiblnd_conn_decref(tx->tx_conn);
+ tx->tx_conn = NULL;
+ }
+
+ tx->tx_nwrq = 0;
+ tx->tx_status = 0;
+
+ kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+ /* delay finalize until my descs have been freed */
+ for (i = 0; i < 2; i++) {
+ if (lntmsg[i] == NULL)
+ continue;
+
+ lnet_finalize(ni, lntmsg[i], rc);
+ }
+}
+
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+ kib_tx_t *tx;
+
+ while (!list_empty (txlist)) {
+ tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ /* complete now */
+ tx->tx_waiting = 0;
+ tx->tx_status = status;
+ kiblnd_tx_done(ni, tx);
+ }
+}
+
+kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+ kib_net_t *net = (kib_net_t *)ni->ni_data;
+ struct list_head *node;
+ kib_tx_t *tx;
+ kib_tx_poolset_t *tps;
+
+ tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+ node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+ if (node == NULL)
+ return NULL;
+ tx = container_of(node, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_nwrq == 0);
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_sending == 0);
+ LASSERT (!tx->tx_waiting);
+ LASSERT (tx->tx_status == 0);
+ LASSERT (tx->tx_conn == NULL);
+ LASSERT (tx->tx_lntmsg[0] == NULL);
+ LASSERT (tx->tx_lntmsg[1] == NULL);
+ LASSERT (tx->tx_u.pmr == NULL);
+ LASSERT (tx->tx_nfrags == 0);
+
+ return tx;
+}
+
+void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ LASSERT(conn->ibc_nrx > 0);
+ conn->ibc_nrx--;
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ kiblnd_conn_decref(conn);
+}
+
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+ kib_conn_t *conn = rx->rx_conn;
+ kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data;
+ struct ib_recv_wr *bad_wrq = NULL;
+ struct ib_mr *mr;
+ int rc;
+
+ LASSERT (net != NULL);
+ LASSERT (!in_interrupt());
+ LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+ credit == IBLND_POSTRX_PEER_CREDIT ||
+ credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+ mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+ LASSERT (mr != NULL);
+
+ rx->rx_sge.lkey = mr->lkey;
+ rx->rx_sge.addr = rx->rx_msgaddr;
+ rx->rx_sge.length = IBLND_MSG_SIZE;
+
+ rx->rx_wrq.next = NULL;
+ rx->rx_wrq.sg_list = &rx->rx_sge;
+ rx->rx_wrq.num_sge = 1;
+ rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+ LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+ LASSERT (rx->rx_nob >= 0); /* not posted */
+
+ if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+ kiblnd_drop_rx(rx); /* No more posts for this rx */
+ return 0;
+ }
+
+ rx->rx_nob = -1; /* flag posted */
+
+ rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+ if (rc != 0) {
+ CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+ rx->rx_nob = 0;
+ }
+
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+ return rc;
+
+ if (rc != 0) {
+ kiblnd_close_conn(conn, rc);
+ kiblnd_drop_rx(rx); /* No more posts for this rx */
+ return rc;
+ }
+
+ if (credit == IBLND_POSTRX_NO_CREDIT)
+ return 0;
+
+ spin_lock(&conn->ibc_lock);
+ if (credit == IBLND_POSTRX_PEER_CREDIT)
+ conn->ibc_outstanding_credits++;
+ else
+ conn->ibc_reserved_credits++;
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+ return 0;
+}
+
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+ struct list_head *tmp;
+
+ list_for_each(tmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+ if (tx->tx_cookie != cookie)
+ continue;
+
+ if (tx->tx_waiting &&
+ tx->tx_msg->ibm_type == txtype)
+ return tx;
+
+ CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+ tx->tx_waiting ? "" : "NOT ",
+ tx->tx_msg->ibm_type, txtype);
+ }
+ return NULL;
+}
+
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+ kib_tx_t *tx;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ int idle;
+
+ spin_lock(&conn->ibc_lock);
+
+ tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+ if (tx == NULL) {
+ spin_unlock(&conn->ibc_lock);
+
+ CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_close_conn(conn, -EPROTO);
+ return;
+ }
+
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ } else if (txtype == IBLND_MSG_GET_REQ) {
+ lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+ }
+ }
+
+ tx->tx_waiting = 0;
+
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
+ list_del(&tx->tx_list);
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kiblnd_tx_done(ni, tx);
+}
+
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+
+ if (tx == NULL) {
+ CERROR("Can't get tx for completion %x for %s\n",
+ type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ tx->tx_msg->ibm_u.completion.ibcm_status = status;
+ tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+ kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+ kiblnd_queue_tx(tx, conn);
+}
+
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ int credits = msg->ibm_credits;
+ kib_tx_t *tx;
+ int rc = 0;
+ int rc2;
+ int post_credit;
+
+ LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ CDEBUG (D_NET, "Received %x[%d] from %s\n",
+ msg->ibm_type, credits,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+ if (credits != 0) {
+ /* Have I received credits that will let me send? */
+ spin_lock(&conn->ibc_lock);
+
+ if (conn->ibc_credits + credits >
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+ rc2 = conn->ibc_credits;
+ spin_unlock(&conn->ibc_lock);
+
+ CERROR("Bad credits from %s: %d + %d > %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc2, credits,
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+ kiblnd_close_conn(conn, -EPROTO);
+ kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+ return;
+ }
+
+ conn->ibc_credits += credits;
+
+ /* This ensures the credit taken by NOOP can be returned */
+ if (msg->ibm_type == IBLND_MSG_NOOP &&
+ !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+ conn->ibc_outstanding_credits++;
+
+ spin_unlock(&conn->ibc_lock);
+ kiblnd_check_sends(conn);
+ }
+
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Bad IBLND message type %x from %s\n",
+ msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ rc = -EPROTO;
+ break;
+
+ case IBLND_MSG_NOOP:
+ if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ break;
+ }
+
+ if (credits != 0) /* credit already posted */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ else /* a keepalive NOOP */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_IMMEDIATE:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+ msg->ibm_srcnid, rx, 0);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_PUT_REQ:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ CWARN ("PUT_NACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBLND_MSG_PUT_ACK:
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+ spin_lock(&conn->ibc_lock);
+ tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+ msg->ibm_u.putack.ibpam_src_cookie);
+ if (tx != NULL)
+ list_del(&tx->tx_list);
+ spin_unlock(&conn->ibc_lock);
+
+ if (tx == NULL) {
+ CERROR("Unmatched PUT_ACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
+ }
+
+ LASSERT (tx->tx_waiting);
+ /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+ * (a) I can overwrite tx_msg since my peer has received it!
+ * (b) tx_waiting set tells tx_complete() it's not done. */
+
+ tx->tx_nwrq = 0; /* overwrite PUT_REQ */
+
+ rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+ kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+ &msg->ibm_u.putack.ibpam_rd,
+ msg->ibm_u.putack.ibpam_dst_cookie);
+ if (rc2 < 0)
+ CERROR("Can't setup rdma for PUT to %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+ spin_lock(&conn->ibc_lock);
+ tx->tx_waiting = 0; /* clear waiting and queue atomically */
+ kiblnd_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
+ break;
+
+ case IBLND_MSG_PUT_DONE:
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBLND_MSG_GET_REQ:
+ post_credit = IBLND_POSTRX_DONT_POST;
+ rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+ msg->ibm_srcnid, rx, 1);
+ if (rc < 0) /* repost on error */
+ post_credit = IBLND_POSTRX_PEER_CREDIT;
+ break;
+
+ case IBLND_MSG_GET_DONE:
+ post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+ kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+ }
+
+ if (rc < 0) /* protocol error */
+ kiblnd_close_conn(conn, rc);
+
+ if (post_credit != IBLND_POSTRX_DONT_POST)
+ kiblnd_post_rx(rx, post_credit);
+}
+
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_net_t *net = ni->ni_data;
+ int rc;
+ int err = -EIO;
+
+ LASSERT (net != NULL);
+ LASSERT (rx->rx_nob < 0); /* was posted */
+ rx->rx_nob = 0; /* isn't now */
+
+ if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+ goto ignore;
+
+ if (status != IB_WC_SUCCESS) {
+ CNETERR("Rx from %s failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+ goto failed;
+ }
+
+ LASSERT (nob >= 0);
+ rx->rx_nob = nob;
+
+ rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking rx from %s\n",
+ rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ goto failed;
+ }
+
+ if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+ msg->ibm_dstnid != ni->ni_nid ||
+ msg->ibm_srcstamp != conn->ibc_incarnation ||
+ msg->ibm_dststamp != net->ibn_incarnation) {
+ CERROR ("Stale rx from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ err = -ESTALE;
+ goto failed;
+ }
+
+ /* set time last known alive */
+ kiblnd_peer_alive(conn->ibc_peer);
+
+ /* racing with connection establishment/teardown! */
+
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ unsigned long flags;
+
+ write_lock_irqsave(g_lock, flags);
+ /* must check holding global lock to eliminate race */
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+ write_unlock_irqrestore(g_lock, flags);
+ return;
+ }
+ write_unlock_irqrestore(g_lock, flags);
+ }
+ kiblnd_handle_rx(rx);
+ return;
+
+ failed:
+ CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+ kiblnd_close_conn(conn, err);
+ ignore:
+ kiblnd_drop_rx(rx); /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+ struct page *page;
+
+ if (vaddr >= VMALLOC_START &&
+ vaddr < VMALLOC_END) {
+ page = vmalloc_to_page ((void *)vaddr);
+ LASSERT (page != NULL);
+ return page;
+ }
+#ifdef CONFIG_HIGHMEM
+ if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+ /* No highmem pages only used for bulk (kiov) I/O */
+ CERROR("find page for address in highmem\n");
+ LBUG();
+ }
+#endif
+ page = virt_to_page (vaddr);
+ LASSERT (page != NULL);
+ return page;
+}
+
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+ kib_hca_dev_t *hdev;
+ __u64 *pages = tx->tx_pages;
+ kib_fmr_poolset_t *fps;
+ int npages;
+ int size;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ hdev = tx->tx_pool->tpo_hdev;
+
+ for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+ for (size = 0; size < rd->rd_frags[i].rf_nob;
+ size += hdev->ibh_page_size) {
+ pages[npages ++] = (rd->rd_frags[i].rf_addr &
+ hdev->ibh_page_mask) + size;
+ }
+ }
+
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ fps = net->ibn_fmr_ps[cpt];
+ rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+ if (rc != 0) {
+ CERROR ("Can't map %d pages: %d\n", npages, rc);
+ return rc;
+ }
+
+ /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+ * the rkey */
+ rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+ tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+ rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+ rd->rd_frags[0].rf_nob = nob;
+ rd->rd_nfrags = 1;
+
+ return 0;
+}
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+ kib_hca_dev_t *hdev;
+ kib_pmr_poolset_t *pps;
+ __u64 iova;
+ int cpt;
+ int rc;
+
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ hdev = tx->tx_pool->tpo_hdev;
+
+ iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ pps = net->ibn_pmr_ps[cpt];
+ rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+ if (rc != 0) {
+ CERROR("Failed to create MR by phybuf: %d\n", rc);
+ return rc;
+ }
+
+ /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+ * the rkey */
+ rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+ tx->tx_u.pmr->pmr_mr->lkey;
+ rd->rd_nfrags = 1;
+ rd->rd_frags[0].rf_addr = iova;
+ rd->rd_frags[0].rf_nob = nob;
+
+ return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+ kib_net_t *net = ni->ni_data;
+
+ LASSERT(net != NULL);
+
+ if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+ kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+ tx->tx_u.fmr.fmr_pfmr = NULL;
+
+ } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+ kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+ tx->tx_u.pmr = NULL;
+ }
+
+ if (tx->tx_nfrags != 0) {
+ kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+ tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+ tx->tx_nfrags = 0;
+ }
+}
+
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, int nfrags)
+{
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ kib_net_t *net = ni->ni_data;
+ struct ib_mr *mr = NULL;
+ __u32 nob;
+ int i;
+
+ /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+ * RDMA sink */
+ tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ tx->tx_nfrags = nfrags;
+
+ rd->rd_nfrags =
+ kiblnd_dma_map_sg(hdev->ibh_ibdev,
+ tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+
+ for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+ rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
+ hdev->ibh_ibdev, &tx->tx_frags[i]);
+ rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+ hdev->ibh_ibdev, &tx->tx_frags[i]);
+ nob += rd->rd_frags[i].rf_nob;
+ }
+
+ /* looking for pre-mapping MR */
+ mr = kiblnd_find_rd_dma_mr(hdev, rd);
+ if (mr != NULL) {
+ /* found pre-mapping MR */
+ rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+ return 0;
+ }
+
+ if (net->ibn_fmr_ps != NULL)
+ return kiblnd_fmr_map_tx(net, tx, rd, nob);
+ else if (net->ibn_pmr_ps != NULL)
+ return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+ return -EINVAL;
+}
+
+
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+ unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+ kib_net_t *net = ni->ni_data;
+ struct page *page;
+ struct scatterlist *sg;
+ unsigned long vaddr;
+ int fragnob;
+ int page_offset;
+
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
+ LASSERT (net != NULL);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
+
+ sg = tx->tx_frags;
+ do {
+ LASSERT (niov > 0);
+
+ vaddr = ((unsigned long)iov->iov_base) + offset;
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ page = kiblnd_kvaddr_to_page(vaddr);
+ if (page == NULL) {
+ CERROR ("Can't find page\n");
+ return -EFAULT;
+ }
+
+ fragnob = min((int)(iov->iov_len - offset), nob);
+ fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+ sg_set_page(sg, page, fragnob, page_offset);
+ sg++;
+
+ if (offset + fragnob < iov->iov_len) {
+ offset += fragnob;
+ } else {
+ offset = 0;
+ iov++;
+ niov--;
+ }
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+ kib_net_t *net = ni->ni_data;
+ struct scatterlist *sg;
+ int fragnob;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT (net != NULL);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
+ }
+
+ sg = tx->tx_frags;
+ do {
+ LASSERT (nkiov > 0);
+
+ fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+ sg_set_page(sg, kiov->kiov_page, fragnob,
+ kiov->kiov_offset + offset);
+ sg++;
+
+ offset = 0;
+ kiov++;
+ nkiov--;
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+ kib_msg_t *msg = tx->tx_msg;
+ kib_peer_t *peer = conn->ibc_peer;
+ int ver = conn->ibc_version;
+ int rc;
+ int done;
+ struct ib_send_wr *bad_wrq;
+
+ LASSERT (tx->tx_queued);
+ /* We rely on this for QP sizing */
+ LASSERT (tx->tx_nwrq > 0);
+ LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+ LASSERT (credit == 0 || credit == 1);
+ LASSERT (conn->ibc_outstanding_credits >= 0);
+ LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+ LASSERT (conn->ibc_credits >= 0);
+ LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+ if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+ /* tx completions outstanding... */
+ CDEBUG(D_NET, "%s: posted enough\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ if (credit != 0 && conn->ibc_credits == 0) { /* no credits */
+ CDEBUG(D_NET, "%s: no credits\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+ conn->ibc_credits == 1 && /* last credit reserved */
+ msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */
+ CDEBUG(D_NET, "%s: not using last credit\n",
+ libcfs_nid2str(peer->ibp_nid));
+ return -EAGAIN;
+ }
+
+ /* NB don't drop ibc_lock before bumping tx_sending */
+ list_del(&tx->tx_list);
+ tx->tx_queued = 0;
+
+ if (msg->ibm_type == IBLND_MSG_NOOP &&
+ (!kiblnd_need_noop(conn) || /* redundant NOOP */
+ (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+ conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+ /* OK to drop when posted enough NOOPs, since
+ * kiblnd_check_sends will queue NOOP again when
+ * posted NOOPs complete */
+ spin_unlock(&conn->ibc_lock);
+ kiblnd_tx_done(peer->ibp_ni, tx);
+ spin_lock(&conn->ibc_lock);
+ CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_noops_posted);
+ return 0;
+ }
+
+ kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+ peer->ibp_nid, conn->ibc_incarnation);
+
+ conn->ibc_credits -= credit;
+ conn->ibc_outstanding_credits = 0;
+ conn->ibc_nsends_posted++;
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted++;
+
+ /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
+ * PUT. If so, it was first queued here as a PUT_REQ, sent and
+ * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+ * and then re-queued here. It's (just) possible that
+ * tx_sending is non-zero if we've not done the tx_complete()
+ * from the first send; hence the ++ rather than = below. */
+ tx->tx_sending++;
+ list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+ /* I'm still holding ibc_lock! */
+ if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+ rc = -ECONNABORTED;
+ } else if (tx->tx_pool->tpo_pool.po_failed ||
+ conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+ /* close_conn will launch failover */
+ rc = -ENETDOWN;
+ } else {
+ rc = ib_post_send(conn->ibc_cmid->qp,
+ tx->tx_wrq, &bad_wrq);
+ }
+
+ conn->ibc_last_send = jiffies;
+
+ if (rc == 0)
+ return 0;
+
+ /* NB credits are transferred in the actual
+ * message, which can only be the last work item */
+ conn->ibc_credits += credit;
+ conn->ibc_outstanding_credits += msg->ibm_credits;
+ conn->ibc_nsends_posted--;
+ if (msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted--;
+
+ tx->tx_status = rc;
+ tx->tx_waiting = 0;
+ tx->tx_sending--;
+
+ done = (tx->tx_sending == 0);
+ if (done)
+ list_del(&tx->tx_list);
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+ CERROR("Error %d posting transmit to %s\n",
+ rc, libcfs_nid2str(peer->ibp_nid));
+ else
+ CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+ rc, libcfs_nid2str(peer->ibp_nid));
+
+ kiblnd_close_conn(conn, rc);
+
+ if (done)
+ kiblnd_tx_done(peer->ibp_ni, tx);
+
+ spin_lock(&conn->ibc_lock);
+
+ return -EIO;
+}
+
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+ int ver = conn->ibc_version;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_tx_t *tx;
+
+ /* Don't send anything until after the connection is established */
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ CDEBUG(D_NET, "%s too soon\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ spin_lock(&conn->ibc_lock);
+
+ LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+ LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+ conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+ LASSERT (conn->ibc_reserved_credits >= 0);
+
+ while (conn->ibc_reserved_credits > 0 &&
+ !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+ tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+ kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+ conn->ibc_reserved_credits--;
+ }
+
+ if (kiblnd_need_noop(conn)) {
+ spin_unlock(&conn->ibc_lock);
+
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+ if (tx != NULL)
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+ spin_lock(&conn->ibc_lock);
+ if (tx != NULL)
+ kiblnd_queue_tx_locked(tx, conn);
+ }
+
+ kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+ for (;;) {
+ int credit;
+
+ if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+ credit = 0;
+ tx = list_entry(conn->ibc_tx_queue_nocred.next,
+ kib_tx_t, tx_list);
+ } else if (!list_empty(&conn->ibc_tx_noops)) {
+ LASSERT (!IBLND_OOB_CAPABLE(ver));
+ credit = 1;
+ tx = list_entry(conn->ibc_tx_noops.next,
+ kib_tx_t, tx_list);
+ } else if (!list_empty(&conn->ibc_tx_queue)) {
+ credit = 1;
+ tx = list_entry(conn->ibc_tx_queue.next,
+ kib_tx_t, tx_list);
+ } else
+ break;
+
+ if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+ break;
+ }
+
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+ int failed = (status != IB_WC_SUCCESS);
+ kib_conn_t *conn = tx->tx_conn;
+ int idle;
+
+ LASSERT (tx->tx_sending > 0);
+
+ if (failed) {
+ if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+ CNETERR("Tx -> %s cookie "LPX64
+ " sending %d waiting %d: failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+ status);
+
+ kiblnd_close_conn(conn, -EIO);
+ } else {
+ kiblnd_peer_alive(conn->ibc_peer);
+ }
+
+ spin_lock(&conn->ibc_lock);
+
+ /* I could be racing with rdma completion. Whoever makes 'tx' idle
+ * gets to free it, which also drops its ref on 'conn'. */
+
+ tx->tx_sending--;
+ conn->ibc_nsends_posted--;
+ if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+ conn->ibc_noops_posted--;
+
+ if (failed) {
+ tx->tx_waiting = 0; /* don't wait for peer */
+ tx->tx_status = -EIO;
+ }
+
+ idle = (tx->tx_sending == 0) && /* This is the final callback */
+ !tx->tx_waiting && /* Not waiting for peer */
+ !tx->tx_queued; /* Not re-queued (PUT_DONE) */
+ if (idle)
+ list_del(&tx->tx_list);
+
+ kiblnd_conn_addref(conn); /* 1 ref for me.... */
+
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+ kiblnd_check_sends(conn);
+
+ kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq];
+ struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+ struct ib_mr *mr;
+
+ LASSERT (tx->tx_nwrq >= 0);
+ LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+ LASSERT (nob <= IBLND_MSG_SIZE);
+
+ kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+ mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+ LASSERT (mr != NULL);
+
+ sge->lkey = mr->lkey;
+ sge->addr = tx->tx_msgaddr;
+ sge->length = nob;
+
+ memset(wrq, 0, sizeof(*wrq));
+
+ wrq->next = NULL;
+ wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+ wrq->sg_list = sge;
+ wrq->num_sge = 1;
+ wrq->opcode = IB_WR_SEND;
+ wrq->send_flags = IB_SEND_SIGNALED;
+
+ tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+ int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+ kib_msg_t *ibmsg = tx->tx_msg;
+ kib_rdma_desc_t *srcrd = tx->tx_rd;
+ struct ib_sge *sge = &tx->tx_sge[0];
+ struct ib_send_wr *wrq = &tx->tx_wrq[0];
+ int rc = resid;
+ int srcidx;
+ int dstidx;
+ int wrknob;
+
+ LASSERT (!in_interrupt());
+ LASSERT (tx->tx_nwrq == 0);
+ LASSERT (type == IBLND_MSG_GET_DONE ||
+ type == IBLND_MSG_PUT_DONE);
+
+ srcidx = dstidx = 0;
+
+ while (resid > 0) {
+ if (srcidx >= srcrd->rd_nfrags) {
+ CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (dstidx == dstrd->rd_nfrags) {
+ CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+ CERROR("RDMA too fragmented for %s (%d): "
+ "%d/%d src %d/%d dst frags\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ IBLND_RDMA_FRAGS(conn->ibc_version),
+ srcidx, srcrd->rd_nfrags,
+ dstidx, dstrd->rd_nfrags);
+ rc = -EMSGSIZE;
+ break;
+ }
+
+ wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+ kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+ sge = &tx->tx_sge[tx->tx_nwrq];
+ sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx);
+ sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx);
+ sge->length = wrknob;
+
+ wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+ wrq->next = wrq + 1;
+ wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+ wrq->sg_list = sge;
+ wrq->num_sge = 1;
+ wrq->opcode = IB_WR_RDMA_WRITE;
+ wrq->send_flags = 0;
+
+ wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+ wrq->wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, dstidx);
+
+ srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+ dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+ resid -= wrknob;
+
+ tx->tx_nwrq++;
+ wrq++;
+ sge++;
+ }
+
+ if (rc < 0) /* no RDMA if completing with failure */
+ tx->tx_nwrq = 0;
+
+ ibmsg->ibm_u.completion.ibcm_status = rc;
+ ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+ kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+ type, sizeof (kib_completion_msg_t));
+
+ return rc;
+}
+
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+ struct list_head *q;
+
+ LASSERT (tx->tx_nwrq > 0); /* work items set up */
+ LASSERT (!tx->tx_queued); /* not queued for sending already */
+ LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ tx->tx_queued = 1;
+ tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+ if (tx->tx_conn == NULL) {
+ kiblnd_conn_addref(conn);
+ tx->tx_conn = conn;
+ LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+ } else {
+ /* PUT_DONE first attached to conn as a PUT_REQ */
+ LASSERT (tx->tx_conn == conn);
+ LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+ }
+
+ switch (tx->tx_msg->ibm_type) {
+ default:
+ LBUG();
+
+ case IBLND_MSG_PUT_REQ:
+ case IBLND_MSG_GET_REQ:
+ q = &conn->ibc_tx_queue_rsrvd;
+ break;
+
+ case IBLND_MSG_PUT_NAK:
+ case IBLND_MSG_PUT_ACK:
+ case IBLND_MSG_PUT_DONE:
+ case IBLND_MSG_GET_DONE:
+ q = &conn->ibc_tx_queue_nocred;
+ break;
+
+ case IBLND_MSG_NOOP:
+ if (IBLND_OOB_CAPABLE(conn->ibc_version))
+ q = &conn->ibc_tx_queue_nocred;
+ else
+ q = &conn->ibc_tx_noops;
+ break;
+
+ case IBLND_MSG_IMMEDIATE:
+ q = &conn->ibc_tx_queue;
+ break;
+ }
+
+ list_add_tail(&tx->tx_list, q);
+}
+
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+ spin_lock(&conn->ibc_lock);
+ kiblnd_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+ struct sockaddr_in *srcaddr,
+ struct sockaddr_in *dstaddr,
+ int timeout_ms)
+{
+ unsigned short port;
+ int rc;
+
+ /* allow the port to be reused */
+ rc = rdma_set_reuseaddr(cmid, 1);
+ if (rc != 0) {
+ CERROR("Unable to set reuse on cmid: %d\n", rc);
+ return rc;
+ }
+
+ /* look for a free privileged port */
+ for (port = PROT_SOCK-1; port > 0; port--) {
+ srcaddr->sin_port = htons(port);
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)srcaddr,
+ (struct sockaddr *)dstaddr,
+ timeout_ms);
+ if (rc == 0) {
+ CDEBUG(D_NET, "bound to port %hu\n", port);
+ return 0;
+ } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+ CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+ port, rc);
+ } else {
+ return rc;
+ }
+ }
+
+ CERROR("Failed to bind to a free privileged port\n");
+ return rc;
+}
+
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+ struct rdma_cm_id *cmid;
+ kib_dev_t *dev;
+ kib_net_t *net = peer->ibp_ni->ni_data;
+ struct sockaddr_in srcaddr;
+ struct sockaddr_in dstaddr;
+ int rc;
+
+ LASSERT (net != NULL);
+ LASSERT (peer->ibp_connecting > 0);
+
+ cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+ IB_QPT_RC);
+
+ if (IS_ERR(cmid)) {
+ CERROR("Can't create CMID for %s: %ld\n",
+ libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+ rc = PTR_ERR(cmid);
+ goto failed;
+ }
+
+ dev = net->ibn_dev;
+ memset(&srcaddr, 0, sizeof(srcaddr));
+ srcaddr.sin_family = AF_INET;
+ srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+ memset(&dstaddr, 0, sizeof(dstaddr));
+ dstaddr.sin_family = AF_INET;
+ dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+ dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+ kiblnd_peer_addref(peer); /* cmid's ref */
+
+ if (*kiblnd_tunables.kib_use_priv_port) {
+ rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ } else {
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ }
+ if (rc != 0) {
+ /* Can't initiate address resolution: */
+ CERROR("Can't resolve addr for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ goto failed2;
+ }
+
+ LASSERT (cmid->device != NULL);
+ CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+ libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+ HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+ return;
+
+ failed2:
+ kiblnd_peer_decref(peer); /* cmid's ref */
+ rdma_destroy_id(cmid);
+ failed:
+ kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ kib_conn_t *conn;
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ unsigned long flags;
+ int rc;
+
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */
+
+ /* First time, just use a read lock since I expect to find my peer
+ * connected */
+ read_lock_irqsave(g_lock, flags);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+ /* Found a peer with an established connection */
+ conn = kiblnd_get_conn_locked(peer);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ read_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ return;
+ }
+
+ read_unlock(g_lock);
+ /* Re-try with a write lock */
+ write_lock(g_lock);
+
+ peer = kiblnd_find_peer_locked(nid);
+ if (peer != NULL) {
+ if (list_empty(&peer->ibp_conns)) {
+ /* found a peer, but it's still connecting... */
+ LASSERT (peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0);
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list,
+ &peer->ibp_tx_queue);
+ write_unlock_irqrestore(g_lock, flags);
+ } else {
+ conn = kiblnd_get_conn_locked(peer);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ }
+ return;
+ }
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ /* Allocate a peer ready to add to the peer table and retry */
+ rc = kiblnd_create_peer(ni, &peer, nid);
+ if (rc != 0) {
+ CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+ if (tx != NULL) {
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kiblnd_tx_done(ni, tx);
+ }
+ return;
+ }
+
+ write_lock_irqsave(g_lock, flags);
+
+ peer2 = kiblnd_find_peer_locked(nid);
+ if (peer2 != NULL) {
+ if (list_empty(&peer2->ibp_conns)) {
+ /* found a peer, but it's still connecting... */
+ LASSERT (peer2->ibp_connecting != 0 ||
+ peer2->ibp_accepting != 0);
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list,
+ &peer2->ibp_tx_queue);
+ write_unlock_irqrestore(g_lock, flags);
+ } else {
+ conn = kiblnd_get_conn_locked(peer2);
+ kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ if (tx != NULL)
+ kiblnd_queue_tx(tx, conn);
+ kiblnd_conn_decref(conn); /* ...to here */
+ }
+
+ kiblnd_peer_decref(peer);
+ return;
+ }
+
+ /* Brand new peer */
+ LASSERT (peer->ibp_connecting == 0);
+ peer->ibp_connecting = 1;
+
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+ if (tx != NULL)
+ list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+ kiblnd_peer_addref(peer);
+ list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+ write_unlock_irqrestore(g_lock, flags);
+
+ kiblnd_connect_peer(peer);
+ kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ int target_is_router = lntmsg->msg_target_is_router;
+ int routing = lntmsg->msg_routing;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct iovec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ kib_msg_t *ibmsg;
+ kib_tx_t *tx;
+ int nob;
+ int rc;
+
+ /* NB 'private' is different depending on what we're sending.... */
+
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
+
+ /* Thread context */
+ LASSERT (!in_interrupt());
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+ switch (type) {
+ default:
+ LBUG();
+ return (-EIO);
+
+ case LNET_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
+
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+ /* is the REPLY message too small for RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= IBLND_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kiblnd_setup_rd_iov(ni, tx,
+ &ibmsg->ibm_u.get.ibgm_rd,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx,
+ &ibmsg->ibm_u.get.ibgm_rd,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* Is the payload small enough not to need RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+ if (nob <= IBLND_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate %s txd for %s\n",
+ type == LNET_MSG_PUT ? "PUT" : "REPLY",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ if (payload_kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup PUT src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kiblnd_tx_done(ni, tx);
+ return -EIO;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+ ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+ }
+
+ /* send IMMEDIATE */
+
+ LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+ <= IBLND_MSG_SIZE);
+
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR ("Can't send %d to %s: tx descs exhausted\n",
+ type, libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+ if (payload_kiov != NULL)
+ lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ else
+ lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+
+ nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ kiblnd_launch_tx(ni, tx, target.nid);
+ return 0;
+}
+
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+ lnet_process_id_t target = lntmsg->msg_target;
+ unsigned int niov = lntmsg->msg_niov;
+ struct iovec *iov = lntmsg->msg_iov;
+ lnet_kiov_t *kiov = lntmsg->msg_kiov;
+ unsigned int offset = lntmsg->msg_offset;
+ unsigned int nob = lntmsg->msg_len;
+ kib_tx_t *tx;
+ int rc;
+
+ tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+ if (tx == NULL) {
+ CERROR("Can't get tx for REPLY to %s\n",
+ libcfs_nid2str(target.nid));
+ goto failed_0;
+ }
+
+ if (nob == 0)
+ rc = 0;
+ else if (kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+ niov, iov, offset, nob);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+ niov, kiov, offset, nob);
+
+ if (rc != 0) {
+ CERROR("Can't setup GET src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
+ rc = kiblnd_init_rdma(rx->rx_conn, tx,
+ IBLND_MSG_GET_DONE, nob,
+ &rx->rx_msg->ibm_u.get.ibgm_rd,
+ rx->rx_msg->ibm_u.get.ibgm_cookie);
+ if (rc < 0) {
+ CERROR("Can't setup rdma for GET from %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ goto failed_1;
+ }
+
+ if (nob == 0) {
+ /* No RDMA: local completion may happen now! */
+ lnet_finalize(ni, lntmsg, 0);
+ } else {
+ /* RDMA: lnet_finalize(lntmsg) when it
+ * completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
+
+ kiblnd_queue_tx(tx, rx->rx_conn);
+ return;
+
+ failed_1:
+ kiblnd_tx_done(ni, tx);
+ failed_0:
+ lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
+ int post_credit = IBLND_POSTRX_PEER_CREDIT;
+ int rc = 0;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
+
+ switch (rxmsg->ibm_type) {
+ default:
+ LBUG();
+
+ case IBLND_MSG_IMMEDIATE:
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (nob > rx->rx_nob) {
+ CERROR ("Immediate message from %s too big: %d(%d)\n",
+ libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+ nob, rx->rx_nob);
+ rc = -EPROTO;
+ break;
+ }
+
+ if (kiov != NULL)
+ lnet_copy_flat2kiov(niov, kiov, offset,
+ IBLND_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ else
+ lnet_copy_flat2iov(niov, iov, offset,
+ IBLND_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ lnet_finalize (ni, lntmsg, 0);
+ break;
+
+ case IBLND_MSG_PUT_REQ:
+ if (mlen == 0) {
+ lnet_finalize(ni, lntmsg, 0);
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate tx for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* Not replying will break the connection */
+ rc = -ENOMEM;
+ break;
+ }
+
+ txmsg = tx->tx_msg;
+ if (kiov == NULL)
+ rc = kiblnd_setup_rd_iov(ni, tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ niov, iov, offset, mlen);
+ else
+ rc = kiblnd_setup_rd_kiov(ni, tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ niov, kiov, offset, mlen);
+ if (rc != 0) {
+ CERROR("Can't setup PUT sink for %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kiblnd_tx_done(ni, tx);
+ /* tell peer it's over */
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+ txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+ kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_DONE */
+ kiblnd_queue_tx(tx, conn);
+
+ /* reposted buffer reserved for PUT_DONE */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ break;
+
+ case IBLND_MSG_GET_REQ:
+ if (lntmsg != NULL) {
+ /* Optimized GET; RDMA lntmsg's payload */
+ kiblnd_reply(ni, rx, lntmsg);
+ } else {
+ /* GET didn't match anything */
+ kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+ -ENODATA,
+ rxmsg->ibm_u.get.ibgm_cookie);
+ }
+ break;
+ }
+
+ kiblnd_post_rx(rx, post_credit);
+ return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+ task_t *task = kthread_run(fn, arg, name);
+
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ atomic_inc(&kiblnd_data.kib_nthreads);
+ return 0;
+}
+
+void
+kiblnd_thread_fini (void)
+{
+ atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+ /* This is racy, but everyone's only writing cfs_time_current() */
+ peer->ibp_last_alive = cfs_time_current();
+ mb();
+}
+
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+ int error = 0;
+ cfs_time_t last_alive = 0;
+ unsigned long flags;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (list_empty(&peer->ibp_conns) &&
+ peer->ibp_accepting == 0 &&
+ peer->ibp_connecting == 0 &&
+ peer->ibp_error != 0) {
+ error = peer->ibp_error;
+ peer->ibp_error = 0;
+
+ last_alive = peer->ibp_last_alive;
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (error != 0)
+ lnet_notify(peer->ibp_ni,
+ peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping. 'error' is zero for a
+ * normal shutdown which can happen only after the connection has been
+ * established. If the connection is established, schedule the
+ * connection to be finished off by the connd. Otherwise the connd is
+ * already dealing with it (either to set it up or tear it down).
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_dev_t *dev;
+ unsigned long flags;
+
+ LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ if (error != 0 && conn->ibc_comms_error == 0)
+ conn->ibc_comms_error = error;
+
+ if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+ return; /* already being handled */
+
+ if (error == 0 &&
+ list_empty(&conn->ibc_tx_noops) &&
+ list_empty(&conn->ibc_tx_queue) &&
+ list_empty(&conn->ibc_tx_queue_rsrvd) &&
+ list_empty(&conn->ibc_tx_queue_nocred) &&
+ list_empty(&conn->ibc_active_txs)) {
+ CDEBUG(D_NET, "closing conn to %s\n",
+ libcfs_nid2str(peer->ibp_nid));
+ } else {
+ CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+ libcfs_nid2str(peer->ibp_nid), error,
+ list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+ list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+ list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+ list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+ }
+
+ dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+ list_del(&conn->ibc_list);
+ /* connd (see below) takes over ibc_list's ref */
+
+ if (list_empty (&peer->ibp_conns) && /* no more conns */
+ kiblnd_peer_active(peer)) { /* still in peer table */
+ kiblnd_unlink_peer_locked(peer);
+
+ /* set/clear error on last conn */
+ peer->ibp_error = conn->ibc_comms_error;
+ }
+
+ kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+ if (error != 0 &&
+ kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ wake_up(&kiblnd_data.kib_failover_waitq);
+ }
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+ list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+ wake_up(&kiblnd_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_close_conn_locked(conn, error);
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_rx_t *rx;
+
+ LASSERT(!in_interrupt());
+ LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ while (!list_empty(&conn->ibc_early_rxs)) {
+ rx = list_entry(conn->ibc_early_rxs.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_handle_rx(rx);
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ }
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+
+ spin_lock(&conn->ibc_lock);
+
+ list_for_each_safe (tmp, nxt, txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ if (txs == &conn->ibc_active_txs) {
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting ||
+ tx->tx_sending != 0);
+ } else {
+ LASSERT (tx->tx_queued);
+ }
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_waiting = 0;
+
+ if (tx->tx_sending == 0) {
+ tx->tx_queued = 0;
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+ }
+
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+ LASSERT (!in_interrupt());
+ LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+ kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+ /* abort_receives moves QP state to IB_QPS_ERR. This is only required
+ * for connections that didn't get as far as being connected, because
+ * rdma_disconnect() does this for free. */
+ kiblnd_abort_receives(conn);
+
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+ kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+ kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+ kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+ LIST_HEAD (zombies);
+ unsigned long flags;
+
+ LASSERT (error != 0);
+ LASSERT (!in_interrupt());
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ if (active) {
+ LASSERT (peer->ibp_connecting > 0);
+ peer->ibp_connecting--;
+ } else {
+ LASSERT (peer->ibp_accepting > 0);
+ peer->ibp_accepting--;
+ }
+
+ if (peer->ibp_connecting != 0 ||
+ peer->ibp_accepting != 0) {
+ /* another connection attempt under way... */
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+ flags);
+ return;
+ }
+
+ if (list_empty(&peer->ibp_conns)) {
+ /* Take peer's blocked transmits to complete with error */
+ list_add(&zombies, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
+
+ if (kiblnd_peer_active(peer))
+ kiblnd_unlink_peer_locked(peer);
+
+ peer->ibp_error = error;
+ } else {
+ /* Can't have blocked transmits if there are connections */
+ LASSERT (list_empty(&peer->ibp_tx_queue));
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_peer_notify(peer);
+
+ if (list_empty (&zombies))
+ return;
+
+ CNETERR("Deleting messages for %s: connection failed\n",
+ libcfs_nid2str(peer->ibp_nid));
+
+ kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ kib_tx_t *tx;
+ struct list_head txs;
+ unsigned long flags;
+ int active;
+
+ active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+ CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n",
+ libcfs_nid2str(peer->ibp_nid), active,
+ conn->ibc_version, status);
+
+ LASSERT (!in_interrupt());
+ LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+ peer->ibp_connecting > 0) ||
+ (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+ peer->ibp_accepting > 0));
+
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ conn->ibc_connvars = NULL;
+
+ if (status != 0) {
+ /* failed to establish connection */
+ kiblnd_peer_connect_failed(peer, active, status);
+ kiblnd_finalise_conn(conn);
+ return;
+ }
+
+ /* connection established */
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ conn->ibc_last_send = jiffies;
+ kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+ kiblnd_peer_alive(peer);
+
+ /* Add conn to peer's list and nuke any dangling conns from a different
+ * peer instance... */
+ kiblnd_conn_addref(conn); /* +1 ref for ibc_list */
+ list_add(&conn->ibc_list, &peer->ibp_conns);
+ if (active)
+ peer->ibp_connecting--;
+ else
+ peer->ibp_accepting--;
+
+ if (peer->ibp_version == 0) {
+ peer->ibp_version = conn->ibc_version;
+ peer->ibp_incarnation = conn->ibc_incarnation;
+ }
+
+ if (peer->ibp_version != conn->ibc_version ||
+ peer->ibp_incarnation != conn->ibc_incarnation) {
+ kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+ conn->ibc_incarnation);
+ peer->ibp_version = conn->ibc_version;
+ peer->ibp_incarnation = conn->ibc_incarnation;
+ }
+
+ /* grab pending txs while I have the lock */
+ list_add(&txs, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
+
+ if (!kiblnd_peer_active(peer) || /* peer has been deleted */
+ conn->ibc_comms_error != 0) { /* error has happened already */
+ lnet_ni_t *ni = peer->ibp_ni;
+
+ /* start to shut down connection */
+ kiblnd_close_conn_locked(conn, -ECONNABORTED);
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+ return;
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* Schedule blocked txs */
+ spin_lock(&conn->ibc_lock);
+ while (!list_empty(&txs)) {
+ tx = list_entry(txs.next, kib_tx_t, tx_list);
+ list_del(&tx->tx_list);
+
+ kiblnd_queue_tx_locked(tx, conn);
+ }
+ spin_unlock(&conn->ibc_lock);
+
+ kiblnd_check_sends(conn);
+
+ /* schedule blocked rxs */
+ kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+ int rc;
+
+ rc = rdma_reject(cmid, rej, sizeof(*rej));
+
+ if (rc != 0)
+ CWARN("Error %d sending reject\n", rc);
+}
+
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+ rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
+ kib_msg_t *reqmsg = priv;
+ kib_msg_t *ackmsg;
+ kib_dev_t *ibdev;
+ kib_peer_t *peer;
+ kib_peer_t *peer2;
+ kib_conn_t *conn;
+ lnet_ni_t *ni = NULL;
+ kib_net_t *net = NULL;
+ lnet_nid_t nid;
+ struct rdma_conn_param cp;
+ kib_rej_t rej;
+ int version = IBLND_MSG_VERSION;
+ unsigned long flags;
+ int rc;
+ struct sockaddr_in *peer_addr;
+ LASSERT (!in_interrupt());
+
+ /* cmid inherits 'context' from the corresponding listener id */
+ ibdev = (kib_dev_t *)cmid->context;
+ LASSERT (ibdev != NULL);
+
+ memset(&rej, 0, sizeof(rej));
+ rej.ibr_magic = IBLND_MSG_MAGIC;
+ rej.ibr_why = IBLND_REJECT_FATAL;
+ rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+ peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+ if (*kiblnd_tunables.kib_require_priv_port &&
+ ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+ __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+ CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+ HIPQUAD(ip), ntohs(peer_addr->sin_port));
+ goto failed;
+ }
+
+ if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+ CERROR("Short connection request\n");
+ goto failed;
+ }
+
+ /* Future protocol version compatibility support! If the
+ * o2iblnd-specific protocol changes, or when LNET unifies
+ * protocols over all LNDs, the initial connection will
+ * negotiate a protocol version. I trap this here to avoid
+ * console errors; the reject tells the peer which protocol I
+ * speak. */
+ if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+ reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+ goto failed;
+ if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+ reqmsg->ibm_version != IBLND_MSG_VERSION &&
+ reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+ goto failed;
+ if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+ reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+ reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+ goto failed;
+
+ rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+ if (rc != 0) {
+ CERROR("Can't parse connection request: %d\n", rc);
+ goto failed;
+ }
+
+ nid = reqmsg->ibm_srcnid;
+ ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+ if (ni != NULL) {
+ net = (kib_net_t *)ni->ni_data;
+ rej.ibr_incarnation = net->ibn_incarnation;
+ }
+
+ if (ni == NULL || /* no matching net */
+ ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */
+ net->ibn_dev != ibdev) { /* wrong device */
+ CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+ "bad dst nid %s\n", libcfs_nid2str(nid),
+ ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+ ibdev->ibd_ifname, ibdev->ibd_nnets,
+ HIPQUAD(ibdev->ibd_ifip),
+ libcfs_nid2str(reqmsg->ibm_dstnid));
+
+ goto failed;
+ }
+
+ /* check time stamp as soon as possible */
+ if (reqmsg->ibm_dststamp != 0 &&
+ reqmsg->ibm_dststamp != net->ibn_incarnation) {
+ CWARN("Stale connection request\n");
+ rej.ibr_why = IBLND_REJECT_CONN_STALE;
+ goto failed;
+ }
+
+ /* I can accept peer's version */
+ version = reqmsg->ibm_version;
+
+ if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+ CERROR("Unexpected connreq msg type: %x from %s\n",
+ reqmsg->ibm_type, libcfs_nid2str(nid));
+ goto failed;
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+ IBLND_MSG_QUEUE_SIZE(version)) {
+ CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+ libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+ IBLND_MSG_QUEUE_SIZE(version));
+
+ if (version == IBLND_MSG_VERSION)
+ rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+ goto failed;
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+ IBLND_RDMA_FRAGS(version)) {
+ CERROR("Can't accept %s(version %x): "
+ "incompatible max_frags %d (%d wanted)\n",
+ libcfs_nid2str(nid), version,
+ reqmsg->ibm_u.connparams.ibcp_max_frags,
+ IBLND_RDMA_FRAGS(version));
+
+ if (version == IBLND_MSG_VERSION)
+ rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+ goto failed;
+
+ }
+
+ if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+ CERROR("Can't accept %s: message size %d too big (%d max)\n",
+ libcfs_nid2str(nid),
+ reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+ IBLND_MSG_SIZE);
+ goto failed;
+ }
+
+ /* assume 'nid' is a new peer; create */
+ rc = kiblnd_create_peer(ni, &peer, nid);
+ if (rc != 0) {
+ CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+ rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+ goto failed;
+ }
+
+ write_lock_irqsave(g_lock, flags);
+
+ peer2 = kiblnd_find_peer_locked(nid);
+ if (peer2 != NULL) {
+ if (peer2->ibp_version == 0) {
+ peer2->ibp_version = version;
+ peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+ }
+
+ /* not the guy I've talked with */
+ if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+ peer2->ibp_version != version) {
+ kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+ libcfs_nid2str(nid), peer2->ibp_version, version);
+
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_CONN_STALE;
+ goto failed;
+ }
+
+ /* tie-break connection race in favour of the higher NID */
+ if (peer2->ibp_connecting != 0 &&
+ nid < ni->ni_nid) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_CONN_RACE;
+ goto failed;
+ }
+
+ peer2->ibp_accepting++;
+ kiblnd_peer_addref(peer2);
+
+ write_unlock_irqrestore(g_lock, flags);
+ kiblnd_peer_decref(peer);
+ peer = peer2;
+ } else {
+ /* Brand new peer */
+ LASSERT (peer->ibp_accepting == 0);
+ LASSERT (peer->ibp_version == 0 &&
+ peer->ibp_incarnation == 0);
+
+ peer->ibp_accepting = 1;
+ peer->ibp_version = version;
+ peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+ /* I have a ref on ni that prevents it being shutdown */
+ LASSERT (net->ibn_shutdown == 0);
+
+ kiblnd_peer_addref(peer);
+ list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+ write_unlock_irqrestore(g_lock, flags);
+ }
+
+ conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+ if (conn == NULL) {
+ kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+ kiblnd_peer_decref(peer);
+ rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+ goto failed;
+ }
+
+ /* conn now "owns" cmid, so I return success from here on to ensure the
+ * CM callback doesn't destroy cmid. */
+
+ conn->ibc_incarnation = reqmsg->ibm_srcstamp;
+ conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version);
+ conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+ LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+ <= IBLND_RX_MSGS(version));
+
+ ackmsg = &conn->ibc_connvars->cv_msg;
+ memset(ackmsg, 0, sizeof(*ackmsg));
+
+ kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+ sizeof(ackmsg->ibm_u.connparams));
+ ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+ ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+
+ kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.private_data = ackmsg;
+ cp.private_data_len = ackmsg->ibm_nob;
+ cp.responder_resources = 0; /* No atomic ops or RDMA reads */
+ cp.initiator_depth = 0;
+ cp.flow_control = 1;
+ cp.retry_count = *kiblnd_tunables.kib_retry_count;
+ cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
+
+ CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+ rc = rdma_accept(cmid, &cp);
+ if (rc != 0) {
+ CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+ rej.ibr_version = version;
+ rej.ibr_why = IBLND_REJECT_FATAL;
+
+ kiblnd_reject(cmid, &rej);
+ kiblnd_connreq_done(conn, rc);
+ kiblnd_conn_decref(conn);
+ }
+
+ lnet_ni_decref(ni);
+ return 0;
+
+ failed:
+ if (ni != NULL)
+ lnet_ni_decref(ni);
+
+ rej.ibr_version = version;
+ rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+ kiblnd_reject(cmid, &rej);
+
+ return -ECONNREFUSED;
+}
+
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+ __u64 incarnation, int why, kib_connparams_t *cp)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ char *reason;
+ int retry = 0;
+ unsigned long flags;
+
+ LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+ LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */
+
+ write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ /* retry connection if it's still needed and no other connection
+ * attempts (active or passive) are in progress
+ * NB: reconnect is still needed even when ibp_tx_queue is
+ * empty if ibp_version != version because reconnect may be
+ * initiated by kiblnd_query() */
+ if ((!list_empty(&peer->ibp_tx_queue) ||
+ peer->ibp_version != version) &&
+ peer->ibp_connecting == 1 &&
+ peer->ibp_accepting == 0) {
+ retry = 1;
+ peer->ibp_connecting++;
+
+ peer->ibp_version = version;
+ peer->ibp_incarnation = incarnation;
+ }
+
+ write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (!retry)
+ return;
+
+ switch (why) {
+ default:
+ reason = "Unknown";
+ break;
+
+ case IBLND_REJECT_CONN_STALE:
+ reason = "stale";
+ break;
+
+ case IBLND_REJECT_CONN_RACE:
+ reason = "conn race";
+ break;
+
+ case IBLND_REJECT_CONN_UNCOMPAT:
+ reason = "version negotiation";
+ break;
+ }
+
+ CNETERR("%s: retrying (%s), %x, %x, "
+ "queue_dep: %d, max_frag: %d, msg_size: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ reason, IBLND_MSG_VERSION, version,
+ cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version),
+ cp != NULL? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version),
+ cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE);
+
+ kiblnd_connect_peer(peer);
+}
+
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+
+ LASSERT (!in_interrupt());
+ LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+ switch (reason) {
+ case IB_CM_REJ_STALE_CONN:
+ kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+ IBLND_REJECT_CONN_STALE, NULL);
+ break;
+
+ case IB_CM_REJ_INVALID_SERVICE_ID:
+ CNETERR("%s rejected: no listener at %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ *kiblnd_tunables.kib_service);
+ break;
+
+ case IB_CM_REJ_CONSUMER_DEFINED:
+ if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+ kib_rej_t *rej = priv;
+ kib_connparams_t *cp = NULL;
+ int flip = 0;
+ __u64 incarnation = -1;
+
+ /* NB. default incarnation is -1 because:
+ * a) V1 will ignore dst incarnation in connreq.
+ * b) V2 will provide incarnation while rejecting me,
+ * -1 will be overwrote.
+ *
+ * if I try to connect to a V1 peer with V2 protocol,
+ * it rejected me then upgrade to V2, I have no idea
+ * about the upgrading and try to reconnect with V1,
+ * in this case upgraded V2 can find out I'm trying to
+ * talk to the old guy and reject me(incarnation is -1).
+ */
+
+ if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+ rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+ __swab32s(&rej->ibr_magic);
+ __swab16s(&rej->ibr_version);
+ flip = 1;
+ }
+
+ if (priv_nob >= sizeof(kib_rej_t) &&
+ rej->ibr_version > IBLND_MSG_VERSION_1) {
+ /* priv_nob is always 148 in current version
+ * of OFED, so we still need to check version.
+ * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */
+ cp = &rej->ibr_cp;
+
+ if (flip) {
+ __swab64s(&rej->ibr_incarnation);
+ __swab16s(&cp->ibcp_queue_depth);
+ __swab16s(&cp->ibcp_max_frags);
+ __swab32s(&cp->ibcp_max_msg_size);
+ }
+
+ incarnation = rej->ibr_incarnation;
+ }
+
+ if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+ rej->ibr_magic != LNET_PROTO_MAGIC) {
+ CERROR("%s rejected: consumer defined fatal error\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+ }
+
+ if (rej->ibr_version != IBLND_MSG_VERSION &&
+ rej->ibr_version != IBLND_MSG_VERSION_1) {
+ CERROR("%s rejected: o2iblnd version %x error\n",
+ libcfs_nid2str(peer->ibp_nid),
+ rej->ibr_version);
+ break;
+ }
+
+ if (rej->ibr_why == IBLND_REJECT_FATAL &&
+ rej->ibr_version == IBLND_MSG_VERSION_1) {
+ CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+ libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+ if (conn->ibc_version != IBLND_MSG_VERSION_1)
+ rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+ }
+
+ switch (rej->ibr_why) {
+ case IBLND_REJECT_CONN_RACE:
+ case IBLND_REJECT_CONN_STALE:
+ case IBLND_REJECT_CONN_UNCOMPAT:
+ kiblnd_reconnect(conn, rej->ibr_version,
+ incarnation, rej->ibr_why, cp);
+ break;
+
+ case IBLND_REJECT_MSG_QUEUE_SIZE:
+ CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+ libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth,
+ IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+ break;
+
+ case IBLND_REJECT_RDMA_FRAGS:
+ CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+ libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags,
+ IBLND_RDMA_FRAGS(conn->ibc_version));
+ break;
+
+ case IBLND_REJECT_NO_RESOURCES:
+ CERROR("%s rejected: o2iblnd no resources\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+
+ case IBLND_REJECT_FATAL:
+ CERROR("%s rejected: o2iblnd fatal error\n",
+ libcfs_nid2str(peer->ibp_nid));
+ break;
+
+ default:
+ CERROR("%s rejected: o2iblnd reason %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ rej->ibr_why);
+ break;
+ }
+ break;
+ }
+ /* fall through */
+ default:
+ CNETERR("%s rejected: reason %d, size %d\n",
+ libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+ break;
+ }
+
+ kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ lnet_ni_t *ni = peer->ibp_ni;
+ kib_net_t *net = ni->ni_data;
+ kib_msg_t *msg = priv;
+ int ver = conn->ibc_version;
+ int rc = kiblnd_unpack_msg(msg, priv_nob);
+ unsigned long flags;
+
+ LASSERT (net != NULL);
+
+ if (rc != 0) {
+ CERROR("Can't unpack connack from %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ goto failed;
+ }
+
+ if (msg->ibm_type != IBLND_MSG_CONNACK) {
+ CERROR("Unexpected message %d from %s\n",
+ msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (ver != msg->ibm_version) {
+ CERROR("%s replied version %x is different with "
+ "requested version %x\n",
+ libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_queue_depth !=
+ IBLND_MSG_QUEUE_SIZE(ver)) {
+ CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_queue_depth,
+ IBLND_MSG_QUEUE_SIZE(ver));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_max_frags !=
+ IBLND_RDMA_FRAGS(ver)) {
+ CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_max_frags,
+ IBLND_RDMA_FRAGS(ver));
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+ CERROR("%s max message size %d too big (%d max)\n",
+ libcfs_nid2str(peer->ibp_nid),
+ msg->ibm_u.connparams.ibcp_max_msg_size,
+ IBLND_MSG_SIZE);
+ rc = -EPROTO;
+ goto failed;
+ }
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ if (msg->ibm_dstnid == ni->ni_nid &&
+ msg->ibm_dststamp == net->ibn_incarnation)
+ rc = 0;
+ else
+ rc = -ESTALE;
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ if (rc != 0) {
+ CERROR("Bad connection reply from %s, rc = %d, "
+ "version: %x max_frags: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc,
+ msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
+ goto failed;
+ }
+
+ conn->ibc_incarnation = msg->ibm_srcstamp;
+ conn->ibc_credits =
+ conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+ LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+ <= IBLND_RX_MSGS(ver));
+
+ kiblnd_connreq_done(conn, 0);
+ return;
+
+ failed:
+ /* NB My QP has already established itself, so I handle anything going
+ * wrong here by setting ibc_comms_error.
+ * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+ * immediately tears it down. */
+
+ LASSERT (rc != 0);
+ conn->ibc_comms_error = rc;
+ kiblnd_connreq_done(conn, 0);
+}
+
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+ kib_peer_t *peer = (kib_peer_t *)cmid->context;
+ kib_conn_t *conn;
+ kib_msg_t *msg;
+ struct rdma_conn_param cp;
+ int version;
+ __u64 incarnation;
+ unsigned long flags;
+ int rc;
+
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ incarnation = peer->ibp_incarnation;
+ version = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
+ peer->ibp_version;
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+ if (conn == NULL) {
+ kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+ kiblnd_peer_decref(peer); /* lose cmid's ref */
+ return -ENOMEM;
+ }
+
+ /* conn "owns" cmid now, so I return success from here on to ensure the
+ * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+ * on peer */
+
+ msg = &conn->ibc_connvars->cv_msg;
+
+ memset(msg, 0, sizeof(*msg));
+ kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+ msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+ msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
+ msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+ kiblnd_pack_msg(peer->ibp_ni, msg, version,
+ 0, peer->ibp_nid, incarnation);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.private_data = msg;
+ cp.private_data_len = msg->ibm_nob;
+ cp.responder_resources = 0; /* No atomic ops or RDMA reads */
+ cp.initiator_depth = 0;
+ cp.flow_control = 1;
+ cp.retry_count = *kiblnd_tunables.kib_retry_count;
+ cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count;
+
+ LASSERT(cmid->context == (void *)conn);
+ LASSERT(conn->ibc_cmid == cmid);
+
+ rc = rdma_connect(cmid, &cp);
+ if (rc != 0) {
+ CERROR("Can't connect to %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ kiblnd_connreq_done(conn, rc);
+ kiblnd_conn_decref(conn);
+ }
+
+ return 0;
+}
+
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ int rc;
+
+ switch (event->event) {
+ default:
+ CERROR("Unexpected event: %d, status: %d\n",
+ event->event, event->status);
+ LBUG();
+
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ /* destroy cmid on failure */
+ rc = kiblnd_passive_connect(cmid,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ CDEBUG(D_NET, "connreq: %d\n", rc);
+ return rc;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ peer = (kib_peer_t *)cmid->context;
+ CNETERR("%s: ADDR ERROR %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+ kiblnd_peer_decref(peer);
+ return -EHOSTUNREACH; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ peer = (kib_peer_t *)cmid->context;
+
+ CDEBUG(D_NET,"%s Addr resolved: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+
+ if (event->status != 0) {
+ CNETERR("Can't resolve address for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ rc = event->status;
+ } else {
+ rc = rdma_resolve_route(
+ cmid, *kiblnd_tunables.kib_timeout * 1000);
+ if (rc == 0)
+ return 0;
+ /* Can't initiate route resolution */
+ CERROR("Can't resolve route for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ }
+ kiblnd_peer_connect_failed(peer, 1, rc);
+ kiblnd_peer_decref(peer);
+ return rc; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ peer = (kib_peer_t *)cmid->context;
+ CNETERR("%s: ROUTE ERROR %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+ kiblnd_peer_decref(peer);
+ return -EHOSTUNREACH; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ peer = (kib_peer_t *)cmid->context;
+ CDEBUG(D_NET,"%s Route resolved: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+
+ if (event->status == 0)
+ return kiblnd_active_connect(cmid);
+
+ CNETERR("Can't resolve route for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), event->status);
+ kiblnd_peer_connect_failed(peer, 1, event->status);
+ kiblnd_peer_decref(peer);
+ return event->status; /* rc != 0 destroys cmid */
+
+ case RDMA_CM_EVENT_UNREACHABLE:
+ conn = (kib_conn_t *)cmid->context;
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+ conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+ CNETERR("%s: UNREACHABLE %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+ kiblnd_connreq_done(conn, -ENETDOWN);
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ conn = (kib_conn_t *)cmid->context;
+ LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+ conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+ CNETERR("%s: CONNECT ERROR %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+ kiblnd_connreq_done(conn, -ENOTCONN);
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_REJECTED:
+ conn = (kib_conn_t *)cmid->context;
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBLND_CONN_PASSIVE_WAIT:
+ CERROR ("%s: REJECTED %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ event->status);
+ kiblnd_connreq_done(conn, -ECONNRESET);
+ break;
+
+ case IBLND_CONN_ACTIVE_CONNECT:
+ kiblnd_rejected(conn, event->status,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ break;
+ }
+ kiblnd_conn_decref(conn);
+ return 0;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ conn = (kib_conn_t *)cmid->context;
+ switch (conn->ibc_state) {
+ default:
+ LBUG();
+
+ case IBLND_CONN_PASSIVE_WAIT:
+ CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_connreq_done(conn, 0);
+ break;
+
+ case IBLND_CONN_ACTIVE_CONNECT:
+ CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_check_connreply(conn,
+ (void *)KIBLND_CONN_PARAM(event),
+ KIBLND_CONN_PARAM_LEN(event));
+ break;
+ }
+ /* net keeps its ref on conn! */
+ return 0;
+
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+ return 0;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ conn = (kib_conn_t *)cmid->context;
+ if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+ CERROR("%s DISCONNECTED\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kiblnd_connreq_done(conn, -ECONNRESET);
+ } else {
+ kiblnd_close_conn(conn, 0);
+ }
+ kiblnd_conn_decref(conn);
+ cmid->context = NULL;
+ return 0;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ LCONSOLE_ERROR_MSG(0x131,
+ "Received notification of device removal\n"
+ "Please shutdown LNET to allow this to proceed\n");
+ /* Can't remove network from underneath LNET for now, so I have
+ * to ignore this */
+ return 0;
+
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+ return 0;
+ }
+}
+
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+ kib_tx_t *tx;
+ struct list_head *ttmp;
+
+ list_for_each (ttmp, txs) {
+ tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+ if (txs != &conn->ibc_active_txs) {
+ LASSERT (tx->tx_queued);
+ } else {
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+ }
+
+ if (cfs_time_aftereq (jiffies, tx->tx_deadline)) {
+ CERROR("Timed out tx: %s, %lu seconds\n",
+ kiblnd_queue2str(conn, txs),
+ cfs_duration_sec(jiffies - tx->tx_deadline));
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+ return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
+}
+
+void
+kiblnd_check_conns (int idx)
+{
+ LIST_HEAD (closes);
+ LIST_HEAD (checksends);
+ struct list_head *peers = &kiblnd_data.kib_peers[idx];
+ struct list_head *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ struct list_head *ctmp;
+ unsigned long flags;
+
+ /* NB. We expect to have a look at all the peers and not find any
+ * RDMAs to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+ list_for_each (ptmp, peers) {
+ peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+ list_for_each (ctmp, &peer->ibp_conns) {
+ int timedout;
+ int sendnoop;
+
+ conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+ LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+ spin_lock(&conn->ibc_lock);
+
+ sendnoop = kiblnd_need_noop(conn);
+ timedout = kiblnd_conn_timed_out_locked(conn);
+ if (!sendnoop && !timedout) {
+ spin_unlock(&conn->ibc_lock);
+ continue;
+ }
+
+ if (timedout) {
+ CERROR("Timed out RDMA with %s (%lu): "
+ "c: %u, oc: %u, rc: %u\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cfs_duration_sec(cfs_time_current() -
+ peer->ibp_last_alive),
+ conn->ibc_credits,
+ conn->ibc_outstanding_credits,
+ conn->ibc_reserved_credits);
+ list_add(&conn->ibc_connd_list, &closes);
+ } else {
+ list_add(&conn->ibc_connd_list,
+ &checksends);
+ }
+ /* +ref for 'closes' or 'checksends' */
+ kiblnd_conn_addref(conn);
+
+ spin_unlock(&conn->ibc_lock);
+ }
+ }
+
+ read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* Handle timeout by closing the whole
+ * connection. We can only be sure RDMA activity
+ * has ceased once the QP has been modified. */
+ while (!list_empty(&closes)) {
+ conn = list_entry(closes.next,
+ kib_conn_t, ibc_connd_list);
+ list_del(&conn->ibc_connd_list);
+ kiblnd_close_conn(conn, -ETIMEDOUT);
+ kiblnd_conn_decref(conn);
+ }
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ while (!list_empty(&checksends)) {
+ conn = list_entry(checksends.next,
+ kib_conn_t, ibc_connd_list);
+ list_del(&conn->ibc_connd_list);
+ kiblnd_check_sends(conn);
+ kiblnd_conn_decref(conn);
+ }
+}
+
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+ LASSERT (!in_interrupt());
+ LASSERT (current == kiblnd_data.kib_connd);
+ LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+ rdma_disconnect(conn->ibc_cmid);
+ kiblnd_finalise_conn(conn);
+
+ kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int
+kiblnd_connd (void *arg)
+{
+ wait_queue_t wait;
+ unsigned long flags;
+ kib_conn_t *conn;
+ int timeout;
+ int i;
+ int dropped_lock;
+ int peer_index = 0;
+ unsigned long deadline = jiffies;
+
+ cfs_block_allsigs ();
+
+ init_waitqueue_entry_current (&wait);
+ kiblnd_data.kib_connd = current;
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+
+ dropped_lock = 0;
+
+ if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+ conn = list_entry(kiblnd_data. \
+ kib_connd_zombies.next,
+ kib_conn_t, ibc_list);
+ list_del(&conn->ibc_list);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+ flags);
+ dropped_lock = 1;
+
+ kiblnd_destroy_conn(conn);
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+ conn = list_entry(kiblnd_data.kib_connd_conns.next,
+ kib_conn_t, ibc_list);
+ list_del(&conn->ibc_list);
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+ flags);
+ dropped_lock = 1;
+
+ kiblnd_disconnect_conn(conn);
+ kiblnd_conn_decref(conn);
+
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ /* careful with the jiffy wrap... */
+ timeout = (int)(deadline - jiffies);
+ if (timeout <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = kiblnd_data.kib_peer_hash_size;
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+ dropped_lock = 1;
+
+ /* Time to check for RDMA timeouts on a few more
+ * peers: I do checks every 'p' seconds on a
+ * proportion of the peer table and I need to check
+ * every connection 'n' times within a timeout
+ * interval, to ensure I detect a timeout on any
+ * connection within (n+1)/n times the timeout
+ * interval. */
+
+ if (*kiblnd_tunables.kib_timeout > n * p)
+ chunk = (chunk * n * p) /
+ *kiblnd_tunables.kib_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ kiblnd_check_conns(peer_index);
+ peer_index = (peer_index + 1) %
+ kiblnd_data.kib_peer_hash_size;
+ }
+
+ deadline += p * HZ;
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ if (dropped_lock)
+ continue;
+
+ /* Nothing to do for 'timeout' */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+ waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+ spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
+
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ CDEBUG(D_NET, "%s established\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+
+ default:
+ CERROR("%s: Async QP event type %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+ return;
+ }
+}
+
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+ switch (kiblnd_wreqid2type(wc->wr_id)) {
+ default:
+ LBUG();
+
+ case IBLND_WID_RDMA:
+ /* We only get RDMA completion notification if it fails. All
+ * subsequent work items, including the final SEND will fail
+ * too. However we can't print out any more info about the
+ * failing RDMA because 'tx' might be back on the idle list or
+ * even reused already if we didn't manage to post all our work
+ * items */
+ CNETERR("RDMA (tx: %p) failed: %d\n",
+ kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+ return;
+
+ case IBLND_WID_TX:
+ kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+ return;
+
+ case IBLND_WID_RX:
+ kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+ wc->byte_len);
+ return;
+ }
+}
+
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+ /* NB I'm not allowed to schedule this conn once its refcount has
+ * reached 0. Since fundamentally I'm racing with scheduler threads
+ * consuming my CQ I could be called after all completions have
+ * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+ * and this CQ is about to be destroyed so I NOOP. */
+ kib_conn_t *conn = (kib_conn_t *)arg;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
+
+ LASSERT(cq == conn->ibc_cq);
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ conn->ibc_ready = 1;
+
+ if (!conn->ibc_scheduled &&
+ (conn->ibc_nrx > 0 ||
+ conn->ibc_nsends_posted > 0)) {
+ kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+ conn->ibc_scheduled = 1;
+ list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+ if (waitqueue_active(&sched->ibs_waitq))
+ wake_up(&sched->ibs_waitq);
+ }
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+ kib_conn_t *conn = arg;
+
+ CERROR("%s: async CQ event type %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int
+kiblnd_scheduler(void *arg)
+{
+ long id = (long)arg;
+ struct kib_sched_info *sched;
+ kib_conn_t *conn;
+ wait_queue_t wait;
+ unsigned long flags;
+ struct ib_wc wc;
+ int did_something;
+ int busy_loops = 0;
+ int rc;
+
+ cfs_block_allsigs();
+
+ init_waitqueue_entry_current(&wait);
+
+ sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+ rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+ if (rc != 0) {
+ CWARN("Failed to bind on CPT %d, please verify whether "
+ "all CPUs are healthy and reload modules if necessary, "
+ "otherwise your system might under risk of low "
+ "performance\n", sched->ibs_cpt);
+ }
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+ if (busy_loops++ >= IBLND_RESCHED) {
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ cond_resched();
+ busy_loops = 0;
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ did_something = 0;
+
+ if (!list_empty(&sched->ibs_conns)) {
+ conn = list_entry(sched->ibs_conns.next,
+ kib_conn_t, ibc_sched_list);
+ /* take over kib_sched_conns' ref on conn... */
+ LASSERT(conn->ibc_scheduled);
+ list_del(&conn->ibc_sched_list);
+ conn->ibc_ready = 0;
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+ if (rc == 0) {
+ rc = ib_req_notify_cq(conn->ibc_cq,
+ IB_CQ_NEXT_COMP);
+ if (rc < 0) {
+ CWARN("%s: ib_req_notify_cq failed: %d, "
+ "closing connection\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kiblnd_close_conn(conn, -EIO);
+ kiblnd_conn_decref(conn);
+ spin_lock_irqsave(&sched->ibs_lock,
+ flags);
+ continue;
+ }
+
+ rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+ }
+
+ if (rc < 0) {
+ CWARN("%s: ib_poll_cq failed: %d, "
+ "closing connection\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc);
+ kiblnd_close_conn(conn, -EIO);
+ kiblnd_conn_decref(conn);
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ continue;
+ }
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ if (rc != 0 || conn->ibc_ready) {
+ /* There may be another completion waiting; get
+ * another scheduler to check while I handle
+ * this one... */
+ /* +1 ref for sched_conns */
+ kiblnd_conn_addref(conn);
+ list_add_tail(&conn->ibc_sched_list,
+ &sched->ibs_conns);
+ if (waitqueue_active(&sched->ibs_waitq))
+ wake_up(&sched->ibs_waitq);
+ } else {
+ conn->ibc_scheduled = 0;
+ }
+
+ if (rc != 0) {
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+ kiblnd_complete(&wc);
+
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ kiblnd_conn_decref(conn); /* ...drop my ref from above */
+ did_something = 1;
+ }
+
+ if (did_something)
+ continue;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ waitq_wait(&wait, TASK_INTERRUPTIBLE);
+ busy_loops = 0;
+
+ remove_wait_queue(&sched->ibs_waitq, &wait);
+ set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
+
+int
+kiblnd_failover_thread(void *arg)
+{
+ rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_dev_t *dev;
+ wait_queue_t wait;
+ unsigned long flags;
+ int rc;
+
+ LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+ cfs_block_allsigs ();
+
+ init_waitqueue_entry_current(&wait);
+ write_lock_irqsave(glock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+ int do_failover = 0;
+ int long_sleep;
+
+ list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+ ibd_fail_list) {
+ if (cfs_time_before(cfs_time_current(),
+ dev->ibd_next_failover))
+ continue;
+ do_failover = 1;
+ break;
+ }
+
+ if (do_failover) {
+ list_del_init(&dev->ibd_fail_list);
+ dev->ibd_failover = 1;
+ write_unlock_irqrestore(glock, flags);
+
+ rc = kiblnd_dev_failover(dev);
+
+ write_lock_irqsave(glock, flags);
+
+ LASSERT (dev->ibd_failover);
+ dev->ibd_failover = 0;
+ if (rc >= 0) { /* Device is OK or failover succeed */
+ dev->ibd_next_failover = cfs_time_shift(3);
+ continue;
+ }
+
+ /* failed to failover, retry later */
+ dev->ibd_next_failover =
+ cfs_time_shift(min(dev->ibd_failed_failover, 10));
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+
+ continue;
+ }
+
+ /* long sleep if no more pending failover */
+ long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+ write_unlock_irqrestore(glock, flags);
+
+ rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+ cfs_time_seconds(1));
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+ write_lock_irqsave(glock, flags);
+
+ if (!long_sleep || rc != 0)
+ continue;
+
+ /* have a long sleep, routine check all active devices,
+ * we need checking like this because if there is not active
+ * connection on the dev and no SEND from local, we may listen
+ * on wrong HCA for ever while there is a bonding failover */
+ list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (kiblnd_dev_can_failover(dev)) {
+ list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(glock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644
index 000000000000..e21028b72302
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+ "service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+ "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+ "timeout (seconds)");
+
+/* Number of threads in each scheduler pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's set to zero. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+ "number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+ "# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+ "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+ "# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+ "when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+ "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+ "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+ "IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+ "Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+ "RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+ "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+ "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+ "send work-queue sizing");
+
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+ "map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+ "size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+ "# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+ "non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+ "size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+ "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+ "require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+ "use privileged port when initiating connection");
+
+kib_tunables_t kiblnd_tunables = {
+ .kib_dev_failover = &dev_failover,
+ .kib_service = &service,
+ .kib_cksum = &cksum,
+ .kib_timeout = &timeout,
+ .kib_keepalive = &keepalive,
+ .kib_ntx = &ntx,
+ .kib_credits = &credits,
+ .kib_peertxcredits = &peer_credits,
+ .kib_peercredits_hiw = &peer_credits_hiw,
+ .kib_peerrtrcredits = &peer_buffer_credits,
+ .kib_peertimeout = &peer_timeout,
+ .kib_default_ipif = &ipif_name,
+ .kib_retry_count = &retry_count,
+ .kib_rnr_retry_count = &rnr_retry_count,
+ .kib_concurrent_sends = &concurrent_sends,
+ .kib_ib_mtu = &ib_mtu,
+ .kib_map_on_demand = &map_on_demand,
+ .kib_fmr_pool_size = &fmr_pool_size,
+ .kib_fmr_flush_trigger = &fmr_flush_trigger,
+ .kib_fmr_cache = &fmr_cache,
+ .kib_pmr_pool_size = &pmr_pool_size,
+ .kib_require_priv_port = &require_privileged_port,
+ .kib_use_priv_port = &use_privileged_port,
+ .kib_nscheds = &nscheds
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];
+
+
+enum {
+ O2IBLND_SERVICE = 1,
+ O2IBLND_CKSUM,
+ O2IBLND_TIMEOUT,
+ O2IBLND_NTX,
+ O2IBLND_CREDITS,
+ O2IBLND_PEER_TXCREDITS,
+ O2IBLND_PEER_CREDITS_HIW,
+ O2IBLND_PEER_RTRCREDITS,
+ O2IBLND_PEER_TIMEOUT,
+ O2IBLND_IPIF_BASENAME,
+ O2IBLND_RETRY_COUNT,
+ O2IBLND_RNR_RETRY_COUNT,
+ O2IBLND_KEEPALIVE,
+ O2IBLND_CONCURRENT_SENDS,
+ O2IBLND_IB_MTU,
+ O2IBLND_MAP_ON_DEMAND,
+ O2IBLND_FMR_POOL_SIZE,
+ O2IBLND_FMR_FLUSH_TRIGGER,
+ O2IBLND_FMR_CACHE,
+ O2IBLND_PMR_POOL_SIZE,
+ O2IBLND_DEV_FAILOVER
+};
+
+static ctl_table_t kiblnd_ctl_table[] = {
+ {
+ .ctl_name = O2IBLND_SERVICE,
+ .procname = "service",
+ .data = &service,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_CKSUM,
+ .procname = "cksum",
+ .data = &cksum,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_TIMEOUT,
+ .procname = "timeout",
+ .data = &timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_NTX,
+ .procname = "ntx",
+ .data = &ntx,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_CREDITS,
+ .procname = "credits",
+ .data = &credits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_PEER_TXCREDITS,
+ .procname = "peer_credits",
+ .data = &peer_credits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_PEER_CREDITS_HIW,
+ .procname = "peer_credits_hiw",
+ .data = &peer_credits_hiw,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_PEER_RTRCREDITS,
+ .procname = "peer_buffer_credits",
+ .data = &peer_buffer_credits,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_PEER_TIMEOUT,
+ .procname = "peer_timeout",
+ .data = &peer_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_IPIF_BASENAME,
+ .procname = "ipif_name",
+ .data = ipif_basename_space,
+ .maxlen = sizeof(ipif_basename_space),
+ .mode = 0444,
+ .proc_handler = &proc_dostring
+ },
+ {
+ .ctl_name = O2IBLND_RETRY_COUNT,
+ .procname = "retry_count",
+ .data = &retry_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_RNR_RETRY_COUNT,
+ .procname = "rnr_retry_count",
+ .data = &rnr_retry_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_KEEPALIVE,
+ .procname = "keepalive",
+ .data = &keepalive,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_CONCURRENT_SENDS,
+ .procname = "concurrent_sends",
+ .data = &concurrent_sends,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_IB_MTU,
+ .procname = "ib_mtu",
+ .data = &ib_mtu,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_MAP_ON_DEMAND,
+ .procname = "map_on_demand",
+ .data = &map_on_demand,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+
+ {
+ .ctl_name = O2IBLND_FMR_POOL_SIZE,
+ .procname = "fmr_pool_size",
+ .data = &fmr_pool_size,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+ .procname = "fmr_flush_trigger",
+ .data = &fmr_flush_trigger,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_FMR_CACHE,
+ .procname = "fmr_cache",
+ .data = &fmr_cache,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_PMR_POOL_SIZE,
+ .procname = "pmr_pool_size",
+ .data = &pmr_pool_size,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ .ctl_name = O2IBLND_DEV_FAILOVER,
+ .procname = "dev_failover",
+ .data = &dev_failover,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ },
+ {0}
+};
+
+static ctl_table_t kiblnd_top_ctl_table[] = {
+ {
+ .ctl_name = CTL_O2IBLND,
+ .procname = "o2iblnd",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = kiblnd_ctl_table
+ },
+ {0}
+};
+
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
+ strncpy(space, str, size);
+ space[size-1] = 0;
+}
+
+void
+kiblnd_sysctl_init (void)
+{
+ kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+ sizeof(ipif_basename_space));
+
+ kiblnd_tunables.kib_sysctl =
+ cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+ if (kiblnd_tunables.kib_sysctl == NULL)
+ CWARN("Can't setup /proc tunables\n");
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+ if (kiblnd_tunables.kib_sysctl != NULL)
+ unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+int
+kiblnd_tunables_init (void)
+{
+ if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+ CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+ *kiblnd_tunables.kib_ib_mtu);
+ return -EINVAL;
+ }
+
+ if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
+ *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+
+ if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
+ *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+
+ if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
+ *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+
+ if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
+ *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+
+ if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
+ *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+
+ if (*kiblnd_tunables.kib_map_on_demand < 0 ||
+ *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
+ *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+
+ if (*kiblnd_tunables.kib_map_on_demand == 1)
+ *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
+
+ if (*kiblnd_tunables.kib_concurrent_sends == 0) {
+ if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+ *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
+ *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
+ else
+ *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+ }
+
+ if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
+ *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
+ *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+
+ if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+ CWARN("Concurrent sends %d is lower than message queue size: %d, "
+ "performance may drop slightly.\n",
+ *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+ }
+
+ kiblnd_sysctl_init();
+ return 0;
+}
+
+void
+kiblnd_tunables_fini (void)
+{
+ kiblnd_sysctl_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644
index 000000000000..6494b2bada05
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644
index 000000000000..c826bf9d49ac
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+lnd_t the_ksocklnd;
+ksock_nal_data_t ksocknal_data;
+
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ ksock_interface_t *iface;
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ LASSERT(i < LNET_MAX_INTERFACES);
+ iface = &net->ksnn_interfaces[i];
+
+ if (iface->ksni_ipaddr == ip)
+ return (iface);
+ }
+
+ return (NULL);
+}
+
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+ ksock_route_t *route;
+
+ LIBCFS_ALLOC (route, sizeof (*route));
+ if (route == NULL)
+ return (NULL);
+
+ atomic_set (&route->ksnr_refcount, 1);
+ route->ksnr_peer = NULL;
+ route->ksnr_retry_interval = 0; /* OK to connect at any time */
+ route->ksnr_ipaddr = ipaddr;
+ route->ksnr_port = port;
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+ route->ksnr_connected = 0;
+ route->ksnr_deleted = 0;
+ route->ksnr_conn_count = 0;
+ route->ksnr_share_count = 0;
+
+ return (route);
+}
+
+void
+ksocknal_destroy_route (ksock_route_t *route)
+{
+ LASSERT (atomic_read(&route->ksnr_refcount) == 0);
+
+ if (route->ksnr_peer != NULL)
+ ksocknal_peer_decref(route->ksnr_peer);
+
+ LIBCFS_FREE (route, sizeof (*route));
+}
+
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_net_t *net = ni->ni_data;
+ ksock_peer_t *peer;
+
+ LASSERT (id.nid != LNET_NID_ANY);
+ LASSERT (id.pid != LNET_PID_ANY);
+ LASSERT (!in_interrupt());
+
+ LIBCFS_ALLOC (peer, sizeof (*peer));
+ if (peer == NULL)
+ return -ENOMEM;
+
+ memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */
+
+ peer->ksnp_ni = ni;
+ peer->ksnp_id = id;
+ atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */
+ peer->ksnp_closing = 0;
+ peer->ksnp_accepting = 0;
+ peer->ksnp_proto = NULL;
+ peer->ksnp_last_alive = 0;
+ peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+ INIT_LIST_HEAD (&peer->ksnp_conns);
+ INIT_LIST_HEAD (&peer->ksnp_routes);
+ INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+ INIT_LIST_HEAD (&peer->ksnp_zc_req_list);
+ spin_lock_init(&peer->ksnp_lock);
+
+ spin_lock_bh(&net->ksnn_lock);
+
+ if (net->ksnn_shutdown) {
+ spin_unlock_bh(&net->ksnn_lock);
+
+ LIBCFS_FREE(peer, sizeof(*peer));
+ CERROR("Can't create peer: network shutdown\n");
+ return -ESHUTDOWN;
+ }
+
+ net->ksnn_npeers++;
+
+ spin_unlock_bh(&net->ksnn_lock);
+
+ *peerp = peer;
+ return 0;
+}
+
+void
+ksocknal_destroy_peer (ksock_peer_t *peer)
+{
+ ksock_net_t *net = peer->ksnp_ni->ni_data;
+
+ CDEBUG (D_NET, "peer %s %p deleted\n",
+ libcfs_id2str(peer->ksnp_id), peer);
+
+ LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+ LASSERT (peer->ksnp_accepting == 0);
+ LASSERT (list_empty (&peer->ksnp_conns));
+ LASSERT (list_empty (&peer->ksnp_routes));
+ LASSERT (list_empty (&peer->ksnp_tx_queue));
+ LASSERT (list_empty (&peer->ksnp_zc_req_list));
+
+ LIBCFS_FREE (peer, sizeof (*peer));
+
+ /* NB a peer's connections and routes keep a reference on their peer
+ * until they are destroyed, so we can be assured that _all_ state to
+ * do with this peer has been cleaned up when its refcount drops to
+ * zero. */
+ spin_lock_bh(&net->ksnn_lock);
+ net->ksnn_npeers--;
+ spin_unlock_bh(&net->ksnn_lock);
+}
+
+ksock_peer_t *
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
+{
+ struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
+ struct list_head *tmp;
+ ksock_peer_t *peer;
+
+ list_for_each (tmp, peer_list) {
+
+ peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+ LASSERT (!peer->ksnp_closing);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (peer->ksnp_id.nid != id.nid ||
+ peer->ksnp_id.pid != id.pid)
+ continue;
+
+ CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+ peer, libcfs_id2str(id),
+ atomic_read(&peer->ksnp_refcount));
+ return (peer);
+ }
+ return (NULL);
+}
+
+ksock_peer_t *
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) /* +1 ref for caller? */
+ ksocknal_peer_addref(peer);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ return (peer);
+}
+
+void
+ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+{
+ int i;
+ __u32 ip;
+ ksock_interface_t *iface;
+
+ for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+ LASSERT (i < LNET_MAX_INTERFACES);
+ ip = peer->ksnp_passive_ips[i];
+
+ iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+ /* All IPs in peer->ksnp_passive_ips[] come from the
+ * interface list, therefore the call must succeed. */
+ LASSERT (iface != NULL);
+
+ CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+ peer, iface, iface->ksni_nroutes);
+ iface->ksni_npeers--;
+ }
+
+ LASSERT (list_empty(&peer->ksnp_conns));
+ LASSERT (list_empty(&peer->ksnp_routes));
+ LASSERT (!peer->ksnp_closing);
+ peer->ksnp_closing = 1;
+ list_del (&peer->ksnp_list);
+ /* lose peerlist's ref */
+ ksocknal_peer_decref(peer);
+}
+
+int
+ksocknal_get_peer_info (lnet_ni_t *ni, int index,
+ lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+ int *port, int *conn_count, int *share_count)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ ksock_route_t *route;
+ struct list_head *rtmp;
+ int i;
+ int j;
+ int rc = -ENOENT;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+ list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (peer->ksnp_n_passive_ips == 0 &&
+ list_empty(&peer->ksnp_routes)) {
+ if (index-- > 0)
+ continue;
+
+ *id = peer->ksnp_id;
+ *myip = 0;
+ *peer_ip = 0;
+ *port = 0;
+ *conn_count = 0;
+ *share_count = 0;
+ rc = 0;
+ goto out;
+ }
+
+ for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+ if (index-- > 0)
+ continue;
+
+ *id = peer->ksnp_id;
+ *myip = peer->ksnp_passive_ips[j];
+ *peer_ip = 0;
+ *port = 0;
+ *conn_count = 0;
+ *share_count = 0;
+ rc = 0;
+ goto out;
+ }
+
+ list_for_each (rtmp, &peer->ksnp_routes) {
+ if (index-- > 0)
+ continue;
+
+ route = list_entry(rtmp, ksock_route_t,
+ ksnr_list);
+
+ *id = peer->ksnp_id;
+ *myip = route->ksnr_myipaddr;
+ *peer_ip = route->ksnr_ipaddr;
+ *port = route->ksnr_port;
+ *conn_count = route->ksnr_conn_count;
+ *share_count = route->ksnr_share_count;
+ rc = 0;
+ goto out;
+ }
+ }
+ }
+ out:
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return (rc);
+}
+
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+ ksock_peer_t *peer = route->ksnr_peer;
+ int type = conn->ksnc_type;
+ ksock_interface_t *iface;
+
+ conn->ksnc_route = route;
+ ksocknal_route_addref(route);
+
+ if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+ if (route->ksnr_myipaddr == 0) {
+ /* route wasn't bound locally yet (the initial route) */
+ CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(route->ksnr_ipaddr),
+ HIPQUAD(conn->ksnc_myipaddr));
+ } else {
+ CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+ "%u.%u.%u.%u to %u.%u.%u.%u\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(route->ksnr_ipaddr),
+ HIPQUAD(route->ksnr_myipaddr),
+ HIPQUAD(conn->ksnc_myipaddr));
+
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes--;
+ }
+ route->ksnr_myipaddr = conn->ksnc_myipaddr;
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes++;
+ }
+
+ route->ksnr_connected |= (1<<type);
+ route->ksnr_conn_count++;
+
+ /* Successful connection => further attempts can
+ * proceed immediately */
+ route->ksnr_retry_interval = 0;
+}
+
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ ksock_route_t *route2;
+
+ LASSERT (!peer->ksnp_closing);
+ LASSERT (route->ksnr_peer == NULL);
+ LASSERT (!route->ksnr_scheduled);
+ LASSERT (!route->ksnr_connecting);
+ LASSERT (route->ksnr_connected == 0);
+
+ /* LASSERT(unique) */
+ list_for_each(tmp, &peer->ksnp_routes) {
+ route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+ CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(route->ksnr_ipaddr));
+ LBUG();
+ }
+ }
+
+ route->ksnr_peer = peer;
+ ksocknal_peer_addref(peer);
+ /* peer's routelist takes over my ref on 'route' */
+ list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+ continue;
+
+ ksocknal_associate_route_conn_locked(route, conn);
+ /* keep going (typed routes) */
+ }
+}
+
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+ ksock_peer_t *peer = route->ksnr_peer;
+ ksock_interface_t *iface;
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+
+ LASSERT (!route->ksnr_deleted);
+
+ /* Close associated conns */
+ list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+ conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_route != route)
+ continue;
+
+ ksocknal_close_conn_locked (conn, 0);
+ }
+
+ if (route->ksnr_myipaddr != 0) {
+ iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+ route->ksnr_myipaddr);
+ if (iface != NULL)
+ iface->ksni_nroutes--;
+ }
+
+ route->ksnr_deleted = 1;
+ list_del (&route->ksnr_list);
+ ksocknal_route_decref(route); /* drop peer's ref */
+
+ if (list_empty (&peer->ksnp_routes) &&
+ list_empty (&peer->ksnp_conns)) {
+ /* I've just removed the last route to a peer with no active
+ * connections */
+ ksocknal_unlink_peer_locked (peer);
+ }
+}
+
+int
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+ struct list_head *tmp;
+ ksock_peer_t *peer;
+ ksock_peer_t *peer2;
+ ksock_route_t *route;
+ ksock_route_t *route2;
+ int rc;
+
+ if (id.nid == LNET_NID_ANY ||
+ id.pid == LNET_PID_ANY)
+ return (-EINVAL);
+
+ /* Have a brand new peer ready... */
+ rc = ksocknal_create_peer(&peer, ni, id);
+ if (rc != 0)
+ return rc;
+
+ route = ksocknal_create_route (ipaddr, port);
+ if (route == NULL) {
+ ksocknal_peer_decref(peer);
+ return (-ENOMEM);
+ }
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ /* always called with a ref on ni, so shutdown can't have started */
+ LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+ peer2 = ksocknal_find_peer_locked (ni, id);
+ if (peer2 != NULL) {
+ ksocknal_peer_decref(peer);
+ peer = peer2;
+ } else {
+ /* peer table takes my ref on peer */
+ list_add_tail (&peer->ksnp_list,
+ ksocknal_nid2peerlist (id.nid));
+ }
+
+ route2 = NULL;
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route2->ksnr_ipaddr == ipaddr)
+ break;
+
+ route2 = NULL;
+ }
+ if (route2 == NULL) {
+ ksocknal_add_route_locked(peer, route);
+ route->ksnr_share_count++;
+ } else {
+ ksocknal_route_decref(route);
+ route2->ksnr_share_count++;
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return (0);
+}
+
+void
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+{
+ ksock_conn_t *conn;
+ ksock_route_t *route;
+ struct list_head *tmp;
+ struct list_head *nxt;
+ int nshared;
+
+ LASSERT (!peer->ksnp_closing);
+
+ /* Extra ref prevents peer disappearing until I'm done with it */
+ ksocknal_peer_addref(peer);
+
+ list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ /* no match */
+ if (!(ip == 0 || route->ksnr_ipaddr == ip))
+ continue;
+
+ route->ksnr_share_count = 0;
+ /* This deletes associated conns too */
+ ksocknal_del_route_locked (route);
+ }
+
+ nshared = 0;
+ list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+ nshared += route->ksnr_share_count;
+ }
+
+ if (nshared == 0) {
+ /* remove everything else if there are no explicit entries
+ * left */
+
+ list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ /* we should only be removing auto-entries */
+ LASSERT(route->ksnr_share_count == 0);
+ ksocknal_del_route_locked (route);
+ }
+
+ list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ ksocknal_close_conn_locked(conn, 0);
+ }
+ }
+
+ ksocknal_peer_decref(peer);
+ /* NB peer unlinks itself when last conn/route is removed */
+}
+
+int
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+ LIST_HEAD (zombies);
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ ksock_peer_t *peer;
+ int lo;
+ int hi;
+ int i;
+ int rc = -ENOENT;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (id.nid != LNET_NID_ANY)
+ lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+ else {
+ lo = 0;
+ hi = ksocknal_data.ksnd_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt,
+ &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+ (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+ continue;
+
+ ksocknal_peer_addref(peer); /* a ref for me... */
+
+ ksocknal_del_peer_locked (peer, ip);
+
+ if (peer->ksnp_closing &&
+ !list_empty(&peer->ksnp_tx_queue)) {
+ LASSERT (list_empty(&peer->ksnp_conns));
+ LASSERT (list_empty(&peer->ksnp_routes));
+
+ list_splice_init(&peer->ksnp_tx_queue,
+ &zombies);
+ }
+
+ ksocknal_peer_decref(peer); /* ...till here */
+
+ rc = 0; /* matched! */
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(ni, &zombies, 1);
+
+ return (rc);
+}
+
+ksock_conn_t *
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ int i;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+ LASSERT (!peer->ksnp_closing);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ list_for_each (ctmp, &peer->ksnp_conns) {
+ if (index-- > 0)
+ continue;
+
+ conn = list_entry (ctmp, ksock_conn_t,
+ ksnc_list);
+ ksocknal_conn_addref(conn);
+ read_unlock(&ksocknal_data. \
+ ksnd_global_lock);
+ return (conn);
+ }
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return (NULL);
+}
+
+ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+ struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt];
+ ksock_sched_t *sched;
+ int i;
+
+ LASSERT(info->ksi_nthreads > 0);
+
+ sched = &info->ksi_scheds[0];
+ /*
+ * NB: it's safe so far, but info->ksi_nthreads could be changed
+ * at runtime when we have dynamic LNet configuration, then we
+ * need to take care of this.
+ */
+ for (i = 1; i < info->ksi_nthreads; i++) {
+ if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+ sched = &info->ksi_scheds[i];
+ }
+
+ return sched;
+}
+
+int
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ int nip;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ nip = net->ksnn_ninterfaces;
+ LASSERT (nip <= LNET_MAX_INTERFACES);
+
+ /* Only offer interfaces for additional connections if I have
+ * more than one. */
+ if (nip < 2) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return 0;
+ }
+
+ for (i = 0; i < nip; i++) {
+ ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+ LASSERT (ipaddrs[i] != 0);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return (nip);
+}
+
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+ int best_netmatch = 0;
+ int best_xor = 0;
+ int best = -1;
+ int this_xor;
+ int this_netmatch;
+ int i;
+
+ for (i = 0; i < nips; i++) {
+ if (ips[i] == 0)
+ continue;
+
+ this_xor = (ips[i] ^ iface->ksni_ipaddr);
+ this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best < 0 ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_xor > this_xor)))
+ continue;
+
+ best = i;
+ best_netmatch = this_netmatch;
+ best_xor = this_xor;
+ }
+
+ LASSERT (best >= 0);
+ return (best);
+}
+
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ ksock_net_t *net = peer->ksnp_ni->ni_data;
+ ksock_interface_t *iface;
+ ksock_interface_t *best_iface;
+ int n_ips;
+ int i;
+ int j;
+ int k;
+ __u32 ip;
+ __u32 xor;
+ int this_netmatch;
+ int best_netmatch;
+ int best_npeers;
+
+ /* CAVEAT EMPTOR: We do all our interface matching with an
+ * exclusive hold of global lock at IRQ priority. We're only
+ * expecting to be dealing with small numbers of interfaces, so the
+ * O(n**3)-ness shouldn't matter */
+
+ /* Also note that I'm not going to return more than n_peerips
+ * interfaces, even if I have more myself */
+
+ write_lock_bh(global_lock);
+
+ LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+ LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+ /* Only match interfaces for additional connections
+ * if I have > 1 interface */
+ n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+ MIN(n_peerips, net->ksnn_ninterfaces);
+
+ for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+ /* ^ yes really... */
+
+ /* If we have any new interfaces, first tick off all the
+ * peer IPs that match old interfaces, then choose new
+ * interfaces to match the remaining peer IPS.
+ * We don't forget interfaces we've stopped using; we might
+ * start using them again... */
+
+ if (i < peer->ksnp_n_passive_ips) {
+ /* Old interface. */
+ ip = peer->ksnp_passive_ips[i];
+ best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+ /* peer passive ips are kept up to date */
+ LASSERT(best_iface != NULL);
+ } else {
+ /* choose a new interface */
+ LASSERT (i == peer->ksnp_n_passive_ips);
+
+ best_iface = NULL;
+ best_netmatch = 0;
+ best_npeers = 0;
+
+ for (j = 0; j < net->ksnn_ninterfaces; j++) {
+ iface = &net->ksnn_interfaces[j];
+ ip = iface->ksni_ipaddr;
+
+ for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+ if (peer->ksnp_passive_ips[k] == ip)
+ break;
+
+ if (k < peer->ksnp_n_passive_ips) /* using it already */
+ continue;
+
+ k = ksocknal_match_peerip(iface, peerips, n_peerips);
+ xor = (ip ^ peerips[k]);
+ this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best_iface == NULL ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_npeers > iface->ksni_npeers)))
+ continue;
+
+ best_iface = iface;
+ best_netmatch = this_netmatch;
+ best_npeers = iface->ksni_npeers;
+ }
+
+ best_iface->ksni_npeers++;
+ ip = best_iface->ksni_ipaddr;
+ peer->ksnp_passive_ips[i] = ip;
+ peer->ksnp_n_passive_ips = i+1;
+ }
+
+ LASSERT (best_iface != NULL);
+
+ /* mark the best matching peer IP used */
+ j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+ peerips[j] = 0;
+ }
+
+ /* Overwrite input peer IP addresses */
+ memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+ write_unlock_bh(global_lock);
+
+ return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+ __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+ ksock_route_t *newroute = NULL;
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ lnet_ni_t *ni = peer->ksnp_ni;
+ ksock_net_t *net = ni->ni_data;
+ struct list_head *rtmp;
+ ksock_route_t *route;
+ ksock_interface_t *iface;
+ ksock_interface_t *best_iface;
+ int best_netmatch;
+ int this_netmatch;
+ int best_nroutes;
+ int i;
+ int j;
+
+ /* CAVEAT EMPTOR: We do all our interface matching with an
+ * exclusive hold of global lock at IRQ priority. We're only
+ * expecting to be dealing with small numbers of interfaces, so the
+ * O(n**3)-ness here shouldn't matter */
+
+ write_lock_bh(global_lock);
+
+ if (net->ksnn_ninterfaces < 2) {
+ /* Only create additional connections
+ * if I have > 1 interface */
+ write_unlock_bh(global_lock);
+ return;
+ }
+
+ LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+ for (i = 0; i < npeer_ipaddrs; i++) {
+ if (newroute != NULL) {
+ newroute->ksnr_ipaddr = peer_ipaddrs[i];
+ } else {
+ write_unlock_bh(global_lock);
+
+ newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+ if (newroute == NULL)
+ return;
+
+ write_lock_bh(global_lock);
+ }
+
+ if (peer->ksnp_closing) {
+ /* peer got closed under me */
+ break;
+ }
+
+ /* Already got a route? */
+ route = NULL;
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+ break;
+
+ route = NULL;
+ }
+ if (route != NULL)
+ continue;
+
+ best_iface = NULL;
+ best_nroutes = 0;
+ best_netmatch = 0;
+
+ LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+ /* Select interface to connect from */
+ for (j = 0; j < net->ksnn_ninterfaces; j++) {
+ iface = &net->ksnn_interfaces[j];
+
+ /* Using this interface already? */
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp, ksock_route_t,
+ ksnr_list);
+
+ if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+ break;
+
+ route = NULL;
+ }
+ if (route != NULL)
+ continue;
+
+ this_netmatch = (((iface->ksni_ipaddr ^
+ newroute->ksnr_ipaddr) &
+ iface->ksni_netmask) == 0) ? 1 : 0;
+
+ if (!(best_iface == NULL ||
+ best_netmatch < this_netmatch ||
+ (best_netmatch == this_netmatch &&
+ best_nroutes > iface->ksni_nroutes)))
+ continue;
+
+ best_iface = iface;
+ best_netmatch = this_netmatch;
+ best_nroutes = iface->ksni_nroutes;
+ }
+
+ if (best_iface == NULL)
+ continue;
+
+ newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+ best_iface->ksni_nroutes++;
+
+ ksocknal_add_route_locked(peer, newroute);
+ newroute = NULL;
+ }
+
+ write_unlock_bh(global_lock);
+ if (newroute != NULL)
+ ksocknal_route_decref(newroute);
+}
+
+int
+ksocknal_accept (lnet_ni_t *ni, socket_t *sock)
+{
+ ksock_connreq_t *cr;
+ int rc;
+ __u32 peer_ip;
+ int peer_port;
+
+ rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+ LASSERT (rc == 0); /* we succeeded before */
+
+ LIBCFS_ALLOC(cr, sizeof(*cr));
+ if (cr == NULL) {
+ LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+ "%u.%u.%u.%u: memory exhausted\n",
+ HIPQUAD(peer_ip));
+ return -ENOMEM;
+ }
+
+ lnet_ni_addref(ni);
+ cr->ksncr_ni = ni;
+ cr->ksncr_sock = sock;
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+ return 0;
+}
+
+int
+ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+{
+ ksock_route_t *route;
+
+ list_for_each_entry (route, &peer->ksnp_routes, ksnr_list) {
+
+ if (route->ksnr_ipaddr == ipaddr)
+ return route->ksnr_connecting;
+ }
+ return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+ socket_t *sock, int type)
+{
+ rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock;
+ LIST_HEAD (zombies);
+ lnet_process_id_t peerid;
+ struct list_head *tmp;
+ __u64 incarnation;
+ ksock_conn_t *conn;
+ ksock_conn_t *conn2;
+ ksock_peer_t *peer = NULL;
+ ksock_peer_t *peer2;
+ ksock_sched_t *sched;
+ ksock_hello_msg_t *hello;
+ int cpt;
+ ksock_tx_t *tx;
+ ksock_tx_t *txtmp;
+ int rc;
+ int active;
+ char *warn = NULL;
+
+ active = (route != NULL);
+
+ LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+ LIBCFS_ALLOC(conn, sizeof(*conn));
+ if (conn == NULL) {
+ rc = -ENOMEM;
+ goto failed_0;
+ }
+
+ memset (conn, 0, sizeof (*conn));
+
+ conn->ksnc_peer = NULL;
+ conn->ksnc_route = NULL;
+ conn->ksnc_sock = sock;
+ /* 2 ref, 1 for conn, another extra ref prevents socket
+ * being closed before establishment of connection */
+ atomic_set (&conn->ksnc_sock_refcount, 2);
+ conn->ksnc_type = type;
+ ksocknal_lib_save_callback(sock, conn);
+ atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+ conn->ksnc_rx_ready = 0;
+ conn->ksnc_rx_scheduled = 0;
+
+ INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+ conn->ksnc_tx_ready = 0;
+ conn->ksnc_tx_scheduled = 0;
+ conn->ksnc_tx_carrier = NULL;
+ atomic_set (&conn->ksnc_tx_nob, 0);
+
+ LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+ if (hello == NULL) {
+ rc = -ENOMEM;
+ goto failed_1;
+ }
+
+ /* stash conn's local and remote addrs */
+ rc = ksocknal_lib_get_conn_addrs (conn);
+ if (rc != 0)
+ goto failed_1;
+
+ /* Find out/confirm peer's NID and connection type and get the
+ * vector of interfaces she's willing to let me connect to.
+ * Passive connections use the listener timeout since the peer sends
+ * eagerly */
+
+ if (active) {
+ peer = route->ksnr_peer;
+ LASSERT(ni == peer->ksnp_ni);
+
+ /* Active connection sends HELLO eagerly */
+ hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+ peerid = peer->ksnp_id;
+
+ write_lock_bh(global_lock);
+ conn->ksnc_proto = peer->ksnp_proto;
+ write_unlock_bh(global_lock);
+
+ if (conn->ksnc_proto == NULL) {
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ }
+
+ rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+ if (rc != 0)
+ goto failed_1;
+ } else {
+ peerid.nid = LNET_NID_ANY;
+ peerid.pid = LNET_PID_ANY;
+
+ /* Passive, get protocol from peer */
+ conn->ksnc_proto = NULL;
+ }
+
+ rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+ if (rc < 0)
+ goto failed_1;
+
+ LASSERT (rc == 0 || active);
+ LASSERT (conn->ksnc_proto != NULL);
+ LASSERT (peerid.nid != LNET_NID_ANY);
+
+ cpt = lnet_cpt_of_nid(peerid.nid);
+
+ if (active) {
+ ksocknal_peer_addref(peer);
+ write_lock_bh(global_lock);
+ } else {
+ rc = ksocknal_create_peer(&peer, ni, peerid);
+ if (rc != 0)
+ goto failed_1;
+
+ write_lock_bh(global_lock);
+
+ /* called with a ref on ni, so shutdown can't have started */
+ LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+ peer2 = ksocknal_find_peer_locked(ni, peerid);
+ if (peer2 == NULL) {
+ /* NB this puts an "empty" peer in the peer
+ * table (which takes my ref) */
+ list_add_tail(&peer->ksnp_list,
+ ksocknal_nid2peerlist(peerid.nid));
+ } else {
+ ksocknal_peer_decref(peer);
+ peer = peer2;
+ }
+
+ /* +1 ref for me */
+ ksocknal_peer_addref(peer);
+ peer->ksnp_accepting++;
+
+ /* Am I already connecting to this guy? Resolve in
+ * favour of higher NID... */
+ if (peerid.nid < ni->ni_nid &&
+ ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+ rc = EALREADY;
+ warn = "connection race resolution";
+ goto failed_2;
+ }
+ }
+
+ if (peer->ksnp_closing ||
+ (active && route->ksnr_deleted)) {
+ /* peer/route got closed under me */
+ rc = -ESTALE;
+ warn = "peer/route removed";
+ goto failed_2;
+ }
+
+ if (peer->ksnp_proto == NULL) {
+ /* Never connected before.
+ * NB recv_hello may have returned EPROTO to signal my peer
+ * wants a different protocol than the one I asked for.
+ */
+ LASSERT (list_empty(&peer->ksnp_conns));
+
+ peer->ksnp_proto = conn->ksnc_proto;
+ peer->ksnp_incarnation = incarnation;
+ }
+
+ if (peer->ksnp_proto != conn->ksnc_proto ||
+ peer->ksnp_incarnation != incarnation) {
+ /* Peer rebooted or I've got the wrong protocol version */
+ ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+ peer->ksnp_proto = NULL;
+ rc = ESTALE;
+ warn = peer->ksnp_incarnation != incarnation ?
+ "peer rebooted" :
+ "wrong proto version";
+ goto failed_2;
+ }
+
+ switch (rc) {
+ default:
+ LBUG();
+ case 0:
+ break;
+ case EALREADY:
+ warn = "lost conn race";
+ goto failed_2;
+ case EPROTO:
+ warn = "retry with different protocol version";
+ goto failed_2;
+ }
+
+ /* Refuse to duplicate an existing connection, unless this is a
+ * loopback connection */
+ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+ conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+ conn2->ksnc_type != conn->ksnc_type)
+ continue;
+
+ /* Reply on a passive connection attempt so the peer
+ * realises we're connected. */
+ LASSERT (rc == 0);
+ if (!active)
+ rc = EALREADY;
+
+ warn = "duplicate";
+ goto failed_2;
+ }
+ }
+
+ /* If the connection created by this route didn't bind to the IP
+ * address the route connected to, the connection/route matching
+ * code below probably isn't going to work. */
+ if (active &&
+ route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+ CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(route->ksnr_ipaddr),
+ HIPQUAD(conn->ksnc_ipaddr));
+ }
+
+ /* Search for a route corresponding to the new connection and
+ * create an association. This allows incoming connections created
+ * by routes in my peer to match my own route entries so I don't
+ * continually create duplicate routes. */
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+ continue;
+
+ ksocknal_associate_route_conn_locked(route, conn);
+ break;
+ }
+
+ conn->ksnc_peer = peer; /* conn takes my ref on peer */
+ peer->ksnp_last_alive = cfs_time_current();
+ peer->ksnp_send_keepalive = 0;
+ peer->ksnp_error = 0;
+
+ sched = ksocknal_choose_scheduler_locked(cpt);
+ sched->kss_nconns++;
+ conn->ksnc_scheduler = sched;
+
+ conn->ksnc_tx_last_post = cfs_time_current();
+ /* Set the deadline for the outgoing HELLO to drain */
+ conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+ conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with adding to peer's conn list */
+
+ list_add (&conn->ksnc_list, &peer->ksnp_conns);
+ ksocknal_conn_addref(conn);
+
+ ksocknal_new_packet(conn, 0);
+
+ conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+ /* Take packets blocking for this connection. */
+ list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+ if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+ continue;
+
+ list_del (&tx->tx_list);
+ ksocknal_queue_tx_locked (tx, conn);
+ }
+
+ write_unlock_bh(global_lock);
+
+ /* We've now got a new connection. Any errors from here on are just
+ * like "normal" comms errors and we close the connection normally.
+ * NB (a) we still have to send the reply HELLO for passive
+ * connections,
+ * (b) normal I/O on the conn is blocked until I setup and call the
+ * socket callbacks.
+ */
+
+ CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+ " incarnation:"LPD64" sched[%d:%d]\n",
+ libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+ HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port, incarnation, cpt,
+ (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+ if (active) {
+ /* additional routes after interface exchange? */
+ ksocknal_create_routes(peer, conn->ksnc_port,
+ hello->kshm_ips, hello->kshm_nips);
+ } else {
+ hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+ hello->kshm_nips);
+ rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+ }
+
+ LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+
+ /* setup the socket AFTER I've received hello (it disables
+ * SO_LINGER). I might call back to the acceptor who may want
+ * to send a protocol version response and then close the
+ * socket; this ensures the socket only tears down after the
+ * response has been sent. */
+ if (rc == 0)
+ rc = ksocknal_lib_setup_sock(sock);
+
+ write_lock_bh(global_lock);
+
+ /* NB my callbacks block while I hold ksnd_global_lock */
+ ksocknal_lib_set_callback(sock, conn);
+
+ if (!active)
+ peer->ksnp_accepting--;
+
+ write_unlock_bh(global_lock);
+
+ if (rc != 0) {
+ write_lock_bh(global_lock);
+ if (!conn->ksnc_closing) {
+ /* could be closed by another thread */
+ ksocknal_close_conn_locked(conn, rc);
+ }
+ write_unlock_bh(global_lock);
+ } else if (ksocknal_connsock_addref(conn) == 0) {
+ /* Allow I/O to proceed. */
+ ksocknal_read_callback(conn);
+ ksocknal_write_callback(conn);
+ ksocknal_connsock_decref(conn);
+ }
+
+ ksocknal_connsock_decref(conn);
+ ksocknal_conn_decref(conn);
+ return rc;
+
+ failed_2:
+ if (!peer->ksnp_closing &&
+ list_empty (&peer->ksnp_conns) &&
+ list_empty (&peer->ksnp_routes)) {
+ list_add(&zombies, &peer->ksnp_tx_queue);
+ list_del_init(&peer->ksnp_tx_queue);
+ ksocknal_unlink_peer_locked(peer);
+ }
+
+ write_unlock_bh(global_lock);
+
+ if (warn != NULL) {
+ if (rc < 0)
+ CERROR("Not creating conn %s type %d: %s\n",
+ libcfs_id2str(peerid), conn->ksnc_type, warn);
+ else
+ CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+ libcfs_id2str(peerid), conn->ksnc_type, warn);
+ }
+
+ if (!active) {
+ if (rc > 0) {
+ /* Request retry by replying with CONN_NONE
+ * ksnc_proto has been set already */
+ conn->ksnc_type = SOCKLND_CONN_NONE;
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, peerid.nid, hello);
+ }
+
+ write_lock_bh(global_lock);
+ peer->ksnp_accepting--;
+ write_unlock_bh(global_lock);
+ }
+
+ ksocknal_txlist_done(ni, &zombies, 1);
+ ksocknal_peer_decref(peer);
+
+ failed_1:
+ if (hello != NULL)
+ LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+ kshm_ips[LNET_MAX_INTERFACES]));
+
+ LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+ libcfs_sock_release(sock);
+ return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+ /* This just does the immmediate housekeeping, and queues the
+ * connection for the reaper to terminate.
+ * Caller holds ksnd_global_lock exclusively in irq context */
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_route_t *route;
+ ksock_conn_t *conn2;
+ struct list_head *tmp;
+
+ LASSERT (peer->ksnp_error == 0);
+ LASSERT (!conn->ksnc_closing);
+ conn->ksnc_closing = 1;
+
+ /* ksnd_deathrow_conns takes over peer's ref */
+ list_del (&conn->ksnc_list);
+
+ route = conn->ksnc_route;
+ if (route != NULL) {
+ /* dissociate conn from route... */
+ LASSERT (!route->ksnr_deleted);
+ LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+ conn2 = NULL;
+ list_for_each(tmp, &peer->ksnp_conns) {
+ conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn2->ksnc_route == route &&
+ conn2->ksnc_type == conn->ksnc_type)
+ break;
+
+ conn2 = NULL;
+ }
+ if (conn2 == NULL)
+ route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+ conn->ksnc_route = NULL;
+
+#if 0 /* irrelevent with only eager routes */
+ /* make route least favourite */
+ list_del (&route->ksnr_list);
+ list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+ ksocknal_route_decref(route); /* drop conn's ref on route */
+ }
+
+ if (list_empty (&peer->ksnp_conns)) {
+ /* No more connections to this peer */
+
+ if (!list_empty(&peer->ksnp_tx_queue)) {
+ ksock_tx_t *tx;
+
+ LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+ /* throw them to the last connection...,
+ * these TXs will be send to /dev/null by scheduler */
+ list_for_each_entry(tx, &peer->ksnp_tx_queue,
+ tx_list)
+ ksocknal_tx_prep(conn, tx);
+
+ spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+ list_splice_init(&peer->ksnp_tx_queue,
+ &conn->ksnc_tx_queue);
+ spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+ }
+
+ peer->ksnp_proto = NULL; /* renegotiate protocol version */
+ peer->ksnp_error = error; /* stash last conn close reason */
+
+ if (list_empty (&peer->ksnp_routes)) {
+ /* I've just closed last conn belonging to a
+ * peer with no routes to it */
+ ksocknal_unlink_peer_locked (peer);
+ }
+ }
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ list_add_tail(&conn->ksnc_list,
+ &ksocknal_data.ksnd_deathrow_conns);
+ wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+ int notify = 0;
+ cfs_time_t last_alive = 0;
+
+ /* There has been a connection failure or comms error; but I'll only
+ * tell LNET I think the peer is dead if it's to another kernel and
+ * there are no connections or connection attempts in existance. */
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+ list_empty(&peer->ksnp_conns) &&
+ peer->ksnp_accepting == 0 &&
+ ksocknal_find_connecting_route_locked(peer) == NULL) {
+ notify = 1;
+ last_alive = peer->ksnp_last_alive;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (notify)
+ lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+ last_alive);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_tx_t *tx;
+ ksock_tx_t *tmp;
+ LIST_HEAD (zlist);
+
+ /* NB safe to finalize TXs because closing of socket will
+ * abort all buffered data */
+ LASSERT (conn->ksnc_sock == NULL);
+
+ spin_lock(&peer->ksnp_lock);
+
+ list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+ if (tx->tx_conn != conn)
+ continue;
+
+ LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ tx->tx_zc_aborted = 1; /* mark it as not-acked */
+ list_del(&tx->tx_zc_list);
+ list_add(&tx->tx_zc_list, &zlist);
+ }
+
+ spin_unlock(&peer->ksnp_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+
+ list_del(&tx->tx_zc_list);
+ ksocknal_tx_decref(tx);
+ }
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+ /* This gets called by the reaper (guaranteed thread context) to
+ * disengage the socket from its callbacks and close it.
+ * ksnc_refcount will eventually hit zero, and then the reaper will
+ * destroy it. */
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+ int failed = 0;
+
+ LASSERT(conn->ksnc_closing);
+
+ /* wake up the scheduler to "send" all remaining packets to /dev/null */
+ spin_lock_bh(&sched->kss_lock);
+
+ /* a closing conn is always ready to tx */
+ conn->ksnc_tx_ready = 1;
+
+ if (!conn->ksnc_tx_scheduled &&
+ !list_empty(&conn->ksnc_tx_queue)){
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up (&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ /* serialise with callbacks */
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+ /* OK, so this conn may not be completely disengaged from its
+ * scheduler yet, but it _has_ committed to terminate... */
+ conn->ksnc_scheduler->kss_nconns--;
+
+ if (peer->ksnp_error != 0) {
+ /* peer's last conn closed in error */
+ LASSERT (list_empty (&peer->ksnp_conns));
+ failed = 1;
+ peer->ksnp_error = 0; /* avoid multiple notifications */
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (failed)
+ ksocknal_peer_failed(peer);
+
+ /* The socket is closed on the final put; either here, or in
+ * ksocknal_{send,recv}msg(). Since we set up the linger2 option
+ * when the connection was established, this will close the socket
+ * immediately, aborting anything buffered in it. Any hung
+ * zero-copy transmits will therefore complete in finite time. */
+ ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+ /* Queue the conn for the reaper to destroy */
+
+ LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+ wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+ cfs_time_t last_rcv;
+
+ /* Final coup-de-grace of the reaper */
+ CDEBUG (D_NET, "connection %p\n", conn);
+
+ LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+ LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+ LASSERT (conn->ksnc_sock == NULL);
+ LASSERT (conn->ksnc_route == NULL);
+ LASSERT (!conn->ksnc_tx_scheduled);
+ LASSERT (!conn->ksnc_rx_scheduled);
+ LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+ /* complete current receive if any */
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_LNET_PAYLOAD:
+ last_rcv = conn->ksnc_rx_deadline -
+ cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+ CERROR("Completing partial receive from %s[%d]"
+ ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+ "last alive is %ld secs ago\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+ conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+ cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+ last_rcv)));
+ lnet_finalize (conn->ksnc_peer->ksnp_ni,
+ conn->ksnc_cookie, -EIO);
+ break;
+ case SOCKNAL_RX_LNET_HEADER:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of lnet header from %s"
+ ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+ conn->ksnc_proto->pro_version);
+ break;
+ case SOCKNAL_RX_KSM_HEADER:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of ksock message from %s"
+ ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+ conn->ksnc_proto->pro_version);
+ break;
+ case SOCKNAL_RX_SLOP:
+ if (conn->ksnc_rx_started)
+ CERROR("Incomplete receive of slops from %s"
+ ", ip %d.%d.%d.%d:%d, with error\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ break;
+ default:
+ LBUG ();
+ break;
+ }
+
+ ksocknal_peer_decref(conn->ksnc_peer);
+
+ LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+ struct list_head *cnxt;
+ int count = 0;
+
+ list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+ conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+ if (ipaddr == 0 ||
+ conn->ksnc_ipaddr == ipaddr) {
+ count++;
+ ksocknal_close_conn_locked (conn, why);
+ }
+ }
+
+ return (count);
+}
+
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ __u32 ipaddr = conn->ksnc_ipaddr;
+ int count;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return (count);
+}
+
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+ ksock_peer_t *peer;
+ struct list_head *ptmp;
+ struct list_head *pnxt;
+ int lo;
+ int hi;
+ int i;
+ int count = 0;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (id.nid != LNET_NID_ANY)
+ lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+ else {
+ lo = 0;
+ hi = ksocknal_data.ksnd_peer_hash_size - 1;
+ }
+
+ for (i = lo; i <= hi; i++) {
+ list_for_each_safe (ptmp, pnxt,
+ &ksocknal_data.ksnd_peers[i]) {
+
+ peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+ if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+ (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+ continue;
+
+ count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ /* wildcards always succeed */
+ if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+ return (0);
+
+ return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+ /* The router is telling me she's been notified of a change in
+ * gateway state.... */
+ lnet_process_id_t id = {0};
+
+ id.nid = gw_nid;
+ id.pid = LNET_PID_ANY;
+
+ CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+ alive ? "up" : "down");
+
+ if (!alive) {
+ /* If the gateway crashed, close all open connections... */
+ ksocknal_close_matching_conns (id, 0);
+ return;
+ }
+
+ /* ...otherwise do nothing. We can only establish new connections
+ * if we have autroutes, and these connect on demand. */
+}
+
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+ int connect = 1;
+ cfs_time_t last_alive = 0;
+ cfs_time_t now = cfs_time_current();
+ ksock_peer_t *peer = NULL;
+ rwlock_t *glock = &ksocknal_data.ksnd_global_lock;
+ lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+ read_lock(glock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) {
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ int bufnob;
+
+ list_for_each (tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+ bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+
+ if (bufnob < conn->ksnc_tx_bufnob) {
+ /* something got ACKed */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ peer->ksnp_last_alive = now;
+ conn->ksnc_tx_bufnob = bufnob;
+ }
+ }
+
+ last_alive = peer->ksnp_last_alive;
+ if (ksocknal_find_connectable_route_locked(peer) == NULL)
+ connect = 0;
+ }
+
+ read_unlock(glock);
+
+ if (last_alive != 0)
+ *when = last_alive;
+
+ CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+ libcfs_nid2str(nid), peer,
+ last_alive ? cfs_duration_sec(now - last_alive) : -1,
+ connect);
+
+ if (!connect)
+ return;
+
+ ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+ write_lock_bh(glock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL)
+ ksocknal_launch_all_connections_locked(peer);
+
+ write_unlock_bh(glock);
+ return;
+}
+
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+ int index;
+ int i;
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+
+ for (index = 0; ; index++) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ i = 0;
+ conn = NULL;
+
+ list_for_each (tmp, &peer->ksnp_conns) {
+ if (i++ == index) {
+ conn = list_entry (tmp, ksock_conn_t,
+ ksnc_list);
+ ksocknal_conn_addref(conn);
+ break;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (conn == NULL)
+ break;
+
+ ksocknal_lib_push_conn (conn);
+ ksocknal_conn_decref(conn);
+ }
+}
+
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+ struct list_head *tmp;
+ int index;
+ int i;
+ int j;
+ int rc = -ENOENT;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ for (j = 0; ; j++) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ index = 0;
+ peer = NULL;
+
+ list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(tmp, ksock_peer_t,
+ ksnp_list);
+
+ if (!((id.nid == LNET_NID_ANY ||
+ id.nid == peer->ksnp_id.nid) &&
+ (id.pid == LNET_PID_ANY ||
+ id.pid == peer->ksnp_id.pid))) {
+ peer = NULL;
+ continue;
+ }
+
+ if (index++ == j) {
+ ksocknal_peer_addref(peer);
+ break;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ if (peer != NULL) {
+ rc = 0;
+ ksocknal_push_peer (peer);
+ ksocknal_peer_decref(peer);
+ }
+ }
+
+ }
+
+ return (rc);
+}
+
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+ ksock_net_t *net = ni->ni_data;
+ ksock_interface_t *iface;
+ int rc;
+ int i;
+ int j;
+ struct list_head *ptmp;
+ ksock_peer_t *peer;
+ struct list_head *rtmp;
+ ksock_route_t *route;
+
+ if (ipaddress == 0 ||
+ netmask == 0)
+ return (-EINVAL);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ iface = ksocknal_ip2iface(ni, ipaddress);
+ if (iface != NULL) {
+ /* silently ignore dups */
+ rc = 0;
+ } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+ rc = -ENOSPC;
+ } else {
+ iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+ iface->ksni_ipaddr = ipaddress;
+ iface->ksni_netmask = netmask;
+ iface->ksni_nroutes = 0;
+ iface->ksni_npeers = 0;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry(ptmp, ksock_peer_t,
+ ksnp_list);
+
+ for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+ if (peer->ksnp_passive_ips[j] == ipaddress)
+ iface->ksni_npeers++;
+
+ list_for_each(rtmp, &peer->ksnp_routes) {
+ route = list_entry(rtmp,
+ ksock_route_t,
+ ksnr_list);
+
+ if (route->ksnr_myipaddr == ipaddress)
+ iface->ksni_nroutes++;
+ }
+ }
+ }
+
+ rc = 0;
+ /* NB only new connections will pay attention to the new interface! */
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return (rc);
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+ struct list_head *tmp;
+ struct list_head *nxt;
+ ksock_route_t *route;
+ ksock_conn_t *conn;
+ int i;
+ int j;
+
+ for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+ if (peer->ksnp_passive_ips[i] == ipaddr) {
+ for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+ peer->ksnp_passive_ips[j-1] =
+ peer->ksnp_passive_ips[j];
+ peer->ksnp_n_passive_ips--;
+ break;
+ }
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+ route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+ if (route->ksnr_myipaddr != ipaddr)
+ continue;
+
+ if (route->ksnr_share_count != 0) {
+ /* Manually created; keep, but unbind */
+ route->ksnr_myipaddr = 0;
+ } else {
+ ksocknal_del_route_locked(route);
+ }
+ }
+
+ list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+ if (conn->ksnc_myipaddr == ipaddr)
+ ksocknal_close_conn_locked (conn, 0);
+ }
+}
+
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+ ksock_net_t *net = ni->ni_data;
+ int rc = -ENOENT;
+ struct list_head *tmp;
+ struct list_head *nxt;
+ ksock_peer_t *peer;
+ __u32 this_ip;
+ int i;
+ int j;
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+ if (!(ipaddress == 0 ||
+ ipaddress == this_ip))
+ continue;
+
+ rc = 0;
+
+ for (j = i+1; j < net->ksnn_ninterfaces; j++)
+ net->ksnn_interfaces[j-1] =
+ net->ksnn_interfaces[j];
+
+ net->ksnn_ninterfaces--;
+
+ for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+ list_for_each_safe(tmp, nxt,
+ &ksocknal_data.ksnd_peers[j]) {
+ peer = list_entry(tmp, ksock_peer_t,
+ ksnp_list);
+
+ if (peer->ksnp_ni != ni)
+ continue;
+
+ ksocknal_peer_del_interface_locked(peer, this_ip);
+ }
+ }
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ return (rc);
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+ lnet_process_id_t id = {0};
+ struct libcfs_ioctl_data *data = arg;
+ int rc;
+
+ switch(cmd) {
+ case IOC_LIBCFS_GET_INTERFACE: {
+ ksock_net_t *net = ni->ni_data;
+ ksock_interface_t *iface;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+ rc = -ENOENT;
+ } else {
+ rc = 0;
+ iface = &net->ksnn_interfaces[data->ioc_count];
+
+ data->ioc_u32[0] = iface->ksni_ipaddr;
+ data->ioc_u32[1] = iface->ksni_netmask;
+ data->ioc_u32[2] = iface->ksni_npeers;
+ data->ioc_u32[3] = iface->ksni_nroutes;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return rc;
+ }
+
+ case IOC_LIBCFS_ADD_INTERFACE:
+ return ksocknal_add_interface(ni,
+ data->ioc_u32[0], /* IP address */
+ data->ioc_u32[1]); /* net mask */
+
+ case IOC_LIBCFS_DEL_INTERFACE:
+ return ksocknal_del_interface(ni,
+ data->ioc_u32[0]); /* IP address */
+
+ case IOC_LIBCFS_GET_PEER: {
+ __u32 myip = 0;
+ __u32 ip = 0;
+ int port = 0;
+ int conn_count = 0;
+ int share_count = 0;
+
+ rc = ksocknal_get_peer_info(ni, data->ioc_count,
+ &id, &myip, &ip, &port,
+ &conn_count, &share_count);
+ if (rc != 0)
+ return rc;
+
+ data->ioc_nid = id.nid;
+ data->ioc_count = share_count;
+ data->ioc_u32[0] = ip;
+ data->ioc_u32[1] = port;
+ data->ioc_u32[2] = myip;
+ data->ioc_u32[3] = conn_count;
+ data->ioc_u32[4] = id.pid;
+ return 0;
+ }
+
+ case IOC_LIBCFS_ADD_PEER:
+ id.nid = data->ioc_nid;
+ id.pid = LUSTRE_SRV_LNET_PID;
+ return ksocknal_add_peer (ni, id,
+ data->ioc_u32[0], /* IP */
+ data->ioc_u32[1]); /* port */
+
+ case IOC_LIBCFS_DEL_PEER:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_del_peer (ni, id,
+ data->ioc_u32[0]); /* IP */
+
+ case IOC_LIBCFS_GET_CONN: {
+ int txmem;
+ int rxmem;
+ int nagle;
+ ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+ if (conn == NULL)
+ return -ENOENT;
+
+ ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+ data->ioc_count = txmem;
+ data->ioc_nid = conn->ksnc_peer->ksnp_id.nid;
+ data->ioc_flags = nagle;
+ data->ioc_u32[0] = conn->ksnc_ipaddr;
+ data->ioc_u32[1] = conn->ksnc_port;
+ data->ioc_u32[2] = conn->ksnc_myipaddr;
+ data->ioc_u32[3] = conn->ksnc_type;
+ data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+ data->ioc_u32[5] = rxmem;
+ data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+ ksocknal_conn_decref(conn);
+ return 0;
+ }
+
+ case IOC_LIBCFS_CLOSE_CONNECTION:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_close_matching_conns (id,
+ data->ioc_u32[0]);
+
+ case IOC_LIBCFS_REGISTER_MYNID:
+ /* Ignore if this is a noop */
+ if (data->ioc_nid == ni->ni_nid)
+ return 0;
+
+ CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
+ return -EINVAL;
+
+ case IOC_LIBCFS_PUSH_CONNECTION:
+ id.nid = data->ioc_nid;
+ id.pid = LNET_PID_ANY;
+ return ksocknal_push(ni, id);
+
+ default:
+ return -EINVAL;
+ }
+ /* not reached */
+}
+
+void
+ksocknal_free_buffers (void)
+{
+ LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ struct ksock_sched_info *info;
+ int i;
+
+ cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds != NULL) {
+ LIBCFS_FREE(info->ksi_scheds,
+ info->ksi_nthreads_max *
+ sizeof(info->ksi_scheds[0]));
+ }
+ }
+ cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+ }
+
+ LIBCFS_FREE (ksocknal_data.ksnd_peers,
+ sizeof (struct list_head) *
+ ksocknal_data.ksnd_peer_hash_size);
+
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+ struct list_head zlist;
+ ksock_tx_t *tx;
+
+ list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+ list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+ list_del(&tx->tx_list);
+ LIBCFS_FREE(tx, tx->tx_desc_size);
+ }
+ } else {
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ }
+}
+
+void
+ksocknal_base_shutdown(void)
+{
+ struct ksock_sched_info *info;
+ ksock_sched_t *sched;
+ int i;
+ int j;
+
+ CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+ atomic_read (&libcfs_kmemory));
+ LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+ switch (ksocknal_data.ksnd_init) {
+ default:
+ LASSERT (0);
+
+ case SOCKNAL_INIT_ALL:
+ case SOCKNAL_INIT_DATA:
+ LASSERT (ksocknal_data.ksnd_peers != NULL);
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+ }
+
+ LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+ LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+ LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+ LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+ LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ cfs_percpt_for_each(info, i,
+ ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds == NULL)
+ continue;
+
+ for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+ sched = &info->ksi_scheds[j];
+ LASSERT(list_empty(&sched->\
+ kss_tx_conns));
+ LASSERT(list_empty(&sched->\
+ kss_rx_conns));
+ LASSERT(list_empty(&sched-> \
+ kss_zombie_noop_txs));
+ LASSERT(sched->kss_nconns == 0);
+ }
+ }
+ }
+
+ /* flag threads to terminate; wake and wait for them to die */
+ ksocknal_data.ksnd_shuttingdown = 1;
+ wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+ wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+ if (ksocknal_data.ksnd_sched_info != NULL) {
+ cfs_percpt_for_each(info, i,
+ ksocknal_data.ksnd_sched_info) {
+ if (info->ksi_scheds == NULL)
+ continue;
+
+ for (j = 0; j < info->ksi_nthreads_max; j++) {
+ sched = &info->ksi_scheds[j];
+ wake_up_all(&sched->kss_waitq);
+ }
+ }
+ }
+
+ i = 4;
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ while (ksocknal_data.ksnd_nthreads != 0) {
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d threads to terminate\n",
+ ksocknal_data.ksnd_nthreads);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ cfs_pause(cfs_time_seconds(1));
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ }
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_free_buffers();
+
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+ break;
+ }
+
+ CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+ atomic_read (&libcfs_kmemory));
+
+ module_put(THIS_MODULE);
+}
+
+__u64
+ksocknal_new_incarnation (void)
+{
+ struct timeval tv;
+
+ /* The incarnation number is the time this module loaded and it
+ * identifies this particular instance of the socknal. Hopefully
+ * we won't be able to reboot more frequently than 1MHz for the
+ * forseeable future :) */
+
+ do_gettimeofday(&tv);
+
+ return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
+
+int
+ksocknal_base_startup(void)
+{
+ struct ksock_sched_info *info;
+ int rc;
+ int i;
+
+ LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+ LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+ memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+ ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+ LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+ sizeof (struct list_head) *
+ ksocknal_data.ksnd_peer_hash_size);
+ if (ksocknal_data.ksnd_peers == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+ rwlock_init(&ksocknal_data.ksnd_global_lock);
+ INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+ spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+ init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+ init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+ INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+ /* NB memset above zeros whole of ksocknal_data */
+
+ /* flag lists/ptrs/locks initialised */
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+ try_module_get(THIS_MODULE);
+
+ ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*info));
+ if (ksocknal_data.ksnd_sched_info == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+ ksock_sched_t *sched;
+ int nthrs;
+
+ nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+ if (*ksocknal_tunables.ksnd_nscheds > 0) {
+ nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+ } else {
+ /* max to half of CPUs, assume another half should be
+ * reserved for upper layer modules */
+ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+ }
+
+ info->ksi_nthreads_max = nthrs;
+ info->ksi_cpt = i;
+
+ LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+ info->ksi_nthreads_max * sizeof(*sched));
+ if (info->ksi_scheds == NULL)
+ goto failed;
+
+ for (; nthrs > 0; nthrs--) {
+ sched = &info->ksi_scheds[nthrs - 1];
+
+ sched->kss_info = info;
+ spin_lock_init(&sched->kss_lock);
+ INIT_LIST_HEAD(&sched->kss_rx_conns);
+ INIT_LIST_HEAD(&sched->kss_tx_conns);
+ INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+ init_waitqueue_head(&sched->kss_waitq);
+ }
+ }
+
+ ksocknal_data.ksnd_connd_starting = 0;
+ ksocknal_data.ksnd_connd_failed_stamp = 0;
+ ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec();
+ /* must have at least 2 connds to remain responsive to accepts while
+ * connecting */
+ if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+ *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+ if (*ksocknal_tunables.ksnd_nconnds_max <
+ *ksocknal_tunables.ksnd_nconnds) {
+ ksocknal_tunables.ksnd_nconnds_max =
+ ksocknal_tunables.ksnd_nconnds;
+ }
+
+ for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+ char name[16];
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ ksocknal_data.ksnd_connd_starting++;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+
+ snprintf(name, sizeof(name), "socknal_cd%02d", i);
+ rc = ksocknal_thread_start(ksocknal_connd,
+ (void *)((ulong_ptr_t)i), name);
+ if (rc != 0) {
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ ksocknal_data.ksnd_connd_starting--;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+ CERROR("Can't spawn socknal connd: %d\n", rc);
+ goto failed;
+ }
+ }
+
+ rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+ if (rc != 0) {
+ CERROR ("Can't spawn socknal reaper: %d\n", rc);
+ goto failed;
+ }
+
+ /* flag everything initialised */
+ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+ return 0;
+
+ failed:
+ ksocknal_base_shutdown();
+ return -ENETDOWN;
+}
+
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+ ksock_peer_t *peer = NULL;
+ struct list_head *tmp;
+ int i;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+ list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+ peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+ if (peer->ksnp_ni == ni) break;
+
+ peer = NULL;
+ }
+ }
+
+ if (peer != NULL) {
+ ksock_route_t *route;
+ ksock_conn_t *conn;
+
+ CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+ "closing %d, accepting %d, err %d, zcookie "LPU64", "
+ "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+ atomic_read(&peer->ksnp_refcount),
+ peer->ksnp_sharecount, peer->ksnp_closing,
+ peer->ksnp_accepting, peer->ksnp_error,
+ peer->ksnp_zc_next_cookie,
+ !list_empty(&peer->ksnp_tx_queue),
+ !list_empty(&peer->ksnp_zc_req_list));
+
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry(tmp, ksock_route_t, ksnr_list);
+ CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+ "del %d\n", atomic_read(&route->ksnr_refcount),
+ route->ksnr_scheduled, route->ksnr_connecting,
+ route->ksnr_connected, route->ksnr_deleted);
+ }
+
+ list_for_each (tmp, &peer->ksnp_conns) {
+ conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+ CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+ atomic_read(&conn->ksnc_conn_refcount),
+ atomic_read(&conn->ksnc_sock_refcount),
+ conn->ksnc_type, conn->ksnc_closing);
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return;
+}
+
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+ ksock_net_t *net = ni->ni_data;
+ int i;
+ lnet_process_id_t anyid = {0};
+
+ anyid.nid = LNET_NID_ANY;
+ anyid.pid = LNET_PID_ANY;
+
+ LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+ LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+ spin_lock_bh(&net->ksnn_lock);
+ net->ksnn_shutdown = 1; /* prevent new peers */
+ spin_unlock_bh(&net->ksnn_lock);
+
+ /* Delete all peers */
+ ksocknal_del_peer(ni, anyid, 0);
+
+ /* Wait for all peer state to clean up */
+ i = 2;
+ spin_lock_bh(&net->ksnn_lock);
+ while (net->ksnn_npeers != 0) {
+ spin_unlock_bh(&net->ksnn_lock);
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+ "waiting for %d peers to disconnect\n",
+ net->ksnn_npeers);
+ cfs_pause(cfs_time_seconds(1));
+
+ ksocknal_debug_peerhash(ni);
+
+ spin_lock_bh(&net->ksnn_lock);
+ }
+ spin_unlock_bh(&net->ksnn_lock);
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+ LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+ }
+
+ list_del(&net->ksnn_list);
+ LIBCFS_FREE(net, sizeof(*net));
+
+ ksocknal_data.ksnd_nnets--;
+ if (ksocknal_data.ksnd_nnets == 0)
+ ksocknal_base_shutdown();
+}
+
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+ char **names;
+ int i;
+ int j;
+ int rc;
+ int n;
+
+ n = libcfs_ipif_enumerate(&names);
+ if (n <= 0) {
+ CERROR("Can't enumerate interfaces: %d\n", n);
+ return n;
+ }
+
+ for (i = j = 0; i < n; i++) {
+ int up;
+ __u32 ip;
+ __u32 mask;
+
+ if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+ continue;
+
+ rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+ if (rc != 0) {
+ CWARN("Can't get interface %s info: %d\n",
+ names[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Ignoring interface %s (down)\n",
+ names[i]);
+ continue;
+ }
+
+ if (j == LNET_MAX_INTERFACES) {
+ CWARN("Ignoring interface %s (too many interfaces)\n",
+ names[i]);
+ continue;
+ }
+
+ net->ksnn_interfaces[j].ksni_ipaddr = ip;
+ net->ksnn_interfaces[j].ksni_netmask = mask;
+ strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+ names[i], IFNAMSIZ);
+ j++;
+ }
+
+ libcfs_ipif_free_enumeration(names, n);
+
+ if (j == 0)
+ CERROR("Can't find any usable interfaces\n");
+
+ return j;
+}
+
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+ int new_ipif = 0;
+ int i;
+
+ for (i = 0; i < net->ksnn_ninterfaces; i++) {
+ char *ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+ char *colon = strchr(ifnam, ':');
+ int found = 0;
+ ksock_net_t *tmp;
+ int j;
+
+ if (colon != NULL) /* ignore alias device */
+ *colon = 0;
+
+ list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+ ksnn_list) {
+ for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+ char *ifnam2 = &tmp->ksnn_interfaces[j].\
+ ksni_name[0];
+ char *colon2 = strchr(ifnam2, ':');
+
+ if (colon2 != NULL)
+ *colon2 = 0;
+
+ found = strcmp(ifnam, ifnam2) == 0;
+ if (colon2 != NULL)
+ *colon2 = ':';
+ }
+ if (found)
+ break;
+ }
+
+ new_ipif += !found;
+ if (colon != NULL)
+ *colon = ':';
+ }
+
+ return new_ipif;
+}
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+ int nthrs;
+ int rc = 0;
+ int i;
+
+ if (info->ksi_nthreads == 0) {
+ if (*ksocknal_tunables.ksnd_nscheds > 0) {
+ nthrs = info->ksi_nthreads_max;
+ } else {
+ nthrs = cfs_cpt_weight(lnet_cpt_table(),
+ info->ksi_cpt);
+ nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+ nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+ }
+ nthrs = min(nthrs, info->ksi_nthreads_max);
+ } else {
+ LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+ /* increase two threads if there is new interface */
+ nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+ }
+
+ for (i = 0; i < nthrs; i++) {
+ long id;
+ char name[20];
+ ksock_sched_t *sched;
+ id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+ sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+ snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+ info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+ rc = ksocknal_thread_start(ksocknal_scheduler,
+ (void *)id, name);
+ if (rc == 0)
+ continue;
+
+ CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+ info->ksi_cpt, info->ksi_nthreads + i, rc);
+ break;
+ }
+
+ info->ksi_nthreads += i;
+ return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+ int newif = ksocknal_search_new_ipif(net);
+ int rc;
+ int i;
+
+ LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+ for (i = 0; i < ncpts; i++) {
+ struct ksock_sched_info *info;
+ int cpt = (cpts == NULL) ? i : cpts[i];
+
+ LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+ info = ksocknal_data.ksnd_sched_info[cpt];
+
+ if (!newif && info->ksi_nthreads > 0)
+ continue;
+
+ rc = ksocknal_start_schedulers(info);
+ if (rc != 0)
+ return rc;
+ }
+ return 0;
+}
+
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+ ksock_net_t *net;
+ int rc;
+ int i;
+
+ LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+ if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+ rc = ksocknal_base_startup();
+ if (rc != 0)
+ return rc;
+ }
+
+ LIBCFS_ALLOC(net, sizeof(*net));
+ if (net == NULL)
+ goto fail_0;
+
+ spin_lock_init(&net->ksnn_lock);
+ net->ksnn_incarnation = ksocknal_new_incarnation();
+ ni->ni_data = net;
+ ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout;
+ ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
+ ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits;
+ ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+ if (ni->ni_interfaces[0] == NULL) {
+ rc = ksocknal_enumerate_interfaces(net);
+ if (rc <= 0)
+ goto fail_1;
+
+ net->ksnn_ninterfaces = 1;
+ } else {
+ for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+ int up;
+
+ if (ni->ni_interfaces[i] == NULL)
+ break;
+
+ rc = libcfs_ipif_query(
+ ni->ni_interfaces[i], &up,
+ &net->ksnn_interfaces[i].ksni_ipaddr,
+ &net->ksnn_interfaces[i].ksni_netmask);
+
+ if (rc != 0) {
+ CERROR("Can't get interface %s info: %d\n",
+ ni->ni_interfaces[i], rc);
+ goto fail_1;
+ }
+
+ if (!up) {
+ CERROR("Interface %s is down\n",
+ ni->ni_interfaces[i]);
+ goto fail_1;
+ }
+
+ strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+ ni->ni_interfaces[i], IFNAMSIZ);
+ }
+ net->ksnn_ninterfaces = i;
+ }
+
+ /* call it before add it to ksocknal_data.ksnd_nets */
+ rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+ if (rc != 0)
+ goto fail_1;
+
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+ net->ksnn_interfaces[0].ksni_ipaddr);
+ list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+ ksocknal_data.ksnd_nnets++;
+
+ return 0;
+
+ fail_1:
+ LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+ if (ksocknal_data.ksnd_nnets == 0)
+ ksocknal_base_shutdown();
+
+ return -ENETDOWN;
+}
+
+
+void __exit
+ksocknal_module_fini (void)
+{
+ lnet_unregister_lnd(&the_ksocklnd);
+ ksocknal_tunables_fini();
+}
+
+int __init
+ksocknal_module_init (void)
+{
+ int rc;
+
+ /* check ksnr_connected/connecting field large enough */
+ CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+ CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+ /* initialize the_ksocklnd */
+ the_ksocklnd.lnd_type = SOCKLND;
+ the_ksocklnd.lnd_startup = ksocknal_startup;
+ the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+ the_ksocklnd.lnd_ctl = ksocknal_ctl;
+ the_ksocklnd.lnd_send = ksocknal_send;
+ the_ksocklnd.lnd_recv = ksocknal_recv;
+ the_ksocklnd.lnd_notify = ksocknal_notify;
+ the_ksocklnd.lnd_query = ksocknal_query;
+ the_ksocklnd.lnd_accept = ksocknal_accept;
+
+ rc = ksocknal_tunables_init();
+ if (rc != 0)
+ return rc;
+
+ lnet_register_lnd(&the_ksocklnd);
+
+ return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 000000000000..b483e0c3a69a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
+#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */
+#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK 0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK 1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct /* per scheduler state */
+{
+ spinlock_t kss_lock; /* serialise */
+ struct list_head kss_rx_conns; /* conn waiting to be read */
+ /* conn waiting to be written */
+ struct list_head kss_tx_conns;
+ /* zombie noop tx list */
+ struct list_head kss_zombie_noop_txs;
+ wait_queue_head_t kss_waitq; /* where scheduler sleeps */
+ /* # connections assigned to this scheduler */
+ int kss_nconns;
+ struct ksock_sched_info *kss_info; /* owner of it */
+ struct page *kss_rx_scratch_pgs[LNET_MAX_IOV];
+ struct iovec kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {
+ int ksi_nthreads_max; /* max allowed threads */
+ int ksi_nthreads; /* number of threads */
+ int ksi_cpt; /* CPT id */
+ ksock_sched_t *ksi_scheds; /* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT 16
+#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
+
+typedef struct /* in-use interface */
+{
+ __u32 ksni_ipaddr; /* interface's IP address */
+ __u32 ksni_netmask; /* interface's network mask */
+ int ksni_nroutes; /* # routes using (active) */
+ int ksni_npeers; /* # peers using (passive) */
+ char ksni_name[IFNAMSIZ]; /* interface name */
+} ksock_interface_t;
+
+typedef struct
+{
+ /* "stuck" socket timeout (seconds) */
+ int *ksnd_timeout;
+ /* # scheduler threads in each pool while starting */
+ int *ksnd_nscheds;
+ int *ksnd_nconnds; /* # connection daemons */
+ int *ksnd_nconnds_max; /* max # connection daemons */
+ int *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+ int *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+ int *ksnd_eager_ack; /* make TCP ack eagerly? */
+ int *ksnd_typed_conns; /* drive sockets by type? */
+ int *ksnd_min_bulk; /* smallest "large" message */
+ int *ksnd_tx_buffer_size; /* socket tx buffer size */
+ int *ksnd_rx_buffer_size; /* socket rx buffer size */
+ int *ksnd_nagle; /* enable NAGLE? */
+ int *ksnd_round_robin; /* round robin for multiple interfaces */
+ int *ksnd_keepalive; /* # secs for sending keepalive NOOP */
+ int *ksnd_keepalive_idle; /* # idle secs before 1st probe */
+ int *ksnd_keepalive_count; /* # probes */
+ int *ksnd_keepalive_intvl; /* time between probes */
+ int *ksnd_credits; /* # concurrent sends */
+ int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */
+ int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */
+ int *ksnd_peertimeout; /* seconds to consider peer dead */
+ int *ksnd_enable_csum; /* enable check sum */
+ int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+ int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */
+ unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */
+ int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */
+ int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+ ctl_table_header_t *ksnd_sysctl; /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct
+{
+ __u64 ksnn_incarnation; /* my epoch */
+ spinlock_t ksnn_lock; /* serialise */
+ struct list_head ksnn_list; /* chain on global list */
+ int ksnn_npeers; /* # peers */
+ int ksnn_shutdown; /* shutting down? */
+ int ksnn_ninterfaces; /* IP interfaces */
+ ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT 120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV 1
+
+typedef struct
+{
+ int ksnd_init; /* initialisation state */
+ int ksnd_nnets; /* # networks set up */
+ struct list_head ksnd_nets; /* list of nets */
+ /* stabilize peer/conn ops */
+ rwlock_t ksnd_global_lock;
+ /* hash table of all my known peers */
+ struct list_head *ksnd_peers;
+ int ksnd_peer_hash_size; /* size of ksnd_peers */
+
+ int ksnd_nthreads; /* # live threads */
+ int ksnd_shuttingdown; /* tell threads to exit */
+ /* schedulers information */
+ struct ksock_sched_info **ksnd_sched_info;
+
+ atomic_t ksnd_nactive_txs; /* #active txs */
+
+ struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+ struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */
+ struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/
+ wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */
+ cfs_time_t ksnd_reaper_waketime;/* when reaper will wake */
+ spinlock_t ksnd_reaper_lock; /* serialise */
+
+ int ksnd_enomem_tx; /* test ENOMEM sender */
+ int ksnd_stall_tx; /* test sluggish sender */
+ int ksnd_stall_rx; /* test sluggish receiver */
+
+ struct list_head ksnd_connd_connreqs; /* incoming connection requests */
+ struct list_head ksnd_connd_routes; /* routes waiting to be connected */
+ wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */
+ int ksnd_connd_connecting;/* # connds connecting */
+ /** time stamp of the last failed connecting attempt */
+ long ksnd_connd_failed_stamp;
+ /** # starting connd */
+ unsigned ksnd_connd_starting;
+ /** time stamp of the last starting connd */
+ long ksnd_connd_starting_stamp;
+ /** # running connd */
+ unsigned ksnd_connd_running;
+ spinlock_t ksnd_connd_lock; /* serialise */
+
+ struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */
+ spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING 0
+#define SOCKNAL_INIT_DATA 1
+#define SOCKNAL_INIT_ALL 2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header). Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn; /* forward ref */
+struct ksock_peer; /* forward ref */
+struct ksock_route; /* forward ref */
+struct ksock_proto; /* forward ref */
+
+typedef struct /* transmit packet */
+{
+ struct list_head tx_list; /* queue on conn for transmission etc */
+ struct list_head tx_zc_list; /* queue on peer for ZC request */
+ atomic_t tx_refcount; /* tx reference count */
+ int tx_nob; /* # packet bytes */
+ int tx_resid; /* residual bytes */
+ int tx_niov; /* # packet iovec frags */
+ struct iovec *tx_iov; /* packet iovec frags */
+ int tx_nkiov; /* # packet page frags */
+ unsigned short tx_zc_aborted; /* aborted ZC request */
+ unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+ unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+ unsigned short tx_nonblk:1; /* it's a non-blocking ACK */
+ lnet_kiov_t *tx_kiov; /* packet page frags */
+ struct ksock_conn *tx_conn; /* owning conn */
+ lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */
+ cfs_time_t tx_deadline; /* when (in jiffies) tx times out */
+ ksock_msg_t tx_msg; /* socklnd message buffer */
+ int tx_desc_size; /* size of this descriptor */
+ union {
+ struct {
+ struct iovec iov; /* virt hdr */
+ lnet_kiov_t kiov[0]; /* paged payload */
+ } paged;
+ struct {
+ struct iovec iov[1]; /* virt hdr + payload */
+ } virt;
+ } tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+ struct iovec iov[LNET_MAX_IOV];
+ lnet_kiov_t kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */
+#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP 6 /* skipping body */
+
+typedef struct ksock_conn
+{
+ struct ksock_peer *ksnc_peer; /* owning peer */
+ struct ksock_route *ksnc_route; /* owning route */
+ struct list_head ksnc_list; /* stash on peer's conn list */
+ socket_t *ksnc_sock; /* actual socket */
+ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+ void *ksnc_saved_write_space; /* socket's original write_space() callback */
+ atomic_t ksnc_conn_refcount; /* conn refcount */
+ atomic_t ksnc_sock_refcount; /* sock refcount */
+ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */
+ __u32 ksnc_myipaddr; /* my IP */
+ __u32 ksnc_ipaddr; /* peer's IP */
+ int ksnc_port; /* peer's port */
+ signed int ksnc_type:3; /* type of connection,
+ * should be signed value */
+ unsigned int ksnc_closing:1; /* being shut down */
+ unsigned int ksnc_flip:1; /* flip or not, only for V2.x */
+ unsigned int ksnc_zc_capable:1; /* enable to ZC */
+ struct ksock_proto *ksnc_proto; /* protocol for the connection */
+
+ /* reader */
+ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */
+ cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */
+ __u8 ksnc_rx_started; /* started receiving a message */
+ __u8 ksnc_rx_ready; /* data ready to read */
+ __u8 ksnc_rx_scheduled;/* being progressed */
+ __u8 ksnc_rx_state; /* what is being read */
+ int ksnc_rx_nob_left; /* # bytes to next hdr/body */
+ int ksnc_rx_nob_wanted; /* bytes actually wanted */
+ int ksnc_rx_niov; /* # iovec frags */
+ struct iovec *ksnc_rx_iov; /* the iovec frags */
+ int ksnc_rx_nkiov; /* # page frags */
+ lnet_kiov_t *ksnc_rx_kiov; /* the page frags */
+ ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */
+ __u32 ksnc_rx_csum; /* partial checksum for incoming data */
+ void *ksnc_cookie; /* rx lnet_finalize passthru arg */
+ ksock_msg_t ksnc_msg; /* incoming message buffer:
+ * V2.x message takes the
+ * whole struct
+ * V1.x message is a bare
+ * lnet_hdr_t, it's stored in
+ * ksnc_msg.ksm_u.lnetmsg */
+
+ /* WRITER */
+ struct list_head ksnc_tx_list; /* where I enq waiting for output space */
+ struct list_head ksnc_tx_queue; /* packets waiting to be sent */
+ ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */
+ cfs_time_t ksnc_tx_deadline; /* when (in jiffies) tx times out */
+ int ksnc_tx_bufnob; /* send buffer marker */
+ atomic_t ksnc_tx_nob; /* # bytes queued */
+ int ksnc_tx_ready; /* write space */
+ int ksnc_tx_scheduled; /* being progressed */
+ cfs_time_t ksnc_tx_last_post; /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route
+{
+ struct list_head ksnr_list; /* chain on peer route list */
+ struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */
+ struct ksock_peer *ksnr_peer; /* owning peer */
+ atomic_t ksnr_refcount; /* # users */
+ cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */
+ cfs_duration_t ksnr_retry_interval; /* how long between retries */
+ __u32 ksnr_myipaddr; /* my IP */
+ __u32 ksnr_ipaddr; /* IP address to connect to */
+ int ksnr_port; /* port to connect to */
+ unsigned int ksnr_scheduled:1; /* scheduled for attention */
+ unsigned int ksnr_connecting:1;/* connection establishment in progress */
+ unsigned int ksnr_connected:4; /* connections established by type */
+ unsigned int ksnr_deleted:1; /* been removed from peer? */
+ unsigned int ksnr_share_count; /* created explicitly? */
+ int ksnr_conn_count; /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */
+
+typedef struct ksock_peer
+{
+ struct list_head ksnp_list; /* stash on global peer list */
+ cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */
+ lnet_process_id_t ksnp_id; /* who's on the other end(s) */
+ atomic_t ksnp_refcount; /* # users */
+ int ksnp_sharecount; /* lconf usage counter */
+ int ksnp_closing; /* being closed */
+ int ksnp_accepting;/* # passive connections pending */
+ int ksnp_error; /* errno on closing last conn */
+ __u64 ksnp_zc_next_cookie;/* ZC completion cookie */
+ __u64 ksnp_incarnation; /* latest known peer incarnation */
+ struct ksock_proto *ksnp_proto; /* latest known peer protocol */
+ struct list_head ksnp_conns; /* all active connections */
+ struct list_head ksnp_routes; /* routes */
+ struct list_head ksnp_tx_queue; /* waiting packets */
+ spinlock_t ksnp_lock; /* serialize, g_lock unsafe */
+ struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */
+ cfs_time_t ksnp_send_keepalive; /* time to send keepalive */
+ lnet_ni_t *ksnp_ni; /* which network */
+ int ksnp_n_passive_ips; /* # of... */
+ __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq
+{
+ struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */
+ lnet_ni_t *ksncr_ni; /* chosen NI */
+ socket_t *ksncr_sock; /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */
+
+typedef struct ksock_proto
+{
+ int pro_version; /* version number of protocol */
+ int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */
+ int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+ void (*pro_pack)(ksock_tx_t *); /* message pack */
+ void (*pro_unpack)(ksock_msg_t *); /* message unpack */
+ ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */
+ int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+ int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */
+ int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */
+ int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type:
+ * return value:
+ * return MATCH_NO : no
+ * return MATCH_YES : matching type
+ * return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE 0UL
+#endif
+
+static inline int
+ksocknal_route_mask(void)
+{
+ if (!*ksocknal_tunables.ksnd_typed_conns)
+ return (1 << SOCKLND_CONN_ANY);
+
+ return ((1 << SOCKLND_CONN_CONTROL) |
+ (1 << SOCKLND_CONN_BULK_IN) |
+ (1 << SOCKLND_CONN_BULK_OUT));
+}
+
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+ unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+ return (&ksocknal_data.ksnd_peers [hash]);
+}
+
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+ LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+ atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+ LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+ if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+ ksocknal_queue_zombie_conn(conn);
+}
+
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+ int rc = -ESHUTDOWN;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ if (!conn->ksnc_closing) {
+ LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+ atomic_inc(&conn->ksnc_sock_refcount);
+ rc = 0;
+ }
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ return (rc);
+}
+
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+ LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+ if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+ LASSERT (conn->ksnc_closing);
+ libcfs_sock_release(conn->ksnc_sock);
+ conn->ksnc_sock = NULL;
+ ksocknal_finalize_zcreq(conn);
+ }
+}
+
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+ LASSERT (atomic_read(&tx->tx_refcount) > 0);
+ atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
+{
+ LASSERT (atomic_read(&tx->tx_refcount) > 0);
+ if (atomic_dec_and_test(&tx->tx_refcount))
+ ksocknal_tx_done(NULL, tx);
+}
+
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+ LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+ atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+ LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+ if (atomic_dec_and_test(&route->ksnr_refcount))
+ ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+ LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+ atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+ LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+ if (atomic_dec_and_test(&peer->ksnp_refcount))
+ ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+ socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+ __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+ ksock_tx_t *tx, int nonblk);
+
+extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+ lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+ int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ ksock_hello_msg_t *hello, lnet_process_id_t *id,
+ __u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+ int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644
index 000000000000..ad5e24104238
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+ ksock_tx_t *tx = NULL;
+
+ if (type == KSOCK_MSG_NOOP) {
+ LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+ /* searching for a noop tx in free list */
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+ tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \
+ next, ksock_tx_t, tx_list);
+ LASSERT(tx->tx_desc_size == size);
+ list_del(&tx->tx_list);
+ }
+
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ }
+
+ if (tx == NULL)
+ LIBCFS_ALLOC(tx, size);
+
+ if (tx == NULL)
+ return NULL;
+
+ atomic_set(&tx->tx_refcount, 1);
+ tx->tx_zc_aborted = 0;
+ tx->tx_zc_capable = 0;
+ tx->tx_zc_checked = 0;
+ tx->tx_desc_size = size;
+
+ atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+ return tx;
+}
+
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+ ksock_tx_t *tx;
+
+ tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+ if (tx == NULL) {
+ CERROR("Can't allocate noop tx desc\n");
+ return NULL;
+ }
+
+ tx->tx_conn = NULL;
+ tx->tx_lnetmsg = NULL;
+ tx->tx_kiov = NULL;
+ tx->tx_nkiov = 0;
+ tx->tx_iov = tx->tx_frags.virt.iov;
+ tx->tx_niov = 1;
+ tx->tx_nonblk = nonblk;
+
+ socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+
+ return tx;
+}
+
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+ atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+ if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+ /* it's a noop tx */
+ spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+ list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+ spin_unlock(&ksocknal_data.ksnd_tx_lock);
+ } else {
+ LIBCFS_FREE(tx, tx->tx_desc_size);
+ }
+}
+
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct iovec *iov = tx->tx_iov;
+ int nob;
+ int rc;
+
+ LASSERT (tx->tx_niov > 0);
+
+ /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+ rc = ksocknal_lib_send_iov(conn, tx);
+
+ if (rc <= 0) /* sent nothing? */
+ return (rc);
+
+ nob = rc;
+ LASSERT (nob <= tx->tx_resid);
+ tx->tx_resid -= nob;
+
+ /* "consume" iov */
+ do {
+ LASSERT (tx->tx_niov > 0);
+
+ if (nob < (int) iov->iov_len) {
+ iov->iov_base = (void *)((char *)iov->iov_base + nob);
+ iov->iov_len -= nob;
+ return (rc);
+ }
+
+ nob -= iov->iov_len;
+ tx->tx_iov = ++iov;
+ tx->tx_niov--;
+ } while (nob != 0);
+
+ return (rc);
+}
+
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ lnet_kiov_t *kiov = tx->tx_kiov;
+ int nob;
+ int rc;
+
+ LASSERT (tx->tx_niov == 0);
+ LASSERT (tx->tx_nkiov > 0);
+
+ /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+ rc = ksocknal_lib_send_kiov(conn, tx);
+
+ if (rc <= 0) /* sent nothing? */
+ return (rc);
+
+ nob = rc;
+ LASSERT (nob <= tx->tx_resid);
+ tx->tx_resid -= nob;
+
+ /* "consume" kiov */
+ do {
+ LASSERT(tx->tx_nkiov > 0);
+
+ if (nob < (int)kiov->kiov_len) {
+ kiov->kiov_offset += nob;
+ kiov->kiov_len -= nob;
+ return rc;
+ }
+
+ nob -= (int)kiov->kiov_len;
+ tx->tx_kiov = ++kiov;
+ tx->tx_nkiov--;
+ } while (nob != 0);
+
+ return (rc);
+}
+
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ int rc;
+ int bufnob;
+
+ if (ksocknal_data.ksnd_stall_tx != 0) {
+ cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+ }
+
+ LASSERT (tx->tx_resid != 0);
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ return (-ESHUTDOWN);
+ }
+
+ do {
+ if (ksocknal_data.ksnd_enomem_tx > 0) {
+ /* testing... */
+ ksocknal_data.ksnd_enomem_tx--;
+ rc = -EAGAIN;
+ } else if (tx->tx_niov != 0) {
+ rc = ksocknal_send_iov (conn, tx);
+ } else {
+ rc = ksocknal_send_kiov (conn, tx);
+ }
+
+ bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+ if (rc > 0) /* sent something? */
+ conn->ksnc_tx_bufnob += rc; /* account it */
+
+ if (bufnob < conn->ksnc_tx_bufnob) {
+ /* allocated send buffer bytes < computed; infer
+ * something got ACKed */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_tx_bufnob = bufnob;
+ mb();
+ }
+
+ if (rc <= 0) { /* Didn't write anything? */
+
+ if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+ rc = -EAGAIN;
+
+ /* Check if EAGAIN is due to memory pressure */
+ if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+ rc = -ENOMEM;
+
+ break;
+ }
+
+ /* socket's wmem_queued now includes 'rc' bytes */
+ atomic_sub (rc, &conn->ksnc_tx_nob);
+ rc = 0;
+
+ } while (tx->tx_resid != 0);
+
+ ksocknal_connsock_decref(conn);
+ return (rc);
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+ struct iovec *iov = conn->ksnc_rx_iov;
+ int nob;
+ int rc;
+
+ LASSERT (conn->ksnc_rx_niov > 0);
+
+ /* Never touch conn->ksnc_rx_iov or change connection
+ * status inside ksocknal_lib_recv_iov */
+ rc = ksocknal_lib_recv_iov(conn);
+
+ if (rc <= 0)
+ return (rc);
+
+ /* received something... */
+ nob = rc;
+
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_rx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
+ conn->ksnc_rx_nob_wanted -= nob;
+ conn->ksnc_rx_nob_left -= nob;
+
+ do {
+ LASSERT (conn->ksnc_rx_niov > 0);
+
+ if (nob < (int)iov->iov_len) {
+ iov->iov_len -= nob;
+ iov->iov_base = (void *)((char *)iov->iov_base + nob);
+ return (-EAGAIN);
+ }
+
+ nob -= iov->iov_len;
+ conn->ksnc_rx_iov = ++iov;
+ conn->ksnc_rx_niov--;
+ } while (nob != 0);
+
+ return (rc);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+ int nob;
+ int rc;
+ LASSERT (conn->ksnc_rx_nkiov > 0);
+
+ /* Never touch conn->ksnc_rx_kiov or change connection
+ * status inside ksocknal_lib_recv_iov */
+ rc = ksocknal_lib_recv_kiov(conn);
+
+ if (rc <= 0)
+ return (rc);
+
+ /* received something... */
+ nob = rc;
+
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_rx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
+ conn->ksnc_rx_nob_wanted -= nob;
+ conn->ksnc_rx_nob_left -= nob;
+
+ do {
+ LASSERT (conn->ksnc_rx_nkiov > 0);
+
+ if (nob < (int) kiov->kiov_len) {
+ kiov->kiov_offset += nob;
+ kiov->kiov_len -= nob;
+ return -EAGAIN;
+ }
+
+ nob -= kiov->kiov_len;
+ conn->ksnc_rx_kiov = ++kiov;
+ conn->ksnc_rx_nkiov--;
+ } while (nob != 0);
+
+ return 1;
+}
+
+int
+ksocknal_receive (ksock_conn_t *conn)
+{
+ /* Return 1 on success, 0 on EOF, < 0 on error.
+ * Caller checks ksnc_rx_nob_wanted to determine
+ * progress/completion. */
+ int rc;
+ ENTRY;
+
+ if (ksocknal_data.ksnd_stall_rx != 0) {
+ cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+ }
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ return (-ESHUTDOWN);
+ }
+
+ for (;;) {
+ if (conn->ksnc_rx_niov != 0)
+ rc = ksocknal_recv_iov (conn);
+ else
+ rc = ksocknal_recv_kiov (conn);
+
+ if (rc <= 0) {
+ /* error/EOF or partial receive */
+ if (rc == -EAGAIN) {
+ rc = 1;
+ } else if (rc == 0 && conn->ksnc_rx_started) {
+ /* EOF in the middle of a message */
+ rc = -EPROTO;
+ }
+ break;
+ }
+
+ /* Completed a fragment */
+
+ if (conn->ksnc_rx_nob_wanted == 0) {
+ rc = 1;
+ break;
+ }
+ }
+
+ ksocknal_connsock_decref(conn);
+ RETURN (rc);
+}
+
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+ lnet_msg_t *lnetmsg = tx->tx_lnetmsg;
+ int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+ ENTRY;
+
+ LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+ if (tx->tx_conn != NULL)
+ ksocknal_conn_decref(tx->tx_conn);
+
+ if (ni == NULL && tx->tx_conn != NULL)
+ ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+ ksocknal_free_tx (tx);
+ if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+ lnet_finalize (ni, lnetmsg, rc);
+
+ EXIT;
+}
+
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+ ksock_tx_t *tx;
+
+ while (!list_empty (txlist)) {
+ tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+ if (error && tx->tx_lnetmsg != NULL) {
+ CNETERR("Deleting packet type %d len %d %s->%s\n",
+ le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+ le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+ libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+ libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+ } else if (error) {
+ CNETERR("Deleting noop packet\n");
+ }
+
+ list_del (&tx->tx_list);
+
+ LASSERT (atomic_read(&tx->tx_refcount) == 1);
+ ksocknal_tx_done (ni, tx);
+ }
+}
+
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+ ksock_conn_t *conn = tx->tx_conn;
+ ksock_peer_t *peer = conn->ksnc_peer;
+
+ /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+ * to ksnp_zc_req_list if some fragment of this message should be sent
+ * zero-copy. Our peer will send an ACK containing this cookie when
+ * she has received this message to tell us we can signal completion.
+ * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+ * ksnp_zc_req_list. */
+ LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT (tx->tx_zc_capable);
+
+ tx->tx_zc_checked = 1;
+
+ if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+ !conn->ksnc_zc_capable)
+ return;
+
+ /* assign cookie and queue tx to pending list, it will be released when
+ * a matching ack is received. See ksocknal_handle_zcack() */
+
+ ksocknal_tx_addref(tx);
+
+ spin_lock(&peer->ksnp_lock);
+
+ /* ZC_REQ is going to be pinned to the peer */
+ tx->tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+ LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+ tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+ if (peer->ksnp_zc_next_cookie == 0)
+ peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+ list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+ spin_unlock(&peer->ksnp_lock);
+}
+
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+ ksock_peer_t *peer = tx->tx_conn->ksnc_peer;
+
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT(tx->tx_zc_capable);
+
+ tx->tx_zc_checked = 0;
+
+ spin_lock(&peer->ksnp_lock);
+
+ if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+ /* Not waiting for an ACK */
+ spin_unlock(&peer->ksnp_lock);
+ return;
+ }
+
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ list_del(&tx->tx_zc_list);
+
+ spin_unlock(&peer->ksnp_lock);
+
+ ksocknal_tx_decref(tx);
+}
+
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ int rc;
+
+ if (tx->tx_zc_capable && !tx->tx_zc_checked)
+ ksocknal_check_zc_req(tx);
+
+ rc = ksocknal_transmit (conn, tx);
+
+ CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+ if (tx->tx_resid == 0) {
+ /* Sent everything OK */
+ LASSERT (rc == 0);
+
+ return (0);
+ }
+
+ if (rc == -EAGAIN)
+ return (rc);
+
+ if (rc == -ENOMEM) {
+ static int counter;
+
+ counter++; /* exponential backoff warnings */
+ if ((counter & (-counter)) == counter)
+ CWARN("%u ENOMEM tx %p (%u allocated)\n",
+ counter, conn, atomic_read(&libcfs_kmemory));
+
+ /* Queue on ksnd_enomem_conns for retry after a timeout */
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ /* enomem list takes over scheduler's ref... */
+ LASSERT (conn->ksnc_tx_scheduled);
+ list_add_tail(&conn->ksnc_tx_list,
+ &ksocknal_data.ksnd_enomem_conns);
+ if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+ SOCKNAL_ENOMEM_RETRY),
+ ksocknal_data.ksnd_reaper_waketime))
+ wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+ return (rc);
+ }
+
+ /* Actual error */
+ LASSERT (rc < 0);
+
+ if (!conn->ksnc_closing) {
+ switch (rc) {
+ case -ECONNRESET:
+ LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+ "while we were sending data; it may have "
+ "rebooted.\n",
+ HIPQUAD(conn->ksnc_ipaddr));
+ break;
+ default:
+ LCONSOLE_WARN("There was an unexpected network error "
+ "while writing to %u.%u.%u.%u: %d.\n",
+ HIPQUAD(conn->ksnc_ipaddr), rc);
+ break;
+ }
+ CDEBUG(D_NET, "[%p] Error %d on write to %s"
+ " ip %d.%d.%d.%d:%d\n", conn, rc,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ }
+
+ if (tx->tx_zc_checked)
+ ksocknal_uncheck_zc_req(tx);
+
+ /* it's not an error if conn is being closed */
+ ksocknal_close_conn_and_siblings (conn,
+ (conn->ksnc_closing) ? 0 : rc);
+
+ return (rc);
+}
+
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+ /* called holding write lock on ksnd_global_lock */
+
+ LASSERT (!route->ksnr_scheduled);
+ LASSERT (!route->ksnr_connecting);
+ LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+ route->ksnr_scheduled = 1; /* scheduling conn for connd */
+ ksocknal_route_addref(route); /* extra ref for connd */
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ list_add_tail(&route->ksnr_connd_list,
+ &ksocknal_data.ksnd_connd_routes);
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+ ksock_route_t *route;
+
+ /* called holding write lock on ksnd_global_lock */
+ for (;;) {
+ /* launch any/all connections that need it */
+ route = ksocknal_find_connectable_route_locked(peer);
+ if (route == NULL)
+ return;
+
+ ksocknal_launch_connection_locked(route);
+ }
+}
+
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+ struct list_head *tmp;
+ ksock_conn_t *conn;
+ ksock_conn_t *typed = NULL;
+ ksock_conn_t *fallback = NULL;
+ int tnob = 0;
+ int fnob = 0;
+
+ list_for_each (tmp, &peer->ksnp_conns) {
+ ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list);
+ int nob = atomic_read(&c->ksnc_tx_nob) +
+ cfs_sock_wmem_queued(c->ksnc_sock);
+ int rc;
+
+ LASSERT (!c->ksnc_closing);
+ LASSERT (c->ksnc_proto != NULL &&
+ c->ksnc_proto->pro_match_tx != NULL);
+
+ rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+ switch (rc) {
+ default:
+ LBUG();
+ case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+ continue;
+
+ case SOCKNAL_MATCH_YES: /* typed connection */
+ if (typed == NULL || tnob > nob ||
+ (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+ cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+ typed = c;
+ tnob = nob;
+ }
+ break;
+
+ case SOCKNAL_MATCH_MAY: /* fallback connection */
+ if (fallback == NULL || fnob > nob ||
+ (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+ cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+ fallback = c;
+ fnob = nob;
+ }
+ break;
+ }
+ }
+
+ /* prefer the typed selection */
+ conn = (typed != NULL) ? typed : fallback;
+
+ if (conn != NULL)
+ conn->ksnc_tx_last_post = cfs_time_current();
+
+ return conn;
+}
+
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ conn->ksnc_proto->pro_pack(tx);
+
+ atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+ ksocknal_conn_addref(conn); /* +1 ref for tx */
+ tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+ ksock_msg_t *msg = &tx->tx_msg;
+ ksock_tx_t *ztx = NULL;
+ int bufnob = 0;
+
+ /* called holding global lock (read or irq-write) and caller may
+ * not have dropped this lock between finding conn and calling me,
+ * so we don't need the {get,put}connsock dance to deref
+ * ksnc_sock... */
+ LASSERT(!conn->ksnc_closing);
+
+ CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+
+ ksocknal_tx_prep(conn, tx);
+
+ /* Ensure the frags we've been given EXACTLY match the number of
+ * bytes we want to send. Many TCP/IP stacks disregard any total
+ * size parameters passed to them and just look at the frags.
+ *
+ * We always expect at least 1 mapped fragment containing the
+ * complete ksocknal message header. */
+ LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+ lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+ (unsigned int)tx->tx_nob);
+ LASSERT (tx->tx_niov >= 1);
+ LASSERT (tx->tx_resid == tx->tx_nob);
+
+ CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+ tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+ KSOCK_MSG_NOOP,
+ tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+ /*
+ * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+ * but they're used inside spinlocks a lot.
+ */
+ bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+ spin_lock_bh(&sched->kss_lock);
+
+ if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+ /* First packet starts the timeout */
+ conn->ksnc_tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+ if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+ conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+ conn->ksnc_tx_bufnob = 0;
+ mb(); /* order with adding to tx_queue */
+ }
+
+ if (msg->ksm_type == KSOCK_MSG_NOOP) {
+ /* The packet is noop ZC ACK, try to piggyback the ack_cookie
+ * on a normal packet so I don't need to send it */
+ LASSERT (msg->ksm_zc_cookies[1] != 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+ if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+ ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+ } else {
+ /* It's a normal packet - can it piggback a noop zc-ack that
+ * has been queued already? */
+ LASSERT (msg->ksm_zc_cookies[1] == 0);
+ LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+ ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+ /* ztx will be released later */
+ }
+
+ if (ztx != NULL) {
+ atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+ list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+ }
+
+ if (conn->ksnc_tx_ready && /* able to send */
+ !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+ /* +1 ref for scheduler */
+ ksocknal_conn_addref(conn);
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ wake_up (&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+}
+
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+ cfs_time_t now = cfs_time_current();
+ struct list_head *tmp;
+ ksock_route_t *route;
+
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+ LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+ if (route->ksnr_scheduled) /* connections being established */
+ continue;
+
+ /* all route types connected ? */
+ if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+ continue;
+
+ if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+ cfs_time_aftereq(now, route->ksnr_timeout))) {
+ CDEBUG(D_NET,
+ "Too soon to retry route %u.%u.%u.%u "
+ "(cnted %d, interval %ld, %ld secs later)\n",
+ HIPQUAD(route->ksnr_ipaddr),
+ route->ksnr_connected,
+ route->ksnr_retry_interval,
+ cfs_duration_sec(route->ksnr_timeout - now));
+ continue;
+ }
+
+ return (route);
+ }
+
+ return (NULL);
+}
+
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+ struct list_head *tmp;
+ ksock_route_t *route;
+
+ list_for_each (tmp, &peer->ksnp_routes) {
+ route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+ LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+ if (route->ksnr_scheduled)
+ return (route);
+ }
+
+ return (NULL);
+}
+
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+ ksock_peer_t *peer;
+ ksock_conn_t *conn;
+ rwlock_t *g_lock;
+ int retry;
+ int rc;
+
+ LASSERT (tx->tx_conn == NULL);
+
+ g_lock = &ksocknal_data.ksnd_global_lock;
+
+ for (retry = 0;; retry = 1) {
+ read_lock(g_lock);
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL) {
+ if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+ conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+ if (conn != NULL) {
+ /* I've got no routes that need to be
+ * connecting and I do have an actual
+ * connection... */
+ ksocknal_queue_tx_locked (tx, conn);
+ read_unlock(g_lock);
+ return (0);
+ }
+ }
+ }
+
+ /* I'll need a write lock... */
+ read_unlock(g_lock);
+
+ write_lock_bh(g_lock);
+
+ peer = ksocknal_find_peer_locked(ni, id);
+ if (peer != NULL)
+ break;
+
+ write_unlock_bh(g_lock);
+
+ if ((id.pid & LNET_PID_USERFLAG) != 0) {
+ CERROR("Refusing to create a connection to "
+ "userspace process %s\n", libcfs_id2str(id));
+ return -EHOSTUNREACH;
+ }
+
+ if (retry) {
+ CERROR("Can't find peer %s\n", libcfs_id2str(id));
+ return -EHOSTUNREACH;
+ }
+
+ rc = ksocknal_add_peer(ni, id,
+ LNET_NIDADDR(id.nid),
+ lnet_acceptor_port());
+ if (rc != 0) {
+ CERROR("Can't add peer %s: %d\n",
+ libcfs_id2str(id), rc);
+ return rc;
+ }
+ }
+
+ ksocknal_launch_all_connections_locked(peer);
+
+ conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ ksocknal_queue_tx_locked (tx, conn);
+ write_unlock_bh(g_lock);
+ return (0);
+ }
+
+ if (peer->ksnp_accepting > 0 ||
+ ksocknal_find_connecting_route_locked (peer) != NULL) {
+ /* the message is going to be pinned to the peer */
+ tx->tx_deadline =
+ cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+ /* Queue the message until a connection is established */
+ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+ write_unlock_bh(g_lock);
+ return 0;
+ }
+
+ write_unlock_bh(g_lock);
+
+ /* NB Routes may be ignored if connections to them failed recently */
+ CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+ return (-EHOSTUNREACH);
+}
+
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ int mpflag = 0;
+ int type = lntmsg->msg_type;
+ lnet_process_id_t target = lntmsg->msg_target;
+ unsigned int payload_niov = lntmsg->msg_niov;
+ struct iovec *payload_iov = lntmsg->msg_iov;
+ lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
+ unsigned int payload_offset = lntmsg->msg_offset;
+ unsigned int payload_nob = lntmsg->msg_len;
+ ksock_tx_t *tx;
+ int desc_size;
+ int rc;
+
+ /* NB 'private' is different depending on what we're sending.
+ * Just ignore it... */
+
+ CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
+
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+ LASSERT (!in_interrupt ());
+
+ if (payload_iov != NULL)
+ desc_size = offsetof(ksock_tx_t,
+ tx_frags.virt.iov[1 + payload_niov]);
+ else
+ desc_size = offsetof(ksock_tx_t,
+ tx_frags.paged.kiov[payload_niov]);
+
+ if (lntmsg->msg_vmflush)
+ mpflag = cfs_memory_pressure_get_and_set();
+ tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+ if (tx == NULL) {
+ CERROR("Can't allocate tx desc type %d size %d\n",
+ type, desc_size);
+ if (lntmsg->msg_vmflush)
+ cfs_memory_pressure_restore(mpflag);
+ return (-ENOMEM);
+ }
+
+ tx->tx_conn = NULL; /* set when assigned a conn */
+ tx->tx_lnetmsg = lntmsg;
+
+ if (payload_iov != NULL) {
+ tx->tx_kiov = NULL;
+ tx->tx_nkiov = 0;
+ tx->tx_iov = tx->tx_frags.virt.iov;
+ tx->tx_niov = 1 +
+ lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ } else {
+ tx->tx_niov = 1;
+ tx->tx_iov = &tx->tx_frags.paged.iov;
+ tx->tx_kiov = tx->tx_frags.paged.kiov;
+ tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+
+ if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+ tx->tx_zc_capable = 1;
+ }
+
+ socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+ /* The first fragment will be set later in pro_pack */
+ rc = ksocknal_launch_packet(ni, tx, target);
+ if (lntmsg->msg_vmflush)
+ cfs_memory_pressure_restore(mpflag);
+ if (rc == 0)
+ return (0);
+
+ ksocknal_free_tx(tx);
+ return (-EIO);
+}
+
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+ task_t *task = kthread_run(fn, arg, name);
+
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ ksocknal_data.ksnd_nthreads++;
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+ return 0;
+}
+
+void
+ksocknal_thread_fini (void)
+{
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ ksocknal_data.ksnd_nthreads--;
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+ static char ksocknal_slop_buffer[4096];
+
+ int nob;
+ unsigned int niov;
+ int skipped;
+
+ LASSERT(conn->ksnc_proto != NULL);
+
+ if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+ /* Remind the socket to ack eagerly... */
+ ksocknal_lib_eager_ack(conn);
+ }
+
+ if (nob_to_skip == 0) { /* right at next packet boundary now */
+ conn->ksnc_rx_started = 0;
+ mb(); /* racing with timeout thread */
+
+ switch (conn->ksnc_proto->pro_version) {
+ case KSOCK_PROTO_V2:
+ case KSOCK_PROTO_V3:
+ conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+ conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+ conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+ conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+ conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u);
+ break;
+
+ case KSOCK_PROTO_V1:
+ /* Receiving bare lnet_hdr_t */
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+ conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+ conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+ conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+ conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t);
+ break;
+
+ default:
+ LBUG ();
+ }
+ conn->ksnc_rx_niov = 1;
+
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_csum = ~0;
+ return (1);
+ }
+
+ /* Set up to skip as much as possible now. If there's more left
+ * (ran out of iov entries) we'll get called again */
+
+ conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+ conn->ksnc_rx_nob_left = nob_to_skip;
+ conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+ skipped = 0;
+ niov = 0;
+
+ do {
+ nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+ conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+ conn->ksnc_rx_iov[niov].iov_len = nob;
+ niov++;
+ skipped += nob;
+ nob_to_skip -=nob;
+
+ } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */
+ niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+ conn->ksnc_rx_niov = niov;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_nob_wanted = skipped;
+ return (0);
+}
+
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+ lnet_hdr_t *lhdr;
+ lnet_process_id_t *id;
+ int rc;
+
+ LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+ /* NB: sched lock NOT held */
+ /* SOCKNAL_RX_LNET_HEADER is here for backward compatability */
+ LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+ conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+ if (conn->ksnc_rx_nob_wanted != 0) {
+ rc = ksocknal_receive(conn);
+
+ if (rc <= 0) {
+ LASSERT (rc != -EAGAIN);
+
+ if (rc == 0)
+ CDEBUG (D_NET, "[%p] EOF from %s"
+ " ip %d.%d.%d.%d:%d\n", conn,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ else if (!conn->ksnc_closing)
+ CERROR ("[%p] Error %d on read from %s"
+ " ip %d.%d.%d.%d:%d\n",
+ conn, rc,
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+
+ /* it's not an error if conn is being closed */
+ ksocknal_close_conn_and_siblings (conn,
+ (conn->ksnc_closing) ? 0 : rc);
+ return (rc == 0 ? -ESHUTDOWN : rc);
+ }
+
+ if (conn->ksnc_rx_nob_wanted != 0) {
+ /* short read */
+ return (-EAGAIN);
+ }
+ }
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_KSM_HEADER:
+ if (conn->ksnc_flip) {
+ __swab32s(&conn->ksnc_msg.ksm_type);
+ __swab32s(&conn->ksnc_msg.ksm_csum);
+ __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+ __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+ }
+
+ if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+ conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+ CERROR("%s: Unknown message type: %x\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_type);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return (-EPROTO);
+ }
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+ conn->ksnc_msg.ksm_csum != 0 && /* has checksum */
+ conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+ /* NOOP Checksum error */
+ CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return (-EIO);
+ }
+
+ if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+ __u64 cookie = 0;
+
+ LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+ cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+ rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+ conn->ksnc_msg.ksm_zc_cookies[1]);
+
+ if (rc != 0) {
+ CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings(conn, -EPROTO);
+ return (rc);
+ }
+ }
+
+ if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+ ksocknal_new_packet (conn, 0);
+ return 0; /* NOOP is done and just return */
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+ conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+ conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+ conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+ conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+ conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t);
+
+ conn->ksnc_rx_niov = 1;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_nkiov = 0;
+
+ goto again; /* read lnet header now */
+
+ case SOCKNAL_RX_LNET_HEADER:
+ /* unpack message header */
+ conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+ if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+ /* Userspace peer */
+ lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+ id = &conn->ksnc_peer->ksnp_id;
+
+ /* Substitute process ID assigned at connection time */
+ lhdr->src_pid = cpu_to_le32(id->pid);
+ lhdr->src_nid = cpu_to_le64(id->nid);
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+ ksocknal_conn_addref(conn); /* ++ref while parsing */
+
+ rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+ &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+ conn->ksnc_peer->ksnp_id.nid, conn, 0);
+ if (rc < 0) {
+ /* I just received garbage: give up on this conn */
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings (conn, rc);
+ ksocknal_conn_decref(conn);
+ return (-EPROTO);
+ }
+
+ /* I'm racing with ksocknal_recv() */
+ LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+ conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+ if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+ return 0;
+
+ /* ksocknal_recv() got called */
+ goto again;
+
+ case SOCKNAL_RX_LNET_PAYLOAD:
+ /* payload all received */
+ rc = 0;
+
+ if (conn->ksnc_rx_nob_left == 0 && /* not truncating */
+ conn->ksnc_msg.ksm_csum != 0 && /* has checksum */
+ conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+ CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id),
+ conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+ rc = -EIO;
+ }
+
+ if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+ LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+ lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+ id = &conn->ksnc_peer->ksnp_id;
+
+ rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+ conn->ksnc_msg.ksm_zc_cookies[0],
+ *ksocknal_tunables.ksnd_nonblk_zcack ||
+ le64_to_cpu(lhdr->src_nid) != id->nid);
+ }
+
+ lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+ if (rc != 0) {
+ ksocknal_new_packet(conn, 0);
+ ksocknal_close_conn_and_siblings (conn, rc);
+ return (-EPROTO);
+ }
+ /* Fall through */
+
+ case SOCKNAL_RX_SLOP:
+ /* starting new packet? */
+ if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+ return 0; /* come back later */
+ goto again; /* try to finish reading slop now */
+
+ default:
+ break;
+ }
+
+ /* Not Reached */
+ LBUG ();
+ return (-EINVAL); /* keep gcc happy */
+}
+
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ ksock_conn_t *conn = (ksock_conn_t *)private;
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (niov <= LNET_MAX_IOV);
+
+ conn->ksnc_cookie = msg;
+ conn->ksnc_rx_nob_wanted = mlen;
+ conn->ksnc_rx_nob_left = rlen;
+
+ if (mlen == 0 || iov != NULL) {
+ conn->ksnc_rx_nkiov = 0;
+ conn->ksnc_rx_kiov = NULL;
+ conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+ conn->ksnc_rx_niov =
+ lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+ niov, iov, offset, mlen);
+ } else {
+ conn->ksnc_rx_niov = 0;
+ conn->ksnc_rx_iov = NULL;
+ conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+ conn->ksnc_rx_nkiov =
+ lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+ niov, kiov, offset, mlen);
+ }
+
+ LASSERT (mlen ==
+ lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+ lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+ LASSERT (conn->ksnc_rx_scheduled);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ switch (conn->ksnc_rx_state) {
+ case SOCKNAL_RX_PARSE_WAIT:
+ list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+ wake_up (&sched->kss_waitq);
+ LASSERT (conn->ksnc_rx_ready);
+ break;
+
+ case SOCKNAL_RX_PARSE:
+ /* scheduler hasn't noticed I'm parsing yet */
+ break;
+ }
+
+ conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+ spin_unlock_bh(&sched->kss_lock);
+ ksocknal_conn_decref(conn);
+ return 0;
+}
+
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+ int rc;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ rc = (!ksocknal_data.ksnd_shuttingdown &&
+ list_empty(&sched->kss_rx_conns) &&
+ list_empty(&sched->kss_tx_conns));
+
+ spin_unlock_bh(&sched->kss_lock);
+ return rc;
+}
+
+int ksocknal_scheduler(void *arg)
+{
+ struct ksock_sched_info *info;
+ ksock_sched_t *sched;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+ int nloops = 0;
+ long id = (long)arg;
+
+ info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+ sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+ cfs_block_allsigs();
+
+ rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+ if (rc != 0) {
+ CERROR("Can't set CPT affinity to %d: %d\n",
+ info->ksi_cpt, rc);
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ int did_something = 0;
+
+ /* Ensure I progress everything semi-fairly */
+
+ if (!list_empty (&sched->kss_rx_conns)) {
+ conn = list_entry(sched->kss_rx_conns.next,
+ ksock_conn_t, ksnc_rx_list);
+ list_del(&conn->ksnc_rx_list);
+
+ LASSERT(conn->ksnc_rx_scheduled);
+ LASSERT(conn->ksnc_rx_ready);
+
+ /* clear rx_ready in case receive isn't complete.
+ * Do it BEFORE we call process_recv, since
+ * data_ready can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_rx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ rc = ksocknal_process_receive(conn);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ /* I'm the only one that can clear this flag */
+ LASSERT(conn->ksnc_rx_scheduled);
+
+ /* Did process_receive get everything it wanted? */
+ if (rc == 0)
+ conn->ksnc_rx_ready = 1;
+
+ if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+ /* Conn blocked waiting for ksocknal_recv()
+ * I change its state (under lock) to signal
+ * it can be rescheduled */
+ conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+ } else if (conn->ksnc_rx_ready) {
+ /* reschedule for rx */
+ list_add_tail (&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ } else {
+ conn->ksnc_rx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = 1;
+ }
+
+ if (!list_empty (&sched->kss_tx_conns)) {
+ LIST_HEAD (zlist);
+
+ if (!list_empty(&sched->kss_zombie_noop_txs)) {
+ list_add(&zlist,
+ &sched->kss_zombie_noop_txs);
+ list_del_init(&sched->kss_zombie_noop_txs);
+ }
+
+ conn = list_entry(sched->kss_tx_conns.next,
+ ksock_conn_t, ksnc_tx_list);
+ list_del (&conn->ksnc_tx_list);
+
+ LASSERT(conn->ksnc_tx_scheduled);
+ LASSERT(conn->ksnc_tx_ready);
+ LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+ tx = list_entry(conn->ksnc_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (conn->ksnc_tx_carrier == tx)
+ ksocknal_next_tx_carrier(conn);
+
+ /* dequeue now so empty list => more to send */
+ list_del(&tx->tx_list);
+
+ /* Clear tx_ready in case send isn't complete. Do
+ * it BEFORE we call process_transmit, since
+ * write_space can set it any time after we release
+ * kss_lock. */
+ conn->ksnc_tx_ready = 0;
+ spin_unlock_bh(&sched->kss_lock);
+
+ if (!list_empty(&zlist)) {
+ /* free zombie noop txs, it's fast because
+ * noop txs are just put in freelist */
+ ksocknal_txlist_done(NULL, &zlist, 0);
+ }
+
+ rc = ksocknal_process_transmit(conn, tx);
+
+ if (rc == -ENOMEM || rc == -EAGAIN) {
+ /* Incomplete send: replace tx on HEAD of tx_queue */
+ spin_lock_bh(&sched->kss_lock);
+ list_add(&tx->tx_list,
+ &conn->ksnc_tx_queue);
+ } else {
+ /* Complete send; tx -ref */
+ ksocknal_tx_decref(tx);
+
+ spin_lock_bh(&sched->kss_lock);
+ /* assume space for more */
+ conn->ksnc_tx_ready = 1;
+ }
+
+ if (rc == -ENOMEM) {
+ /* Do nothing; after a short timeout, this
+ * conn will be reposted on kss_tx_conns. */
+ } else if (conn->ksnc_tx_ready &&
+ !list_empty (&conn->ksnc_tx_queue)) {
+ /* reschedule for tx */
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ } else {
+ conn->ksnc_tx_scheduled = 0;
+ /* drop my ref */
+ ksocknal_conn_decref(conn);
+ }
+
+ did_something = 1;
+ }
+ if (!did_something || /* nothing to do */
+ ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+ spin_unlock_bh(&sched->kss_lock);
+
+ nloops = 0;
+
+ if (!did_something) { /* wait for something to do */
+ cfs_wait_event_interruptible_exclusive(
+ sched->kss_waitq,
+ !ksocknal_sched_cansleep(sched), rc);
+ LASSERT (rc == 0);
+ } else {
+ cond_resched();
+ }
+
+ spin_lock_bh(&sched->kss_lock);
+ }
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+ ksocknal_thread_fini();
+ return 0;
+}
+
+/*
+ * Add connection to kss_rx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+ ksock_sched_t *sched;
+ ENTRY;
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ conn->ksnc_rx_ready = 1;
+
+ if (!conn->ksnc_rx_scheduled) { /* not being progressed */
+ list_add_tail(&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ conn->ksnc_rx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up (&sched->kss_waitq);
+ }
+ spin_unlock_bh(&sched->kss_lock);
+
+ EXIT;
+}
+
+/*
+ * Add connection to kss_tx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+ ksock_sched_t *sched;
+ ENTRY;
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ conn->ksnc_tx_ready = 1;
+
+ if (!conn->ksnc_tx_scheduled && // not being progressed
+ !list_empty(&conn->ksnc_tx_queue)){//packets to send
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ /* extra ref for scheduler */
+ ksocknal_conn_addref(conn);
+
+ wake_up (&sched->kss_waitq);
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ EXIT;
+}
+
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+ __u32 version = 0;
+
+ if (hello->kshm_magic == LNET_PROTO_MAGIC)
+ version = hello->kshm_version;
+ else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+ version = __swab32(hello->kshm_version);
+
+ if (version != 0) {
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 1)
+ return NULL;
+
+ if (*ksocknal_tunables.ksnd_protocol == 2 &&
+ version == KSOCK_PROTO_V3)
+ return NULL;
+#endif
+ if (version == KSOCK_PROTO_V2)
+ return &ksocknal_protocol_v2x;
+
+ if (version == KSOCK_PROTO_V3)
+ return &ksocknal_protocol_v3x;
+
+ return NULL;
+ }
+
+ if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+ lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
+
+ CLASSERT (sizeof (lnet_magicversion_t) ==
+ offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+ if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+ hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+ return &ksocknal_protocol_v1x;
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+ /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+ ksock_net_t *net = (ksock_net_t *)ni->ni_data;
+
+ LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+ /* rely on caller to hold a ref on socket so it wouldn't disappear */
+ LASSERT (conn->ksnc_proto != NULL);
+
+ hello->kshm_src_nid = ni->ni_nid;
+ hello->kshm_dst_nid = peer_nid;
+ hello->kshm_src_pid = the_lnet.ln_pid;
+
+ hello->kshm_src_incarnation = net->ksnn_incarnation;
+ hello->kshm_ctype = conn->ksnc_type;
+
+ return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+int
+ksocknal_invert_type(int type)
+{
+ switch (type)
+ {
+ case SOCKLND_CONN_ANY:
+ case SOCKLND_CONN_CONTROL:
+ return (type);
+ case SOCKLND_CONN_BULK_IN:
+ return SOCKLND_CONN_BULK_OUT;
+ case SOCKLND_CONN_BULK_OUT:
+ return SOCKLND_CONN_BULK_IN;
+ default:
+ return (SOCKLND_CONN_NONE);
+ }
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+ ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+ __u64 *incarnation)
+{
+ /* Return < 0 fatal error
+ * 0 success
+ * EALREADY lost connection race
+ * EPROTO protocol version mismatch
+ */
+ socket_t *sock = conn->ksnc_sock;
+ int active = (conn->ksnc_proto != NULL);
+ int timeout;
+ int proto_match;
+ int rc;
+ ksock_proto_t *proto;
+ lnet_process_id_t recv_id;
+
+ /* socket type set on active connections - not set on passive */
+ LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+ timeout = active ? *ksocknal_tunables.ksnd_timeout :
+ lnet_acceptor_timeout();
+
+ rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+ hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+ hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+ /* Unexpected magic! */
+ CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+ "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+ LNET_PROTO_TCP_MAGIC,
+ HIPQUAD(conn->ksnc_ipaddr));
+ return -EPROTO;
+ }
+
+ rc = libcfs_sock_read(sock, &hello->kshm_version,
+ sizeof(hello->kshm_version), timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ proto = ksocknal_parse_proto_version(hello);
+ if (proto == NULL) {
+ if (!active) {
+ /* unknown protocol from peer, tell peer my protocol */
+ conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol == 2)
+ conn->ksnc_proto = &ksocknal_protocol_v2x;
+ else if (*ksocknal_tunables.ksnd_protocol == 1)
+ conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+ hello->kshm_nips = 0;
+ ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+ }
+
+ CERROR ("Unknown protocol version (%d.x expected)"
+ " from %u.%u.%u.%u\n",
+ conn->ksnc_proto->pro_version,
+ HIPQUAD(conn->ksnc_ipaddr));
+
+ return -EPROTO;
+ }
+
+ proto_match = (conn->ksnc_proto == proto);
+ conn->ksnc_proto = proto;
+
+ /* receive the rest of hello message anyway */
+ rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading or checking hello from from %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0);
+ return rc;
+ }
+
+ *incarnation = hello->kshm_src_incarnation;
+
+ if (hello->kshm_src_nid == LNET_NID_ANY) {
+ CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY"
+ "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+ return -EPROTO;
+ }
+
+ if (!active &&
+ conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ /* Userspace NAL assigns peer process ID from socket */
+ recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+ recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+ } else {
+ recv_id.nid = hello->kshm_src_nid;
+ recv_id.pid = hello->kshm_src_pid;
+ }
+
+ if (!active) {
+ *peerid = recv_id;
+
+ /* peer determines type */
+ conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+ if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+ CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+ hello->kshm_ctype, libcfs_id2str(*peerid),
+ HIPQUAD(conn->ksnc_ipaddr));
+ return -EPROTO;
+ }
+
+ return 0;
+ }
+
+ if (peerid->pid != recv_id.pid ||
+ peerid->nid != recv_id.nid) {
+ LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+ " %u.%u.%u.%u, but they claimed they were "
+ "%s; please check your Lustre "
+ "configuration.\n",
+ libcfs_id2str(*peerid),
+ HIPQUAD(conn->ksnc_ipaddr),
+ libcfs_id2str(recv_id));
+ return -EPROTO;
+ }
+
+ if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+ /* Possible protocol mismatch or I lost the connection race */
+ return proto_match ? EALREADY : EPROTO;
+ }
+
+ if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+ CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+ conn->ksnc_type, libcfs_id2str(*peerid),
+ HIPQUAD(conn->ksnc_ipaddr),
+ hello->kshm_ctype);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+int
+ksocknal_connect (ksock_route_t *route)
+{
+ LIST_HEAD (zombies);
+ ksock_peer_t *peer = route->ksnr_peer;
+ int type;
+ int wanted;
+ socket_t *sock;
+ cfs_time_t deadline;
+ int retry_later = 0;
+ int rc = 0;
+
+ deadline = cfs_time_add(cfs_time_current(),
+ cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ LASSERT (route->ksnr_scheduled);
+ LASSERT (!route->ksnr_connecting);
+
+ route->ksnr_connecting = 1;
+
+ for (;;) {
+ wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+ /* stop connecting if peer/route got closed under me, or
+ * route got connected while queued */
+ if (peer->ksnp_closing || route->ksnr_deleted ||
+ wanted == 0) {
+ retry_later = 0;
+ break;
+ }
+
+ /* reschedule if peer is connecting to me */
+ if (peer->ksnp_accepting > 0) {
+ CDEBUG(D_NET,
+ "peer %s(%d) already connecting to me, retry later.\n",
+ libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+ retry_later = 1;
+ }
+
+ if (retry_later) /* needs reschedule */
+ break;
+
+ if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+ type = SOCKLND_CONN_ANY;
+ } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+ type = SOCKLND_CONN_CONTROL;
+ } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+ type = SOCKLND_CONN_BULK_IN;
+ } else {
+ LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+ type = SOCKLND_CONN_BULK_OUT;
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+ rc = -ETIMEDOUT;
+ lnet_connect_console_error(rc, peer->ksnp_id.nid,
+ route->ksnr_ipaddr,
+ route->ksnr_port);
+ goto failed;
+ }
+
+ rc = lnet_connect(&sock, peer->ksnp_id.nid,
+ route->ksnr_myipaddr,
+ route->ksnr_ipaddr, route->ksnr_port);
+ if (rc != 0)
+ goto failed;
+
+ rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+ if (rc < 0) {
+ lnet_connect_console_error(rc, peer->ksnp_id.nid,
+ route->ksnr_ipaddr,
+ route->ksnr_port);
+ goto failed;
+ }
+
+ /* A +ve RC means I have to retry because I lost the connection
+ * race or I have to renegotiate protocol version */
+ retry_later = (rc != 0);
+ if (retry_later)
+ CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+ libcfs_nid2str(peer->ksnp_id.nid));
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+ }
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ if (retry_later) {
+ /* re-queue for attention; this frees me up to handle
+ * the peer's incoming connection request */
+
+ if (rc == EALREADY ||
+ (rc == 0 && peer->ksnp_accepting > 0)) {
+ /* We want to introduce a delay before next
+ * attempt to connect if we lost conn race,
+ * but the race is resolved quickly usually,
+ * so min_reconnectms should be good heuristic */
+ route->ksnr_retry_interval =
+ cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+ route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+ route->ksnr_retry_interval);
+ }
+
+ ksocknal_launch_connection_locked(route);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+ return retry_later;
+
+ failed:
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ route->ksnr_scheduled = 0;
+ route->ksnr_connecting = 0;
+
+ /* This is a retry rather than a new connection */
+ route->ksnr_retry_interval *= 2;
+ route->ksnr_retry_interval =
+ MAX(route->ksnr_retry_interval,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+ route->ksnr_retry_interval =
+ MIN(route->ksnr_retry_interval,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+ LASSERT (route->ksnr_retry_interval != 0);
+ route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+ route->ksnr_retry_interval);
+
+ if (!list_empty(&peer->ksnp_tx_queue) &&
+ peer->ksnp_accepting == 0 &&
+ ksocknal_find_connecting_route_locked(peer) == NULL) {
+ ksock_conn_t *conn;
+
+ /* ksnp_tx_queue is queued on a conn on successful
+ * connection for V1.x and V2.x */
+ if (!list_empty (&peer->ksnp_conns)) {
+ conn = list_entry(peer->ksnp_conns.next,
+ ksock_conn_t, ksnc_list);
+ LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+ }
+
+ /* take all the blocked packets while I've got the lock and
+ * complete below... */
+ list_splice_init(&peer->ksnp_tx_queue, &zombies);
+ }
+
+#if 0 /* irrelevent with only eager routes */
+ if (!route->ksnr_deleted) {
+ /* make this route least-favourite for re-selection */
+ list_del(&route->ksnr_list);
+ list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+ }
+#endif
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_peer_failed(peer);
+ ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+ return 0;
+}
+
+/*
+ * check whether we need to create more connds.
+ * It will try to create new thread if it's necessary, @timeout can
+ * be updated if failed to create, so caller wouldn't keep try while
+ * running out of resource.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+ char name[16];
+ int rc;
+ int total = ksocknal_data.ksnd_connd_starting +
+ ksocknal_data.ksnd_connd_running;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+ total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+ /* can't create more connd, or still have enough
+ * threads to handle more connecting */
+ return 0;
+ }
+
+ if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+ /* no pending connecting request */
+ return 0;
+ }
+
+ if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+ /* may run out of resource, retry later */
+ *timeout = cfs_time_seconds(1);
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* serialize starting to avoid flood */
+ return 0;
+ }
+
+ ksocknal_data.ksnd_connd_starting_stamp = sec;
+ ksocknal_data.ksnd_connd_starting++;
+ spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+ /* NB: total is the next id */
+ snprintf(name, sizeof(name), "socknal_cd%02d", total);
+ rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+ spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+ if (rc == 0)
+ return 1;
+
+ /* we tried ... */
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+
+ return 1;
+}
+
+/*
+ * check whether current thread can exit, it will return 1 if there are too
+ * many threads and no creating in past 120 seconds.
+ * Also, this function may update @timeout to make caller come back
+ * again to recheck these conditions.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+ int val;
+
+ if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+ /* still in initializing */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_starting > 0) {
+ /* in progress of starting new thread */
+ return 0;
+ }
+
+ if (ksocknal_data.ksnd_connd_running <=
+ *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+ return 0;
+ }
+
+ /* created thread in past 120 seconds? */
+ val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+ SOCKNAL_CONND_TIMEOUT - sec);
+
+ *timeout = (val > 0) ? cfs_time_seconds(val) :
+ cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+ if (val > 0)
+ return 0;
+
+ /* no creating in past 120 seconds */
+
+ return ksocknal_data.ksnd_connd_running >
+ ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through connd_routes queue looking for a route that we can process
+ * right now, @timeout_p can be updated if we need to come back later */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+ ksock_route_t *route;
+ cfs_time_t now;
+
+ now = cfs_time_current();
+
+ /* connd_routes can contain both pending and ordinary routes */
+ list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+ ksnr_connd_list) {
+
+ if (route->ksnr_retry_interval == 0 ||
+ cfs_time_aftereq(now, route->ksnr_timeout))
+ return route;
+
+ if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+ (int)*timeout_p > (int)(route->ksnr_timeout - now))
+ *timeout_p = (int)(route->ksnr_timeout - now);
+ }
+
+ return NULL;
+}
+
+int
+ksocknal_connd (void *arg)
+{
+ spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
+ ksock_connreq_t *cr;
+ wait_queue_t wait;
+ int nloops = 0;
+ int cons_retry = 0;
+
+ cfs_block_allsigs ();
+
+ init_waitqueue_entry_current (&wait);
+
+ spin_lock_bh(connd_lock);
+
+ LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+ ksocknal_data.ksnd_connd_starting--;
+ ksocknal_data.ksnd_connd_running++;
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+ ksock_route_t *route = NULL;
+ long sec = cfs_time_current_sec();
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ int dropped_lock = 0;
+
+ if (ksocknal_connd_check_stop(sec, &timeout)) {
+ /* wakeup another one to check stop */
+ wake_up(&ksocknal_data.ksnd_connd_waitq);
+ break;
+ }
+
+ if (ksocknal_connd_check_start(sec, &timeout)) {
+ /* created new thread */
+ dropped_lock = 1;
+ }
+
+ if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+ /* Connection accepted by the listener */
+ cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+ next, ksock_connreq_t, ksncr_list);
+
+ list_del(&cr->ksncr_list);
+ spin_unlock_bh(connd_lock);
+ dropped_lock = 1;
+
+ ksocknal_create_conn(cr->ksncr_ni, NULL,
+ cr->ksncr_sock, SOCKLND_CONN_NONE);
+ lnet_ni_decref(cr->ksncr_ni);
+ LIBCFS_FREE(cr, sizeof(*cr));
+
+ spin_lock_bh(connd_lock);
+ }
+
+ /* Only handle an outgoing connection request if there
+ * is a thread left to handle incoming connections and
+ * create new connd */
+ if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+ ksocknal_data.ksnd_connd_running) {
+ route = ksocknal_connd_get_route_locked(&timeout);
+ }
+ if (route != NULL) {
+ list_del (&route->ksnr_connd_list);
+ ksocknal_data.ksnd_connd_connecting++;
+ spin_unlock_bh(connd_lock);
+ dropped_lock = 1;
+
+ if (ksocknal_connect(route)) {
+ /* consecutive retry */
+ if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+ CWARN("massive consecutive "
+ "re-connecting to %u.%u.%u.%u\n",
+ HIPQUAD(route->ksnr_ipaddr));
+ cons_retry = 0;
+ }
+ } else {
+ cons_retry = 0;
+ }
+
+ ksocknal_route_decref(route);
+
+ spin_lock_bh(connd_lock);
+ ksocknal_data.ksnd_connd_connecting--;
+ }
+
+ if (dropped_lock) {
+ if (++nloops < SOCKNAL_RESCHED)
+ continue;
+ spin_unlock_bh(connd_lock);
+ nloops = 0;
+ cond_resched();
+ spin_lock_bh(connd_lock);
+ continue;
+ }
+
+ /* Nothing to do for 'timeout' */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_unlock_bh(connd_lock);
+
+ nloops = 0;
+ waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+ spin_lock_bh(connd_lock);
+ }
+ ksocknal_data.ksnd_connd_running--;
+ spin_unlock_bh(connd_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
+
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+ /* We're called with a shared lock on ksnd_global_lock */
+ ksock_conn_t *conn;
+ struct list_head *ctmp;
+
+ list_for_each (ctmp, &peer->ksnp_conns) {
+ int error;
+ conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock */
+ LASSERT (!conn->ksnc_closing);
+
+ /* SOCK_ERROR will reset error code of socket in
+ * some platform (like Darwin8.x) */
+ error = cfs_sock_error(conn->ksnc_sock);
+ if (error != 0) {
+ ksocknal_conn_addref(conn);
+
+ switch (error) {
+ case ECONNRESET:
+ CNETERR("A connection with %s "
+ "(%u.%u.%u.%u:%d) was reset; "
+ "it may have rebooted.\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ break;
+ case ETIMEDOUT:
+ CNETERR("A connection with %s "
+ "(%u.%u.%u.%u:%d) timed out; the "
+ "network or node may be down.\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ break;
+ default:
+ CNETERR("An unexpected network error %d "
+ "occurred with %s "
+ "(%u.%u.%u.%u:%d\n", error,
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ break;
+ }
+
+ return (conn);
+ }
+
+ if (conn->ksnc_rx_started &&
+ cfs_time_aftereq(cfs_time_current(),
+ conn->ksnc_rx_deadline)) {
+ /* Timed out incomplete incoming message */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+ "state %d wanted %d left %d\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port,
+ conn->ksnc_rx_state,
+ conn->ksnc_rx_nob_wanted,
+ conn->ksnc_rx_nob_left);
+ return (conn);
+ }
+
+ if ((!list_empty(&conn->ksnc_tx_queue) ||
+ cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+ cfs_time_aftereq(cfs_time_current(),
+ conn->ksnc_tx_deadline)) {
+ /* Timed out messages queued for sending or
+ * buffered in the socket's send buffer */
+ ksocknal_conn_addref(conn);
+ CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d) "
+ "the network or that node may be down.\n",
+ libcfs_id2str(peer->ksnp_id),
+ HIPQUAD(conn->ksnc_ipaddr),
+ conn->ksnc_port);
+ return (conn);
+ }
+ }
+
+ return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+ ksock_tx_t *tx;
+ LIST_HEAD (stale_txs);
+
+ write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+ while (!list_empty (&peer->ksnp_tx_queue)) {
+ tx = list_entry (peer->ksnp_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (!cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline))
+ break;
+
+ list_del (&tx->tx_list);
+ list_add_tail (&tx->tx_list, &stale_txs);
+ }
+
+ write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+ ksock_sched_t *sched;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+
+ if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+ return 0;
+
+ if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+ return 0;
+
+ if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+ cfs_time_before(cfs_time_current(),
+ cfs_time_add(peer->ksnp_last_alive,
+ cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+ return 0;
+
+ if (cfs_time_before(cfs_time_current(),
+ peer->ksnp_send_keepalive))
+ return 0;
+
+ /* retry 10 secs later, so we wouldn't put pressure
+ * on this peer if we failed to send keepalive this time */
+ peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+ conn = ksocknal_find_conn_locked(peer, NULL, 1);
+ if (conn != NULL) {
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+ if (!list_empty(&conn->ksnc_tx_queue)) {
+ spin_unlock_bh(&sched->kss_lock);
+ /* there is an queued ACK, don't need keepalive */
+ return 0;
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ /* cookie = 1 is reserved for keepalive PING */
+ tx = ksocknal_alloc_tx_noop(1, 1);
+ if (tx == NULL) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return -ENOMEM;
+ }
+
+ if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+ read_lock(&ksocknal_data.ksnd_global_lock);
+ return 1;
+ }
+
+ ksocknal_free_tx(tx);
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ return -EIO;
+}
+
+
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+ struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
+ ksock_peer_t *peer;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+
+ again:
+ /* NB. We expect to have a look at all the peers and not find any
+ * connections to time out, so we just use a shared lock while we
+ * take a look... */
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ list_for_each_entry(peer, peers, ksnp_list) {
+ cfs_time_t deadline = 0;
+ int resid = 0;
+ int n = 0;
+
+ if (ksocknal_send_keepalive_locked(peer) != 0) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ goto again;
+ }
+
+ conn = ksocknal_find_timed_out_conn (peer);
+
+ if (conn != NULL) {
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+ /* NB we won't find this one again, but we can't
+ * just proceed with the next peer, since we dropped
+ * ksnd_global_lock and it might be dead already! */
+ ksocknal_conn_decref(conn);
+ goto again;
+ }
+
+ /* we can't process stale txs right here because we're
+ * holding only shared lock */
+ if (!list_empty (&peer->ksnp_tx_queue)) {
+ ksock_tx_t *tx =
+ list_entry (peer->ksnp_tx_queue.next,
+ ksock_tx_t, tx_list);
+
+ if (cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline)) {
+
+ ksocknal_peer_addref(peer);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ ksocknal_flush_stale_txs(peer);
+
+ ksocknal_peer_decref(peer);
+ goto again;
+ }
+ }
+
+ if (list_empty(&peer->ksnp_zc_req_list))
+ continue;
+
+ spin_lock(&peer->ksnp_lock);
+ list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+ if (!cfs_time_aftereq(cfs_time_current(),
+ tx->tx_deadline))
+ break;
+ /* ignore the TX if connection is being closed */
+ if (tx->tx_conn->ksnc_closing)
+ continue;
+ n++;
+ }
+
+ if (n == 0) {
+ spin_unlock(&peer->ksnp_lock);
+ continue;
+ }
+
+ tx = list_entry(peer->ksnp_zc_req_list.next,
+ ksock_tx_t, tx_zc_list);
+ deadline = tx->tx_deadline;
+ resid = tx->tx_resid;
+ conn = tx->tx_conn;
+ ksocknal_conn_addref(conn);
+
+ spin_unlock(&peer->ksnp_lock);
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+ "oldest(%p) timed out %ld secs ago, "
+ "resid: %d, wmem: %d\n",
+ n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+ cfs_duration_sec(cfs_time_current() - deadline),
+ resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+ ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+ ksocknal_conn_decref(conn);
+ goto again;
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_reaper (void *arg)
+{
+ wait_queue_t wait;
+ ksock_conn_t *conn;
+ ksock_sched_t *sched;
+ struct list_head enomem_conns;
+ int nenomem_conns;
+ cfs_duration_t timeout;
+ int i;
+ int peer_index = 0;
+ cfs_time_t deadline = cfs_time_current();
+
+ cfs_block_allsigs ();
+
+ INIT_LIST_HEAD(&enomem_conns);
+ init_waitqueue_entry_current (&wait);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ while (!ksocknal_data.ksnd_shuttingdown) {
+
+ if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+ conn = list_entry (ksocknal_data. \
+ ksnd_deathrow_conns.next,
+ ksock_conn_t, ksnc_list);
+ list_del (&conn->ksnc_list);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_terminate_conn(conn);
+ ksocknal_conn_decref(conn);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ continue;
+ }
+
+ if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+ conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+ next, ksock_conn_t, ksnc_list);
+ list_del (&conn->ksnc_list);
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_destroy_conn(conn);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ continue;
+ }
+
+ if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+ list_add(&enomem_conns,
+ &ksocknal_data.ksnd_enomem_conns);
+ list_del_init(&ksocknal_data.ksnd_enomem_conns);
+ }
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ /* reschedule all the connections that stalled with ENOMEM... */
+ nenomem_conns = 0;
+ while (!list_empty (&enomem_conns)) {
+ conn = list_entry (enomem_conns.next,
+ ksock_conn_t, ksnc_tx_list);
+ list_del (&conn->ksnc_tx_list);
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_bh(&sched->kss_lock);
+
+ LASSERT(conn->ksnc_tx_scheduled);
+ conn->ksnc_tx_ready = 1;
+ list_add_tail(&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ wake_up(&sched->kss_waitq);
+
+ spin_unlock_bh(&sched->kss_lock);
+ nenomem_conns++;
+ }
+
+ /* careful with the jiffy wrap... */
+ while ((timeout = cfs_time_sub(deadline,
+ cfs_time_current())) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = ksocknal_data.ksnd_peer_hash_size;
+
+ /* Time to check for timeouts on a few more peers: I do
+ * checks every 'p' seconds on a proportion of the peer
+ * table and I need to check every connection 'n' times
+ * within a timeout interval, to ensure I detect a
+ * timeout on any connection within (n+1)/n times the
+ * timeout interval. */
+
+ if (*ksocknal_tunables.ksnd_timeout > n * p)
+ chunk = (chunk * n * p) /
+ *ksocknal_tunables.ksnd_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ ksocknal_check_peer_timeouts (peer_index);
+ peer_index = (peer_index + 1) %
+ ksocknal_data.ksnd_peer_hash_size;
+ }
+
+ deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+ }
+
+ if (nenomem_conns != 0) {
+ /* Reduce my timeout if I rescheduled ENOMEM conns.
+ * This also prevents me getting woken immediately
+ * if any go back on my enomem list. */
+ timeout = SOCKNAL_ENOMEM_RETRY;
+ }
+ ksocknal_data.ksnd_reaper_waketime =
+ cfs_time_add(cfs_time_current(), timeout);
+
+ set_current_state (TASK_INTERRUPTIBLE);
+ add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+ if (!ksocknal_data.ksnd_shuttingdown &&
+ list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+ list_empty (&ksocknal_data.ksnd_zombie_conns))
+ waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+ timeout);
+
+ set_current_state (TASK_RUNNING);
+ remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+ spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+ }
+
+ spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+ ksocknal_thread_fini();
+ return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644
index 000000000000..3e08fe2d1489
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+
+enum {
+ SOCKLND_TIMEOUT = 1,
+ SOCKLND_CREDITS,
+ SOCKLND_PEER_TXCREDITS,
+ SOCKLND_PEER_RTRCREDITS,
+ SOCKLND_PEER_TIMEOUT,
+ SOCKLND_NCONNDS,
+ SOCKLND_RECONNECTS_MIN,
+ SOCKLND_RECONNECTS_MAX,
+ SOCKLND_EAGER_ACK,
+ SOCKLND_ZERO_COPY,
+ SOCKLND_TYPED,
+ SOCKLND_BULK_MIN,
+ SOCKLND_RX_BUFFER_SIZE,
+ SOCKLND_TX_BUFFER_SIZE,
+ SOCKLND_NAGLE,
+ SOCKLND_IRQ_AFFINITY,
+ SOCKLND_ROUND_ROBIN,
+ SOCKLND_KEEPALIVE,
+ SOCKLND_KEEPALIVE_IDLE,
+ SOCKLND_KEEPALIVE_COUNT,
+ SOCKLND_KEEPALIVE_INTVL,
+ SOCKLND_BACKOFF_INIT,
+ SOCKLND_BACKOFF_MAX,
+ SOCKLND_PROTOCOL,
+ SOCKLND_ZERO_COPY_RECV,
+ SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+
+static ctl_table_t ksocknal_ctl_table[] = {
+ {
+ .ctl_name = SOCKLND_TIMEOUT,
+ .procname = "timeout",
+ .data = &ksocknal_tunables.ksnd_timeout,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_CREDITS,
+ .procname = "credits",
+ .data = &ksocknal_tunables.ksnd_credits,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_PEER_TXCREDITS,
+ .procname = "peer_credits",
+ .data = &ksocknal_tunables.ksnd_peertxcredits,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_PEER_RTRCREDITS,
+ .procname = "peer_buffer_credits",
+ .data = &ksocknal_tunables.ksnd_peerrtrcredits,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_PEER_TIMEOUT,
+ .procname = "peer_timeout",
+ .data = &ksocknal_tunables.ksnd_peertimeout,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_NCONNDS,
+ .procname = "nconnds",
+ .data = &ksocknal_tunables.ksnd_nconnds,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_RECONNECTS_MIN,
+ .procname = "min_reconnectms",
+ .data = &ksocknal_tunables.ksnd_min_reconnectms,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_RECONNECTS_MAX,
+ .procname = "max_reconnectms",
+ .data = &ksocknal_tunables.ksnd_max_reconnectms,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_EAGER_ACK,
+ .procname = "eager_ack",
+ .data = &ksocknal_tunables.ksnd_eager_ack,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_ZERO_COPY,
+ .procname = "zero_copy",
+ .data = &ksocknal_tunables.ksnd_zc_min_payload,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_ZERO_COPY_RECV,
+ .procname = "zero_copy_recv",
+ .data = &ksocknal_tunables.ksnd_zc_recv,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+
+ {
+ .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+ .procname = "zero_copy_recv",
+ .data = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_TYPED,
+ .procname = "typed",
+ .data = &ksocknal_tunables.ksnd_typed_conns,
+ .maxlen = sizeof (int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_BULK_MIN,
+ .procname = "min_bulk",
+ .data = &ksocknal_tunables.ksnd_min_bulk,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_RX_BUFFER_SIZE,
+ .procname = "rx_buffer_size",
+ .data = &ksocknal_tunables.ksnd_rx_buffer_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_TX_BUFFER_SIZE,
+ .procname = "tx_buffer_size",
+ .data = &ksocknal_tunables.ksnd_tx_buffer_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_NAGLE,
+ .procname = "nagle",
+ .data = &ksocknal_tunables.ksnd_nagle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_ROUND_ROBIN,
+ .procname = "round_robin",
+ .data = &ksocknal_tunables.ksnd_round_robin,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_KEEPALIVE,
+ .procname = "keepalive",
+ .data = &ksocknal_tunables.ksnd_keepalive,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_KEEPALIVE_IDLE,
+ .procname = "keepalive_idle",
+ .data = &ksocknal_tunables.ksnd_keepalive_idle,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_KEEPALIVE_COUNT,
+ .procname = "keepalive_count",
+ .data = &ksocknal_tunables.ksnd_keepalive_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = SOCKLND_KEEPALIVE_INTVL,
+ .procname = "keepalive_intvl",
+ .data = &ksocknal_tunables.ksnd_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+#if SOCKNAL_VERSION_DEBUG
+ {
+ .ctl_name = SOCKLND_PROTOCOL,
+ .procname = "protocol",
+ .data = &ksocknal_tunables.ksnd_protocol,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
+#endif
+ {0}
+};
+
+
+ctl_table_t ksocknal_top_ctl_table[] = {
+ {
+ .ctl_name = CTL_SOCKLND,
+ .procname = "socknal",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ksocknal_ctl_table
+ },
+ { 0 }
+};
+
+int
+ksocknal_lib_tunables_init ()
+{
+ if (!*ksocknal_tunables.ksnd_typed_conns) {
+ int rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+ if (*ksocknal_tunables.ksnd_protocol < 3)
+ rc = 0;
+#endif
+ if (rc != 0) {
+ CERROR("Protocol V3.x MUST have typed connections\n");
+ return rc;
+ }
+ }
+
+ if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+ *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+ if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+ *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
+ ksocknal_tunables.ksnd_sysctl =
+ cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+ if (ksocknal_tunables.ksnd_sysctl == NULL)
+ CWARN("Can't setup /proc tunables\n");
+
+ return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+ if (ksocknal_tunables.ksnd_sysctl != NULL)
+ unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+ return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+ int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+ &conn->ksnc_ipaddr,
+ &conn->ksnc_port);
+
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
+
+ if (rc != 0) {
+ CERROR ("Error %d getting sock peer IP\n", rc);
+ return rc;
+ }
+
+ rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+ &conn->ksnc_myipaddr, NULL);
+ if (rc != 0) {
+ CERROR ("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+ int caps = conn->ksnc_sock->sk->sk_route_caps;
+
+ if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+ return 0;
+
+ /* ZC if the socket supports scatter/gather and doesn't need software
+ * checksums */
+ return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct socket *sock = conn->ksnc_sock;
+ int nob;
+ int rc;
+
+ if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+ conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
+ tx->tx_nob == tx->tx_resid && /* frist sending */
+ tx->tx_msg.ksm_csum == 0) /* not checksummed */
+ ksocknal_lib_csum_tx(tx);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+
+ {
+#if SOCKNAL_SINGLE_FRAG_TX
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = tx->tx_niov;
+#endif
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = scratchiov,
+ .msg_iovlen = niov,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = MSG_DONTWAIT
+ };
+ mm_segment_t oldmm = get_fs();
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = tx->tx_iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ set_fs (KERNEL_DS);
+ rc = sock_sendmsg(sock, &msg, nob);
+ set_fs (oldmm);
+ }
+ return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ struct socket *sock = conn->ksnc_sock;
+ lnet_kiov_t *kiov = tx->tx_kiov;
+ int rc;
+ int nob;
+
+ /* Not NOOP message */
+ LASSERT (tx->tx_lnetmsg != NULL);
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+ /* Zero copy is enabled */
+ struct sock *sk = sock->sk;
+ struct page *page = kiov->kiov_page;
+ int offset = kiov->kiov_offset;
+ int fragsize = kiov->kiov_len;
+ int msgflg = MSG_DONTWAIT;
+
+ CDEBUG(D_NET, "page %p + offset %x for %d\n",
+ page, offset, kiov->kiov_len);
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ fragsize < tx->tx_resid)
+ msgflg |= MSG_MORE;
+
+ if (sk->sk_prot->sendpage != NULL) {
+ rc = sk->sk_prot->sendpage(sk, page,
+ offset, fragsize, msgflg);
+ } else {
+ rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+ msgflg);
+ }
+ } else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+ struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = tx->tx_nkiov;
+#endif
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = scratchiov,
+ .msg_iovlen = niov,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = MSG_DONTWAIT
+ };
+ mm_segment_t oldmm = get_fs();
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+ kiov[i].kiov_offset;
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ }
+
+ if (!list_empty(&conn->ksnc_tx_queue) ||
+ nob < tx->tx_resid)
+ msg.msg_flags |= MSG_MORE;
+
+ set_fs (KERNEL_DS);
+ rc = sock_sendmsg(sock, &msg, nob);
+ set_fs (oldmm);
+
+ for (i = 0; i < niov; i++)
+ kunmap(kiov[i].kiov_page);
+ }
+ return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+ int opt = 1;
+ mm_segment_t oldmm = get_fs();
+ struct socket *sock = conn->ksnc_sock;
+
+ /* Remind the socket to ACK eagerly. If I don't, the socket might
+ * think I'm about to send something it could piggy-back the ACK
+ * on, introducing delay in completing zero-copy sends in my
+ * peer. */
+
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+ (char *)&opt, sizeof (opt));
+ set_fs(oldmm);
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ unsigned int niov = conn->ksnc_rx_niov;
+#endif
+ struct iovec *iov = conn->ksnc_rx_iov;
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = scratchiov,
+ .msg_iovlen = niov,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = 0
+ };
+ mm_segment_t oldmm = get_fs();
+ int nob;
+ int i;
+ int rc;
+ int fragnob;
+ int sum;
+ __u32 saved_csum;
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ LASSERT (niov > 0);
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+ LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+ set_fs (KERNEL_DS);
+ rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+ /* NB this is just a boolean..........................^ */
+ set_fs (oldmm);
+
+ saved_csum = 0;
+ if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+ saved_csum = conn->ksnc_msg.ksm_csum;
+ conn->ksnc_msg.ksm_csum = 0;
+ }
+
+ if (saved_csum != 0) {
+ /* accumulate checksum */
+ for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+ LASSERT (i < niov);
+
+ fragnob = iov[i].iov_len;
+ if (fragnob > sum)
+ fragnob = sum;
+
+ conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+ iov[i].iov_base, fragnob);
+ }
+ conn->ksnc_msg.ksm_csum = saved_csum;
+ }
+
+ return rc;
+}
+
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+ if (addr == NULL)
+ return;
+
+ vunmap(addr);
+}
+
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+ struct iovec *iov, struct page **pages)
+{
+ void *addr;
+ int nob;
+ int i;
+
+ if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+ return NULL;
+
+ LASSERT (niov <= LNET_MAX_IOV);
+
+ if (niov < 2 ||
+ niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+ return NULL;
+
+ for (nob = i = 0; i < niov; i++) {
+ if ((kiov[i].kiov_offset != 0 && i > 0) ||
+ (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+ return NULL;
+
+ pages[i] = kiov[i].kiov_page;
+ nob += kiov[i].kiov_len;
+ }
+
+ addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+ if (addr == NULL)
+ return NULL;
+
+ iov->iov_base = addr + kiov[0].kiov_offset;
+ iov->iov_len = nob;
+
+ return addr;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ struct page **pages = NULL;
+ unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+ struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+ struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+ unsigned int niov = conn->ksnc_rx_nkiov;
+#endif
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = scratchiov,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = 0
+ };
+ mm_segment_t oldmm = get_fs();
+ int nob;
+ int i;
+ int rc;
+ void *base;
+ void *addr;
+ int sum;
+ int fragnob;
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone. */
+ if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+ nob = scratchiov[0].iov_len;
+ msg.msg_iovlen = 1;
+
+ } else {
+ for (nob = i = 0; i < niov; i++) {
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+ kiov[i].kiov_offset;
+ }
+ msg.msg_iovlen = niov;
+ }
+
+ LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+ set_fs (KERNEL_DS);
+ rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+ /* NB this is just a boolean.......................^ */
+ set_fs (oldmm);
+
+ if (conn->ksnc_msg.ksm_csum != 0) {
+ for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+ LASSERT (i < niov);
+
+ /* Dang! have to kmap again because I have nowhere to stash the
+ * mapped address. But by doing it while the page is still
+ * mapped, the kernel just bumps the map count and returns me
+ * the address it stashed. */
+ base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+ fragnob = kiov[i].kiov_len;
+ if (fragnob > sum)
+ fragnob = sum;
+
+ conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+ base, fragnob);
+
+ kunmap(kiov[i].kiov_page);
+ }
+ }
+
+ if (addr != NULL) {
+ ksocknal_lib_kiov_vunmap(addr);
+ } else {
+ for (i = 0; i < niov; i++)
+ kunmap(kiov[i].kiov_page);
+ }
+
+ return (rc);
+}
+
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+ int i;
+ __u32 csum;
+ void *base;
+
+ LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+ LASSERT(tx->tx_conn != NULL);
+ LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+ tx->tx_msg.ksm_csum = 0;
+
+ csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+ tx->tx_iov[0].iov_len);
+
+ if (tx->tx_kiov != NULL) {
+ for (i = 0; i < tx->tx_nkiov; i++) {
+ base = kmap(tx->tx_kiov[i].kiov_page) +
+ tx->tx_kiov[i].kiov_offset;
+
+ csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+
+ kunmap(tx->tx_kiov[i].kiov_page);
+ }
+ } else {
+ for (i = 1; i < tx->tx_niov; i++)
+ csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+ tx->tx_iov[i].iov_len);
+ }
+
+ if (*ksocknal_tunables.ksnd_inject_csum_error) {
+ csum++;
+ *ksocknal_tunables.ksnd_inject_csum_error = 0;
+ }
+
+ tx->tx_msg.ksm_csum = csum;
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+ mm_segment_t oldmm = get_fs ();
+ struct socket *sock = conn->ksnc_sock;
+ int len;
+ int rc;
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ *txmem = *rxmem = *nagle = 0;
+ return (-ESHUTDOWN);
+ }
+
+ rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+ if (rc == 0) {
+ len = sizeof(*nagle);
+ set_fs(KERNEL_DS);
+ rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char *)nagle, &len);
+ set_fs(oldmm);
+ }
+
+ ksocknal_connsock_decref(conn);
+
+ if (rc == 0)
+ *nagle = !*nagle;
+ else
+ *txmem = *rxmem = *nagle = 0;
+
+ return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+ mm_segment_t oldmm = get_fs ();
+ int rc;
+ int option;
+ int keep_idle;
+ int keep_intvl;
+ int keep_count;
+ int do_keepalive;
+ struct linger linger;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+
+ /* Ensure this socket aborts active sends immediately when we close
+ * it. */
+
+ linger.l_onoff = 0;
+ linger.l_linger = 0;
+
+ set_fs (KERNEL_DS);
+ rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+ (char *)&linger, sizeof (linger));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set SO_LINGER: %d\n", rc);
+ return (rc);
+ }
+
+ option = -1;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set SO_LINGER2: %d\n", rc);
+ return (rc);
+ }
+
+ if (!*ksocknal_tunables.ksnd_nagle) {
+ option = 1;
+
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't disable nagle: %d\n", rc);
+ return (rc);
+ }
+ }
+
+ rc = libcfs_sock_setbuf(sock,
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size);
+ if (rc != 0) {
+ CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+ return (rc);
+ }
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+ /* snapshot tunables */
+ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+ do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+ option = (do_keepalive ? 1 : 0);
+ set_fs (KERNEL_DS);
+ rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+ return (rc);
+ }
+
+ if (!do_keepalive)
+ return (0);
+
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keep_idle, sizeof (keep_idle));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+ return (rc);
+ }
+
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keep_intvl, sizeof (keep_intvl));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+ return (rc);
+ }
+
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keep_count, sizeof (keep_count));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+ return (rc);
+ }
+
+ return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+ struct sock *sk;
+ struct tcp_sock *tp;
+ int nonagle;
+ int val = 1;
+ int rc;
+ mm_segment_t oldmm;
+
+ rc = ksocknal_connsock_addref(conn);
+ if (rc != 0) /* being shut down */
+ return;
+
+ sk = conn->ksnc_sock->sk;
+ tp = tcp_sk(sk);
+
+ lock_sock (sk);
+ nonagle = tp->nonagle;
+ tp->nonagle = 1;
+ release_sock (sk);
+
+ oldmm = get_fs ();
+ set_fs (KERNEL_DS);
+
+ rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+ (char *)&val, sizeof (val));
+ LASSERT (rc == 0);
+
+ set_fs (oldmm);
+
+ lock_sock (sk);
+ tp->nonagle = nonagle;
+ release_sock (sk);
+
+ ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/*
+ * socket call back in Linux
+ */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+ ksock_conn_t *conn;
+ ENTRY;
+
+ /* interleave correctly with closing sockets... */
+ LASSERT(!in_irq());
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = sk->sk_user_data;
+ if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+ LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
+ sk->sk_data_ready (sk, n);
+ } else
+ ksocknal_read_callback(conn);
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ EXIT;
+}
+
+static void
+ksocknal_write_space (struct sock *sk)
+{
+ ksock_conn_t *conn;
+ int wspace;
+ int min_wpace;
+
+ /* interleave correctly with closing sockets... */
+ LASSERT(!in_irq());
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = sk->sk_user_data;
+ wspace = SOCKNAL_WSPACE(sk);
+ min_wpace = SOCKNAL_MIN_WSPACE(sk);
+
+ CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+ sk, wspace, min_wpace, conn,
+ (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+ " ready" : " blocked"),
+ (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+ " scheduled" : " idle"),
+ (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+ " empty" : " queued"));
+
+ if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+ LASSERT (sk->sk_write_space != &ksocknal_write_space);
+ sk->sk_write_space (sk);
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return;
+ }
+
+ if (wspace >= min_wpace) { /* got enough space */
+ ksocknal_write_callback(conn);
+
+ /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+ * ENOMEM check in ksocknal_transmit is race-free (think about
+ * it). */
+
+ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+ conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ sock->sk->sk_user_data = conn;
+ sock->sk->sk_data_ready = ksocknal_data_ready;
+ sock->sk->sk_write_space = ksocknal_write_space;
+ return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ /* Remove conn's network callbacks.
+ * NB I _have_ to restore the callback, rather than storing a noop,
+ * since the socket could survive past this module being unloaded!! */
+ sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+ sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+ /* A callback could be in progress already; they hold a read lock
+ * on ksnd_global_lock (to serialise with me) and NOOP if
+ * sk_user_data is NULL. */
+ sock->sk->sk_user_data = NULL;
+
+ return ;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+ int rc = 0;
+ ksock_sched_t *sched;
+
+ sched = conn->ksnc_scheduler;
+ spin_lock_bh(&sched->kss_lock);
+
+ if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+ !conn->ksnc_tx_ready) {
+ /* SOCK_NOSPACE is set when the socket fills
+ * and cleared in the write_space callback
+ * (which also sets ksnc_tx_ready). If
+ * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+ * zero, I didn't fill the socket and
+ * write_space won't reschedule me, so I
+ * return -ENOMEM to get my caller to retry
+ * after a timeout */
+ rc = -ENOMEM;
+ }
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ return rc;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 000000000000..3c135786dc11
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+ return crc32_le(crc, p, len);
+#else
+ while (len-- > 0)
+ crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+ return crc;
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS 3
+#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644
index 000000000000..8a474f64abbe
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+ "dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+ "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+ "# concurrent sends to 1 peer");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+ "# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+ "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+ "# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+ "# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+ "max # connection daemons");
+
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+ "min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+ "max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+ "send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+ "use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+ "smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+ "socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+ "socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+ "enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+ "Round robin for multiple interfaces");
+
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+ "# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+ "# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT 5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+ "# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+ "seconds between probes");
+
+static int enable_csum = 0;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+ "enable check sum");
+
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+ "set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+ "always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
+ "minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+CFS_MODULE_PARM(zc_recv, "i", int, 0644,
+ "enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
+ "minimum # of fragments to enable ZC recv");
+
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+ "protocol version");
+#endif
+
+ksock_tunables_t ksocknal_tunables;
+
+int ksocknal_tunables_init(void)
+{
+
+ /* initialize ksocknal_tunables structure */
+ ksocknal_tunables.ksnd_timeout = &sock_timeout;
+ ksocknal_tunables.ksnd_nscheds = &nscheds;
+ ksocknal_tunables.ksnd_nconnds = &nconnds;
+ ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
+ ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
+ ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
+ ksocknal_tunables.ksnd_eager_ack = &eager_ack;
+ ksocknal_tunables.ksnd_typed_conns = &typed_conns;
+ ksocknal_tunables.ksnd_min_bulk = &min_bulk;
+ ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
+ ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
+ ksocknal_tunables.ksnd_nagle = &nagle;
+ ksocknal_tunables.ksnd_round_robin = &round_robin;
+ ksocknal_tunables.ksnd_keepalive = &keepalive;
+ ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
+ ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
+ ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
+ ksocknal_tunables.ksnd_credits = &credits;
+ ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
+ ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
+ ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
+ ksocknal_tunables.ksnd_enable_csum = &enable_csum;
+ ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
+ ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
+ ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
+ ksocknal_tunables.ksnd_zc_recv = &zc_recv;
+ ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+
+
+#if SOCKNAL_VERSION_DEBUG
+ ksocknal_tunables.ksnd_protocol = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+ ksocknal_tunables.ksnd_sysctl = NULL;
+#endif
+
+ if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+ *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+ /* initialize platform-sepcific tunables */
+ return ksocknal_lib_tunables_init();
+};
+
+void ksocknal_tunables_fini(void)
+{
+ ksocknal_lib_tunables_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644
index 000000000000..ec57179f8d2b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ * pro_send_hello : send hello message
+ * pro_recv_hello : receive hello message
+ * pro_pack : pack message header
+ * pro_unpack : unpack message header
+ * pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ * return 1 if ACK is piggybacked, otherwise return 0
+ * pro_queue_tx_msg() : Called holding BH lock: kss_lock
+ * return the ACK that piggybacked by my message, or NULL
+ * pro_handle_zcreq() : handler of incoming ZC-REQ
+ * pro_handle_zcack() : handler of incoming ZC-ACK
+ * pro_match_tx() : Called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+ /* V1.x, just enqueue it */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+ LASSERT (!list_empty(&conn->ksnc_tx_queue));
+ LASSERT (tx != NULL);
+
+ /* Next TX that can carry ZC-ACK or LNet message */
+ if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+ /* no more packets queued */
+ conn->ksnc_tx_carrier = NULL;
+ } else {
+ conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+ ksock_tx_t, tx_list);
+ LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+ }
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+ ksock_tx_t *tx_ack, __u64 cookie)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ LASSERT (tx_ack == NULL ||
+ tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ /*
+ * Enqueue or piggyback tx_ack / cookie
+ * . no tx can piggyback cookie of tx_ack (or cookie), just
+ * enqueue the tx_ack (if tx_ack != NUL) and return NULL.
+ * . There is tx can piggyback cookie of tx_ack (or cookie),
+ * piggyback the cookie and return the tx.
+ */
+ if (tx == NULL) {
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_ack;
+ }
+ return 0;
+ }
+
+ if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+ /* tx is noop zc-ack, can't piggyback zc-ack cookie */
+ if (tx_ack != NULL)
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ return 0;
+ }
+
+ LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+ LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+ if (tx_ack != NULL)
+ cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+ /* piggyback the zc-ack cookie */
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ /* move on to the next TX which can carry cookie */
+ ksocknal_next_tx_carrier(conn);
+
+ return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+ ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+ /*
+ * Enqueue tx_msg:
+ * . If there is no NOOP on the connection, just enqueue
+ * tx_msg and return NULL
+ * . If there is NOOP on the connection, piggyback the cookie
+ * and replace the NOOP tx, and return the NOOP tx.
+ */
+ if (tx == NULL) { /* nothing on queue */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_msg;
+ return NULL;
+ }
+
+ if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+ list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+ return NULL;
+ }
+
+ LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ /* There is a noop zc-ack can be piggybacked */
+ tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+ ksocknal_next_tx_carrier(conn);
+
+ /* use new_tx to replace the noop zc-ack packet */
+ list_add(&tx_msg->tx_list, &tx->tx_list);
+ list_del(&tx->tx_list);
+
+ return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+ ksock_tx_t *tx_ack, __u64 cookie)
+{
+ ksock_tx_t *tx;
+
+ if (conn->ksnc_type != SOCKLND_CONN_ACK)
+ return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+ /* non-blocking ZC-ACK (to router) */
+ LASSERT (tx_ack == NULL ||
+ tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ if ((tx = conn->ksnc_tx_carrier) == NULL) {
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list,
+ &conn->ksnc_tx_queue);
+ conn->ksnc_tx_carrier = tx_ack;
+ }
+ return 0;
+ }
+
+ /* conn->ksnc_tx_carrier != NULL */
+
+ if (tx_ack != NULL)
+ cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+ if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+ return 1;
+
+ if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+ /* replace the keepalive PING with a real ACK */
+ LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ return 1;
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+ cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+ CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+ return 1; /* XXX return error in the future */
+ }
+
+ if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+ /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+ if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+ tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ } else {
+ tx->tx_msg.ksm_zc_cookies[0] = cookie;
+ }
+
+ if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+ /* not likely to carry more ACKs, skip it to simplify logic */
+ ksocknal_next_tx_carrier(conn);
+ }
+
+ return 1;
+ }
+
+ /* takes two or more cookies already */
+
+ if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+ __u64 tmp = 0;
+
+ /* two seperated cookies: (a+2, a) or (a+1, a) */
+ LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+ tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+ if (tx->tx_msg.ksm_zc_cookies[0] -
+ tx->tx_msg.ksm_zc_cookies[1] == 2) {
+ if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+ tmp = cookie;
+ } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+ tmp = tx->tx_msg.ksm_zc_cookies[1];
+ } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+ tmp = tx->tx_msg.ksm_zc_cookies[0];
+ }
+
+ if (tmp != 0) {
+ /* range of cookies */
+ tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+ tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+ return 1;
+ }
+
+ } else {
+ /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+ if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+ cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+ CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+ libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+ return 1; /* XXX: return error in the future */
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+ tx->tx_msg.ksm_zc_cookies[1] = cookie;
+ return 1;
+ }
+
+ if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+ tx->tx_msg.ksm_zc_cookies[0] = cookie;
+ return 1;
+ }
+ }
+
+ /* failed to piggyback ZC-ACK */
+ if (tx_ack != NULL) {
+ list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+ /* the next tx can piggyback at least 1 ACK */
+ ksocknal_next_tx_carrier(conn);
+ }
+
+ return 0;
+}
+
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+ int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+ if (!*ksocknal_tunables.ksnd_typed_conns)
+ return SOCKNAL_MATCH_YES;
+#endif
+
+ if (tx == NULL || tx->tx_lnetmsg == NULL) {
+ /* noop packet */
+ nob = offsetof(ksock_msg_t, ksm_u);
+ } else {
+ nob = tx->tx_lnetmsg->msg_len +
+ ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+ sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+ }
+
+ /* default checking for typed connection */
+ switch (conn->ksnc_type) {
+ default:
+ CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+ LBUG();
+ case SOCKLND_CONN_ANY:
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_BULK_IN:
+ return SOCKNAL_MATCH_MAY;
+
+ case SOCKLND_CONN_BULK_OUT:
+ if (nob < *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_CONTROL:
+ if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+ }
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+ int nob;
+
+ if (tx == NULL || tx->tx_lnetmsg == NULL)
+ nob = offsetof(ksock_msg_t, ksm_u);
+ else
+ nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+ switch (conn->ksnc_type) {
+ default:
+ CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+ LBUG();
+ case SOCKLND_CONN_ANY:
+ return SOCKNAL_MATCH_NO;
+
+ case SOCKLND_CONN_ACK:
+ if (nonblk)
+ return SOCKNAL_MATCH_YES;
+ else if (tx == NULL || tx->tx_lnetmsg == NULL)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_NO;
+
+ case SOCKLND_CONN_BULK_OUT:
+ if (nonblk)
+ return SOCKNAL_MATCH_NO;
+ else if (nob < *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+
+ case SOCKLND_CONN_CONTROL:
+ if (nonblk)
+ return SOCKNAL_MATCH_NO;
+ else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+ return SOCKNAL_MATCH_MAY;
+ else
+ return SOCKNAL_MATCH_YES;
+ }
+}
+
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+ ksock_peer_t *peer = c->ksnc_peer;
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+
+ read_lock(&ksocknal_data.ksnd_global_lock);
+
+ conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+ if (conn != NULL) {
+ ksock_sched_t *sched = conn->ksnc_scheduler;
+
+ LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+ spin_lock_bh(&sched->kss_lock);
+
+ rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
+
+ spin_unlock_bh(&sched->kss_lock);
+
+ if (rc) { /* piggybacked */
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+ return 0;
+ }
+ }
+
+ read_unlock(&ksocknal_data.ksnd_global_lock);
+
+ /* ACK connection is not ready, or can't piggyback the ACK */
+ tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+ if (tx == NULL)
+ return -ENOMEM;
+
+ if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0)
+ return 0;
+
+ ksocknal_free_tx(tx);
+ return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+ ksock_peer_t *peer = conn->ksnc_peer;
+ ksock_tx_t *tx;
+ ksock_tx_t *tmp;
+ LIST_HEAD (zlist);
+ int count;
+
+ if (cookie1 == 0)
+ cookie1 = cookie2;
+
+ count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+ if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+ conn->ksnc_proto == &ksocknal_protocol_v3x) {
+ /* keepalive PING for V3.x, just ignore it */
+ return count == 1 ? 0 : -EPROTO;
+ }
+
+ spin_lock(&peer->ksnp_lock);
+
+ list_for_each_entry_safe(tx, tmp,
+ &peer->ksnp_zc_req_list, tx_zc_list) {
+ __u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+ if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+ tx->tx_msg.ksm_zc_cookies[0] = 0;
+ list_del(&tx->tx_zc_list);
+ list_add(&tx->tx_zc_list, &zlist);
+
+ if (--count == 0)
+ break;
+ }
+ }
+
+ spin_unlock(&peer->ksnp_lock);
+
+ while (!list_empty(&zlist)) {
+ tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+ list_del(&tx->tx_zc_list);
+ ksocknal_tx_decref(tx);
+ }
+
+ return count == 0 ? 0 : -EPROTO;
+}
+
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+ socket_t *sock = conn->ksnc_sock;
+ lnet_hdr_t *hdr;
+ lnet_magicversion_t *hmv;
+ int rc;
+ int i;
+
+ CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+ LIBCFS_ALLOC(hdr, sizeof(*hdr));
+ if (hdr == NULL) {
+ CERROR("Can't allocate lnet_hdr_t\n");
+ return -ENOMEM;
+ }
+
+ hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+ /* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+ * header and send out */
+ hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+ hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+ hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ LNET_LOCK();
+ if ((the_lnet.ln_testprotocompat & 1) != 0) {
+ hmv->version_major++; /* just different! */
+ the_lnet.ln_testprotocompat &= ~1;
+ }
+ if ((the_lnet.ln_testprotocompat & 2) != 0) {
+ hmv->magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~2;
+ }
+ LNET_UNLOCK();
+ }
+
+ hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid);
+ hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid);
+ hdr->type = cpu_to_le32 (LNET_MSG_HELLO);
+ hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+ hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+ hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+ rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ goto out;
+ }
+
+ if (hello->kshm_nips == 0)
+ goto out;
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+ }
+
+ rc = libcfs_sock_write(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32),
+ lnet_acceptor_timeout());
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO payload (%d)"
+ " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ }
+out:
+ LIBCFS_FREE(hdr, sizeof(*hdr));
+
+ return rc;
+}
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+ socket_t *sock = conn->ksnc_sock;
+ int rc;
+
+ hello->kshm_magic = LNET_PROTO_MAGIC;
+ hello->kshm_version = conn->ksnc_proto->pro_version;
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ LNET_LOCK();
+ if ((the_lnet.ln_testprotocompat & 1) != 0) {
+ hello->kshm_version++; /* just different! */
+ the_lnet.ln_testprotocompat &= ~1;
+ }
+ LNET_UNLOCK();
+ }
+
+ rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+ lnet_acceptor_timeout());
+
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ return rc;
+ }
+
+ if (hello->kshm_nips == 0)
+ return 0;
+
+ rc = libcfs_sock_write(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32),
+ lnet_acceptor_timeout());
+ if (rc != 0) {
+ CNETERR("Error %d sending HELLO payload (%d)"
+ " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+ HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+ }
+
+ return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+ socket_t *sock = conn->ksnc_sock;
+ lnet_hdr_t *hdr;
+ int rc;
+ int i;
+
+ LIBCFS_ALLOC(hdr, sizeof(*hdr));
+ if (hdr == NULL) {
+ CERROR("Can't allocate lnet_hdr_t\n");
+ return -ENOMEM;
+ }
+
+ rc = libcfs_sock_read(sock, &hdr->src_nid,
+ sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+ timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0 && rc != -EALREADY);
+ goto out;
+ }
+
+ /* ...and check we got what we expected */
+ if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+ CERROR ("Expecting a HELLO hdr,"
+ " but got type %d from %u.%u.%u.%u\n",
+ le32_to_cpu (hdr->type),
+ HIPQUAD(conn->ksnc_ipaddr));
+ rc = -EPROTO;
+ goto out;
+ }
+
+ hello->kshm_src_nid = le64_to_cpu (hdr->src_nid);
+ hello->kshm_src_pid = le32_to_cpu (hdr->src_pid);
+ hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+ hello->kshm_ctype = le32_to_cpu (hdr->msg.hello.type);
+ hello->kshm_nips = le32_to_cpu (hdr->payload_length) /
+ sizeof (__u32);
+
+ if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+ CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+ hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+ rc = -EPROTO;
+ goto out;
+ }
+
+ if (hello->kshm_nips == 0)
+ goto out;
+
+ rc = libcfs_sock_read(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32), timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0 && rc != -EALREADY);
+ goto out;
+ }
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+ if (hello->kshm_ips[i] == 0) {
+ CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+ i, HIPQUAD(conn->ksnc_ipaddr));
+ rc = -EPROTO;
+ break;
+ }
+ }
+out:
+ LIBCFS_FREE(hdr, sizeof(*hdr));
+
+ return rc;
+}
+
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+ socket_t *sock = conn->ksnc_sock;
+ int rc;
+ int i;
+
+ if (hello->kshm_magic == LNET_PROTO_MAGIC)
+ conn->ksnc_flip = 0;
+ else
+ conn->ksnc_flip = 1;
+
+ rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+ offsetof(ksock_hello_msg_t, kshm_ips) -
+ offsetof(ksock_hello_msg_t, kshm_src_nid),
+ timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0 && rc != -EALREADY);
+ return rc;
+ }
+
+ if (conn->ksnc_flip) {
+ __swab32s(&hello->kshm_src_pid);
+ __swab64s(&hello->kshm_src_nid);
+ __swab32s(&hello->kshm_dst_pid);
+ __swab64s(&hello->kshm_dst_nid);
+ __swab64s(&hello->kshm_src_incarnation);
+ __swab64s(&hello->kshm_dst_incarnation);
+ __swab32s(&hello->kshm_ctype);
+ __swab32s(&hello->kshm_nips);
+ }
+
+ if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+ CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+ hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+ return -EPROTO;
+ }
+
+ if (hello->kshm_nips == 0)
+ return 0;
+
+ rc = libcfs_sock_read(sock, hello->kshm_ips,
+ hello->kshm_nips * sizeof(__u32), timeout);
+ if (rc != 0) {
+ CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+ rc, HIPQUAD(conn->ksnc_ipaddr));
+ LASSERT (rc < 0 && rc != -EALREADY);
+ return rc;
+ }
+
+ for (i = 0; i < (int) hello->kshm_nips; i++) {
+ if (conn->ksnc_flip)
+ __swab32s(&hello->kshm_ips[i]);
+
+ if (hello->kshm_ips[i] == 0) {
+ CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+ i, HIPQUAD(conn->ksnc_ipaddr));
+ return -EPROTO;
+ }
+ }
+
+ return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+ /* V1.x has no KSOCK_MSG_NOOP */
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+ LASSERT(tx->tx_lnetmsg != NULL);
+
+ tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+ tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t);
+
+ tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+ tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+ if (tx->tx_lnetmsg != NULL) {
+ LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+ tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+ tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+ tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+ } else {
+ LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+ tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+ tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+ }
+ /* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+ msg->ksm_csum = 0;
+ msg->ksm_type = KSOCK_MSG_LNET;
+ msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+ return; /* Do nothing */
+}
+
+ksock_proto_t ksocknal_protocol_v1x =
+{
+ .pro_version = KSOCK_PROTO_V1,
+ .pro_send_hello = ksocknal_send_hello_v1,
+ .pro_recv_hello = ksocknal_recv_hello_v1,
+ .pro_pack = ksocknal_pack_msg_v1,
+ .pro_unpack = ksocknal_unpack_msg_v1,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1,
+ .pro_handle_zcreq = NULL,
+ .pro_handle_zcack = NULL,
+ .pro_queue_tx_zcack = NULL,
+ .pro_match_tx = ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v2x =
+{
+ .pro_version = KSOCK_PROTO_V2,
+ .pro_send_hello = ksocknal_send_hello_v2,
+ .pro_recv_hello = ksocknal_recv_hello_v2,
+ .pro_pack = ksocknal_pack_msg_v2,
+ .pro_unpack = ksocknal_unpack_msg_v2,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
+ .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2,
+ .pro_handle_zcreq = ksocknal_handle_zcreq,
+ .pro_handle_zcack = ksocknal_handle_zcack,
+ .pro_match_tx = ksocknal_match_tx
+};
+
+ksock_proto_t ksocknal_protocol_v3x =
+{
+ .pro_version = KSOCK_PROTO_V3,
+ .pro_send_hello = ksocknal_send_hello_v2,
+ .pro_recv_hello = ksocknal_recv_hello_v2,
+ .pro_pack = ksocknal_pack_msg_v2,
+ .pro_unpack = ksocknal_unpack_msg_v2,
+ .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2,
+ .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3,
+ .pro_handle_zcreq = ksocknal_handle_zcreq,
+ .pro_handle_zcack = ksocknal_handle_zcack,
+ .pro_match_tx = ksocknal_match_tx_v3
+};
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 000000000000..1bd9ef774208
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \
+ lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o \
+ router_proc.o acceptor.o peer.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 000000000000..81ef28bbcba0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/acceptor.c
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+static int accept_port = 988;
+static int accept_backlog = 127;
+static int accept_timeout = 5;
+
+struct {
+ int pta_shutdown;
+ socket_t *pta_sock;
+ struct completion pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+ return accept_port;
+}
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+ return (magic == constant ||
+ magic == __swab32(constant));
+}
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static char *accept = "secure";
+
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+ "Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+ "Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+ "Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+ "Acceptor's timeout (seconds)");
+
+static char *accept_type = NULL;
+
+int
+lnet_acceptor_get_tunables(void)
+{
+ /* Userland acceptor uses 'accept_type' instead of 'accept', due to
+ * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+ * for compatibility. Hence the trick. */
+ accept_type = accept;
+ return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+ return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+ __u32 peer_ip, int peer_port)
+{
+ switch (rc) {
+ /* "normal" errors */
+ case -ECONNREFUSED:
+ CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+ "refused: check that Lustre is running on that node.\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ CNETERR("Connection to %s at host %u.%u.%u.%u "
+ "was unreachable: the network or that node may "
+ "be down, or Lustre may be misconfigured.\n",
+ libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+ break;
+ case -ETIMEDOUT:
+ CNETERR("Connection to %s at host %u.%u.%u.%u on "
+ "port %d took too long: that node may be hung "
+ "or experiencing high load.\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -ECONNRESET:
+ LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+ " on port %d was reset: "
+ "is it running a compatible version of "
+ "Lustre and is %s one of its NIDs?\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port,
+ libcfs_nid2str(peer_nid));
+ break;
+ case -EPROTO:
+ LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+ "host %u.%u.%u.%u on port %d: is it running "
+ "a compatible version of Lustre?\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -EADDRINUSE:
+ LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+ "connect to %s at host %u.%u.%u.%u on port "
+ "%d\n", libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ default:
+ LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+ " at host %u.%u.%u.%u on port %d\n", rc,
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+ __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+ lnet_acceptor_connreq_t cr;
+ socket_t *sock;
+ int rc;
+ int port;
+ int fatal;
+
+ CLASSERT (sizeof(cr) <= 16); /* not too big to be on the stack */
+
+ for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+ port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+ --port) {
+ /* Iterate through reserved ports. */
+
+ rc = libcfs_sock_connect(&sock, &fatal,
+ local_ip, port,
+ peer_ip, peer_port);
+ if (rc != 0) {
+ if (fatal)
+ goto failed;
+ continue;
+ }
+
+ CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ cr.acr_nid = peer_nid;
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ lnet_net_lock(LNET_LOCK_EX);
+ if ((the_lnet.ln_testprotocompat & 4) != 0) {
+ cr.acr_version++;
+ the_lnet.ln_testprotocompat &= ~4;
+ }
+ if ((the_lnet.ln_testprotocompat & 8) != 0) {
+ cr.acr_magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~8;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+ if (rc != 0)
+ goto failed_sock;
+
+ *sockp = sock;
+ return 0;
+ }
+
+ rc = -EADDRINUSE;
+ goto failed;
+
+ failed_sock:
+ libcfs_sock_release(sock);
+ failed:
+ lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+ return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+ lnet_acceptor_connreq_t cr;
+ __u32 peer_ip;
+ int peer_port;
+ int rc;
+ int flip;
+ lnet_ni_t *ni;
+ char *str;
+
+ LASSERT (sizeof(cr) <= 16); /* not too big for the stack */
+
+ rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+ LASSERT (rc == 0); /* we succeeded before */
+
+ if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+ if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+ /* future version compatibility!
+ * When LNET unifies protocols over all LNDs, the first
+ * thing sent will be a version query. I send back
+ * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+ memset (&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response"
+ "to LNET magic from %u.%u.%u.%u: %d\n",
+ HIPQUAD(peer_ip), rc);
+ return -EPROTO;
+ }
+
+ if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+ str = "'old' socknal/tcpnal";
+ else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+ str = "'old' ranal";
+ else
+ str = "unrecognised";
+
+ LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+ " magic %08x: %s acceptor protocol\n",
+ HIPQUAD(peer_ip), magic, str);
+ return -EPROTO;
+ }
+
+ flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+ rc = libcfs_sock_read(sock, &cr.acr_version,
+ sizeof(cr.acr_version),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request version from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ return -EIO;
+ }
+
+ if (flip)
+ __swab32s(&cr.acr_version);
+
+ if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+ /* future version compatibility!
+ * An acceptor-specific protocol rev will first send a version
+ * query. I send back my current version to tell her I'm
+ * "old". */
+ int peer_version = cr.acr_version;
+
+ memset (&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response"
+ "to version %d from %u.%u.%u.%u: %d\n",
+ peer_version, HIPQUAD(peer_ip), rc);
+ return -EPROTO;
+ }
+
+ rc = libcfs_sock_read(sock, &cr.acr_nid,
+ sizeof(cr) -
+ offsetof(lnet_acceptor_connreq_t, acr_nid),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ return -EIO;
+ }
+
+ if (flip)
+ __swab64s(&cr.acr_nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+ if (ni == NULL || /* no matching net */
+ ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+ if (ni != NULL)
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+ " for %s: No matching NI\n",
+ HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ if (ni->ni_lnd->lnd_accept == NULL) {
+ /* This catches a request for the loopback LND */
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+ " for %s: NI doesn not accept IP connections\n",
+ HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+ libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+ rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+ lnet_ni_decref(ni);
+ return rc;
+}
+
+int
+lnet_acceptor(void *arg)
+{
+ socket_t *newsock;
+ int rc;
+ __u32 magic;
+ __u32 peer_ip;
+ int peer_port;
+ int secure = (int)((long_ptr_t)arg);
+
+ LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+ cfs_block_allsigs();
+
+ rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+ 0, accept_port, accept_backlog);
+ if (rc != 0) {
+ if (rc == -EADDRINUSE)
+ LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+ " %d: port already in use\n",
+ accept_port);
+ else
+ LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+ "%d: unexpected error %d\n",
+ accept_port, rc);
+
+ lnet_acceptor_state.pta_sock = NULL;
+ } else {
+ LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+ }
+
+ /* set init status and unblock parent */
+ lnet_acceptor_state.pta_shutdown = rc;
+ complete(&lnet_acceptor_state.pta_signal);
+
+ if (rc != 0)
+ return rc;
+
+ while (!lnet_acceptor_state.pta_shutdown) {
+
+ rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+ if (rc != 0) {
+ if (rc != -EAGAIN) {
+ CWARN("Accept error %d: pausing...\n", rc);
+ cfs_pause(cfs_time_seconds(1));
+ }
+ continue;
+ }
+
+ /* maybe we're waken up with libcfs_sock_abort_accept() */
+ if (lnet_acceptor_state.pta_shutdown) {
+ libcfs_sock_release(newsock);
+ break;
+ }
+
+ rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+ if (rc != 0) {
+ CERROR("Can't determine new connection's address\n");
+ goto failed;
+ }
+
+ if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ CERROR("Refusing connection from %u.%u.%u.%u: "
+ "insecure port %d\n",
+ HIPQUAD(peer_ip), peer_port);
+ goto failed;
+ }
+
+ rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ goto failed;
+ }
+
+ rc = lnet_accept(newsock, magic);
+ if (rc != 0)
+ goto failed;
+
+ continue;
+
+ failed:
+ libcfs_sock_release(newsock);
+ }
+
+ libcfs_sock_release(lnet_acceptor_state.pta_sock);
+ lnet_acceptor_state.pta_sock = NULL;
+
+ CDEBUG(D_NET, "Acceptor stopping\n");
+
+ /* unblock lnet_acceptor_stop() */
+ complete(&lnet_acceptor_state.pta_signal);
+ return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+ if (!strcmp(acc, "secure")) {
+ *sec = 1;
+ return 1;
+ } else if (!strcmp(acc, "all")) {
+ *sec = 0;
+ return 1;
+ } else if (!strcmp(acc, "none")) {
+ return 0;
+ } else {
+ LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+ acc);
+ return -EINVAL;
+ }
+}
+
+int
+lnet_acceptor_start(void)
+{
+ int rc;
+ long rc2;
+ long secure;
+
+ LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+ rc = lnet_acceptor_get_tunables();
+ if (rc != 0)
+ return rc;
+
+
+ init_completion(&lnet_acceptor_state.pta_signal);
+ rc = accept2secure(accept_type, &secure);
+ if (rc <= 0) {
+ fini_completion(&lnet_acceptor_state.pta_signal);
+ return rc;
+ }
+
+ if (lnet_count_acceptor_nis() == 0) /* not required */
+ return 0;
+
+ rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+ (void *)(ulong_ptr_t)secure,
+ "acceptor_%03ld", secure));
+ if (IS_ERR_VALUE(rc2)) {
+ CERROR("Can't start acceptor thread: %ld\n", rc2);
+ fini_completion(&lnet_acceptor_state.pta_signal);
+
+ return -ESRCH;
+ }
+
+ /* wait for acceptor to startup */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+ if (!lnet_acceptor_state.pta_shutdown) {
+ /* started OK */
+ LASSERT(lnet_acceptor_state.pta_sock != NULL);
+ return 0;
+ }
+
+ LASSERT(lnet_acceptor_state.pta_sock == NULL);
+ fini_completion(&lnet_acceptor_state.pta_signal);
+
+ return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+ if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+ return;
+
+ lnet_acceptor_state.pta_shutdown = 1;
+ libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+ /* block until acceptor signals exit */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+ fini_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644
index 000000000000..695b27265e23
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-errno.c
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 000000000000..e88bee362497
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t the_lnet; /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+ "LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+ "local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+ "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+ "size of remote network hash table");
+
+char *
+lnet_get_routes(void)
+{
+ return routes;
+}
+
+char *
+lnet_get_networks(void)
+{
+ char *nets;
+ int rc;
+
+ if (*networks != 0 && *ip2nets != 0) {
+ LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+ "'ip2nets' but not both at once\n");
+ return NULL;
+ }
+
+ if (*ip2nets != 0) {
+ rc = lnet_parse_ip2nets(&nets, ip2nets);
+ return (rc == 0) ? nets : NULL;
+ }
+
+ if (*networks != 0)
+ return networks;
+
+ return "tcp";
+}
+
+void
+lnet_init_locks(void)
+{
+ spin_lock_init(&the_lnet.ln_eq_wait_lock);
+ init_waitqueue_head(&the_lnet.ln_eq_waitq);
+ mutex_init(&the_lnet.ln_lnd_mutex);
+ mutex_init(&the_lnet.ln_api_mutex);
+}
+
+void
+lnet_fini_locks(void)
+{
+}
+
+
+static int
+lnet_create_remote_nets_table(void)
+{
+ int i;
+ struct list_head *hash;
+
+ LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+ LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+ LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create remote nets hash table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&hash[i]);
+ the_lnet.ln_remote_nets_hash = hash;
+ return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+ int i;
+ struct list_head *hash;
+
+ if (the_lnet.ln_remote_nets_hash == NULL)
+ return;
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+ LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+ LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+ the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+ if (the_lnet.ln_res_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_res_lock);
+ the_lnet.ln_res_lock = NULL;
+ }
+
+ if (the_lnet.ln_net_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_net_lock);
+ the_lnet.ln_net_lock = NULL;
+ }
+
+ lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+ lnet_init_locks();
+
+ the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_res_lock == NULL)
+ goto failed;
+
+ the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_net_lock == NULL)
+ goto failed;
+
+ return 0;
+
+ failed:
+ lnet_destroy_locks();
+ return -ENOMEM;
+}
+
+void lnet_assert_wire_constants (void)
+{
+ /* Wire protocol assertions generated by 'wirecheck'
+ * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+ * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+ * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+ /* Constants... */
+ CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+ CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+ CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+ CLASSERT (LNET_MSG_ACK == 0);
+ CLASSERT (LNET_MSG_PUT == 1);
+ CLASSERT (LNET_MSG_GET == 2);
+ CLASSERT (LNET_MSG_REPLY == 3);
+ CLASSERT (LNET_MSG_HELLO == 4);
+
+ /* Checks for struct ptl_handle_wire_t */
+ CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+ CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+ CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+ CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+ CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+ /* Checks for struct lnet_magicversion_t */
+ CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+ /* Checks for struct lnet_hdr_t */
+ CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+ CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+ /* Ack */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+ /* Put */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+ /* Get */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+ /* Reply */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+ /* Hello */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+ lnd_t *lnd;
+ struct list_head *tmp;
+
+ /* holding lnd mutex */
+ list_for_each (tmp, &the_lnet.ln_lnds) {
+ lnd = list_entry(tmp, lnd_t, lnd_list);
+
+ if ((int)lnd->lnd_type == type)
+ return lnd;
+ }
+
+ return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+ LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+ list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+ lnd->lnd_refcount = 0;
+
+ CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd (lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+ LASSERT (lnd->lnd_refcount == 0);
+
+ list_del (&lnd->lnd_list);
+ CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+ lnet_counters_t *ctr;
+ int i;
+
+ memset(counters, 0, sizeof(*counters));
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+ counters->msgs_max += ctr->msgs_max;
+ counters->msgs_alloc += ctr->msgs_alloc;
+ counters->errors += ctr->errors;
+ counters->send_count += ctr->send_count;
+ counters->recv_count += ctr->recv_count;
+ counters->route_count += ctr->route_count;
+ counters->drop_length += ctr->drop_length;
+ counters->send_length += ctr->send_length;
+ counters->recv_length += ctr->recv_length;
+ counters->route_length += ctr->route_length;
+ counters->drop_length += ctr->drop_length;
+
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+ lnet_counters_t *counters;
+ int i;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+ memset(counters, 0, sizeof(lnet_counters_t));
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+ char *space;
+
+ LASSERT (n > 0);
+
+ size += offsetof (lnet_freeobj_t, fo_contents);
+
+ LIBCFS_ALLOC(space, n * size);
+ if (space == NULL)
+ return (-ENOMEM);
+
+ INIT_LIST_HEAD (&fl->fl_list);
+ fl->fl_objs = space;
+ fl->fl_nobjs = n;
+ fl->fl_objsize = size;
+
+ do
+ {
+ memset (space, 0, size);
+ list_add ((struct list_head *)space, &fl->fl_list);
+ space += size;
+ } while (--n != 0);
+
+ return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+ struct list_head *el;
+ int count;
+
+ if (fl->fl_nobjs == 0)
+ return;
+
+ count = 0;
+ for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+ count++;
+
+ LASSERT (count == fl->fl_nobjs);
+
+ LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+ memset (fl, 0, sizeof (*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+__u64
+lnet_create_interface_cookie (void)
+{
+ /* NB the interface cookie in wire handles guards against delayed
+ * replies and ACKs appearing valid after reboot. Initialisation time,
+ * even if it's only implemented to millisecond resolution is probably
+ * easily good enough. */
+ struct timeval tv;
+ __u64 cookie;
+ do_gettimeofday(&tv);
+ cookie = tv.tv_sec;
+ cookie *= 1000000;
+ cookie += tv.tv_usec;
+ return cookie;
+}
+
+static char *
+lnet_res_type2str(int type)
+{
+ switch (type) {
+ default:
+ LBUG();
+ case LNET_COOKIE_TYPE_MD:
+ return "MD";
+ case LNET_COOKIE_TYPE_ME:
+ return "ME";
+ case LNET_COOKIE_TYPE_EQ:
+ return "EQ";
+ }
+}
+
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+ int count = 0;
+
+ if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+ return;
+
+ while (!list_empty(&rec->rec_active)) {
+ struct list_head *e = rec->rec_active.next;
+
+ list_del_init(e);
+ if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+ lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+ } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+ lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+ } else { /* NB: Active MEs should be attached on portals */
+ LBUG();
+ }
+ count++;
+ }
+
+ if (count > 0) {
+ /* Found alive MD/ME/EQ, user really should unlink/free
+ * all of them before finalize LNet, but if someone didn't,
+ * we have to recycle garbage for him */
+ CERROR("%d active elements on exit of %s container\n",
+ count, lnet_res_type2str(rec->rec_type));
+ }
+
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&rec->rec_freelist);
+#endif
+ if (rec->rec_lh_hash != NULL) {
+ LIBCFS_FREE(rec->rec_lh_hash,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ rec->rec_lh_hash = NULL;
+ }
+
+ rec->rec_type = 0; /* mark it as finalized */
+}
+
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+ int cpt, int type, int objnum, int objsz)
+{
+ int rc = 0;
+ int i;
+
+ LASSERT(rec->rec_type == 0);
+
+ rec->rec_type = type;
+ INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+ rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+ if (rc != 0)
+ goto out;
+#endif
+ rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+ /* Arbitrary choice of hash table size */
+ LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ if (rec->rec_lh_hash == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+ return 0;
+
+out:
+ CERROR("Failed to setup %s resource container\n",
+ lnet_res_type2str(type));
+ lnet_res_container_cleanup(rec);
+ return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+ struct lnet_res_container *rec;
+ int i;
+
+ cfs_percpt_for_each(rec, i, recs)
+ lnet_res_container_cleanup(rec);
+
+ cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+ struct lnet_res_container **recs;
+ struct lnet_res_container *rec;
+ int rc;
+ int i;
+
+ recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+ if (recs == NULL) {
+ CERROR("Failed to allocate %s resource containers\n",
+ lnet_res_type2str(type));
+ return NULL;
+ }
+
+ cfs_percpt_for_each(rec, i, recs) {
+ rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+ if (rc != 0) {
+ lnet_res_containers_destroy(recs);
+ return NULL;
+ }
+ }
+
+ return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ struct list_head *head;
+ lnet_libhandle_t *lh;
+ unsigned int hash;
+
+ if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+ return NULL;
+
+ hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+ head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+ list_for_each_entry(lh, head, lh_hash_chain) {
+ if (lh->lh_cookie == cookie)
+ return lh;
+ }
+
+ return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+ unsigned int hash;
+
+ lh->lh_cookie = rec->rec_lh_cookie;
+ rec->rec_lh_cookie += 1 << ibits;
+
+ hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+ list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
+
+
+int lnet_unprepare(void);
+
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+ /* Prepare to bring up the network */
+ struct lnet_res_container **recs;
+ int rc = 0;
+
+ LASSERT (the_lnet.ln_refcount == 0);
+
+ the_lnet.ln_routing = 0;
+
+ LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+ the_lnet.ln_pid = requested_pid;
+
+ INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+ INIT_LIST_HEAD(&the_lnet.ln_nis);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+ rc = lnet_create_remote_nets_table();
+ if (rc != 0)
+ goto failed;
+
+ the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+ the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(lnet_counters_t));
+ if (the_lnet.ln_counters == NULL) {
+ CERROR("Failed to allocate counters for LNet\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = lnet_peer_tables_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_msg_containers_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+ LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+ sizeof(lnet_eq_t));
+ if (rc != 0)
+ goto failed;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+ sizeof(lnet_me_t));
+ if (recs == NULL)
+ goto failed;
+
+ the_lnet.ln_me_containers = recs;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+ sizeof(lnet_libmd_t));
+ if (recs == NULL)
+ goto failed;
+
+ the_lnet.ln_md_containers = recs;
+
+ rc = lnet_portals_create();
+ if (rc != 0) {
+ CERROR("Failed to create portals for LNet: %d\n", rc);
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_unprepare();
+ return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+ /* NB no LNET_LOCK since this is the last reference. All LND instances
+ * have shut down already, so it is safe to unlink and free all
+ * descriptors, even those that appear committed to a network op (eg MD
+ * with non-zero pending count) */
+
+ lnet_fail_nid(LNET_NID_ANY, 0);
+
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_test_peers));
+ LASSERT(list_empty(&the_lnet.ln_nis));
+ LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_portals_destroy();
+
+ if (the_lnet.ln_md_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_md_containers);
+ the_lnet.ln_md_containers = NULL;
+ }
+
+ if (the_lnet.ln_me_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_me_containers);
+ the_lnet.ln_me_containers = NULL;
+ }
+
+ lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+ lnet_msg_containers_destroy();
+ lnet_peer_tables_destroy();
+ lnet_rtrpools_free();
+
+ if (the_lnet.ln_counters != NULL) {
+ cfs_percpt_free(the_lnet.ln_counters);
+ the_lnet.ln_counters = NULL;
+ }
+ lnet_destroy_remote_nets_table();
+
+ return 0;
+}
+
+lnet_ni_t *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+ lnet_ni_t *ni;
+
+ lnet_net_lock(0);
+ ni = lnet_net2ni_locked(net, 0);
+ lnet_net_unlock(0);
+
+ return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+ __u64 key = nid;
+ unsigned int val;
+
+ LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+ if (number == 1)
+ return 0;
+
+ val = cfs_hash_long(key, LNET_CPT_BITS);
+ /* NB: LNET_CP_NUMBER doesn't have to be PO2 */
+ if (val < number)
+ return val;
+
+ return (unsigned int)(key + val + (val >> 1)) % number;
+}
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+
+ /* must called with hold of lnet_net_lock */
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ /* take lnet_net_lock(any) would be OK */
+ if (!list_empty(&the_lnet.ln_nis_cpt)) {
+ list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+ if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+ continue;
+
+ LASSERT(ni->ni_cpts != NULL);
+ return ni->ni_cpts[lnet_nid_cpt_hash
+ (nid, ni->ni_ncpts)];
+ }
+ }
+
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+ int cpt;
+ int cpt2;
+
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ if (list_empty(&the_lnet.ln_nis_cpt))
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+ cpt = lnet_net_lock_current();
+ cpt2 = lnet_cpt_of_nid_locked(nid);
+ lnet_net_unlock(cpt);
+
+ return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+
+ ni = lnet_net2ni_locked(net, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+lnet_ni_t *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == nid) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ ni = lnet_nid2ni_locked(nid, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis (void)
+{
+ /* Return the # of NIs that need the acceptor. */
+ int count = 0;
+ struct list_head *tmp;
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_lnd->lnd_accept != NULL)
+ count++;
+ }
+
+ lnet_net_unlock(cpt);
+
+ return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+ int credits;
+
+ LASSERT(ni->ni_ncpts >= 1);
+
+ if (ni->ni_ncpts == 1)
+ return ni->ni_maxtxcredits;
+
+ credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+ credits = max(credits, 8 * ni->ni_peertxcredits);
+ credits = min(credits, ni->ni_maxtxcredits);
+
+ return credits;
+}
+
+void
+lnet_shutdown_lndnis (void)
+{
+ int i;
+ int islo;
+ lnet_ni_t *ni;
+
+ /* NB called holding the global mutex */
+
+ /* All quiet on the API front */
+ LASSERT(!the_lnet.ln_shutdown);
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_shutdown = 1; /* flag shutdown */
+
+ /* Unlink NIs from the global table */
+ while (!list_empty(&the_lnet.ln_nis)) {
+ ni = list_entry(the_lnet.ln_nis.next,
+ lnet_ni_t, ni_list);
+ /* move it to zombie list and nobody can find it anymore */
+ list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */
+
+ if (!list_empty(&ni->ni_cptlist)) {
+ list_del_init(&ni->ni_cptlist);
+ lnet_ni_decref_locked(ni, 0);
+ }
+ }
+
+ /* Drop the cached eqwait NI. */
+ if (the_lnet.ln_eq_waitni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+ the_lnet.ln_eq_waitni = NULL;
+ }
+
+ /* Drop the cached loopback NI. */
+ if (the_lnet.ln_loni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+ the_lnet.ln_loni = NULL;
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* Clear lazy portals and drop delayed messages which hold refs
+ * on their lnet_msg_t::msg_rxpeer */
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ LNetClearLazyPortal(i);
+
+ /* Clear the peer table and wait for all peers to go (they hold refs on
+ * their NIs) */
+ lnet_peer_tables_cleanup();
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* Now wait for the NI's I just nuked to show up on ln_zombie_nis
+ * and shut them down in guaranteed thread context */
+ i = 2;
+ while (!list_empty(&the_lnet.ln_nis_zombie)) {
+ int *ref;
+ int j;
+
+ ni = list_entry(the_lnet.ln_nis_zombie.next,
+ lnet_ni_t, ni_list);
+ list_del_init(&ni->ni_list);
+ cfs_percpt_for_each(ref, j, ni->ni_refs) {
+ if (*ref == 0)
+ continue;
+ /* still busy, add it back to zombie list */
+ list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ break;
+ }
+
+ while (!list_empty(&ni->ni_list)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+ ++i;
+ if ((i & (-i)) == i) {
+ CDEBUG(D_WARNING,
+ "Waiting for zombie LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+ }
+ cfs_pause(cfs_time_seconds(1));
+ lnet_net_lock(LNET_LOCK_EX);
+ continue;
+ }
+
+ ni->ni_lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ islo = ni->ni_lnd->lnd_type == LOLND;
+
+ LASSERT (!in_interrupt ());
+ (ni->ni_lnd->lnd_shutdown)(ni);
+
+ /* can't deref lnd anymore now; it might have unregistered
+ * itself... */
+
+ if (!islo)
+ CDEBUG(D_LNI, "Removed LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+
+ lnet_ni_free(ni);
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ the_lnet.ln_shutdown = 0;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_network_tokens != NULL) {
+ LIBCFS_FREE(the_lnet.ln_network_tokens,
+ the_lnet.ln_network_tokens_nob);
+ the_lnet.ln_network_tokens = NULL;
+ }
+}
+
+int
+lnet_startup_lndnis (void)
+{
+ lnd_t *lnd;
+ struct lnet_ni *ni;
+ struct lnet_tx_queue *tq;
+ struct list_head nilist;
+ int i;
+ int rc = 0;
+ int lnd_type;
+ int nicount = 0;
+ char *nets = lnet_get_networks();
+
+ INIT_LIST_HEAD(&nilist);
+
+ if (nets == NULL)
+ goto failed;
+
+ rc = lnet_parse_networks(&nilist, nets);
+ if (rc != 0)
+ goto failed;
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ LASSERT (libcfs_isknown_lnd(lnd_type));
+
+ if (lnd_type == CIBLND ||
+ lnd_type == OPENIBLND ||
+ lnd_type == IIBLND ||
+ lnd_type == VIBLND) {
+ CERROR("LND %s obsoleted\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+ lnd = lnet_find_lnd_by_type(lnd_type);
+
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ rc = request_module("%s",
+ libcfs_lnd2modname(lnd_type));
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ lnd = lnet_find_lnd_by_type(lnd_type);
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ CERROR("Can't load LND %s, module %s, rc=%d\n",
+ libcfs_lnd2str(lnd_type),
+ libcfs_lnd2modname(lnd_type), rc);
+ goto failed;
+ }
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount++;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ ni->ni_lnd = lnd;
+
+ rc = (lnd->lnd_startup)(ni);
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+ if (rc != 0) {
+ LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+ "\n",
+ rc, libcfs_lnd2str(lnd->lnd_type));
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+ goto failed;
+ }
+
+ LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+ list_del(&ni->ni_list);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* refcount for ln_nis */
+ lnet_ni_addref_locked(ni, 0);
+ list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+ if (ni->ni_cpts != NULL) {
+ list_add_tail(&ni->ni_cptlist,
+ &the_lnet.ln_nis_cpt);
+ lnet_ni_addref_locked(ni, 0);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (lnd->lnd_type == LOLND) {
+ lnet_ni_addref(ni);
+ LASSERT (the_lnet.ln_loni == NULL);
+ the_lnet.ln_loni = ni;
+ continue;
+ }
+
+ if (ni->ni_peertxcredits == 0 ||
+ ni->ni_maxtxcredits == 0) {
+ LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+ libcfs_lnd2str(lnd->lnd_type),
+ ni->ni_peertxcredits == 0 ?
+ "" : "per-peer ");
+ goto failed;
+ }
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ tq->tq_credits_min =
+ tq->tq_credits_max =
+ tq->tq_credits = lnet_ni_tq_credits(ni);
+ }
+
+ CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+ libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+ lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+ ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+ nicount++;
+ }
+
+ if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+ lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+ LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+ "\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_shutdown_lndnis();
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace program needs to call this function - it's automatically
+ * called in the kernel at module loading time. Caller has to call LNetFini()
+ * after a call to LNetInit(), if and only if the latter returned 0. It must
+ * be called exactly once.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int
+LNetInit(void)
+{
+ int rc;
+
+ lnet_assert_wire_constants();
+ LASSERT(!the_lnet.ln_init);
+
+ memset(&the_lnet, 0, sizeof(the_lnet));
+
+ /* refer to global cfs_cpt_table for now */
+ the_lnet.ln_cpt_table = cfs_cpt_table;
+ the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table);
+
+ LASSERT(the_lnet.ln_cpt_number > 0);
+ if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+ /* we are under risk of consuming all lh_cookie */
+ CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+ "please change setting of CPT-table and retry\n",
+ the_lnet.ln_cpt_number, LNET_CPT_MAX);
+ return -1;
+ }
+
+ while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+ the_lnet.ln_cpt_bits++;
+
+ rc = lnet_create_locks();
+ if (rc != 0) {
+ CERROR("Can't create LNet global locks: %d\n", rc);
+ return -1;
+ }
+
+ the_lnet.ln_refcount = 0;
+ the_lnet.ln_init = 1;
+ LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+ INIT_LIST_HEAD(&the_lnet.ln_lnds);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+ /* The hash table size is the number of bits it takes to express the set
+ * ln_num_routes, minus 1 (better to under estimate than over so we
+ * don't waste memory). */
+ if (rnet_htable_size <= 0)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+ else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+ the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+ order_base_2(rnet_htable_size) - 1);
+
+ /* All LNDs apart from the LOLND are in separate modules. They
+ * register themselves when their module loads, and unregister
+ * themselves when their module is unloaded. */
+ lnet_register_lnd(&the_lolnd);
+ return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace program needs to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount == 0);
+
+ while (!list_empty(&the_lnet.ln_lnds))
+ lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+ lnd_t, lnd_list));
+ lnet_destroy_locks();
+
+ the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+ int im_a_router = 0;
+ int rc;
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+ if (the_lnet.ln_refcount > 0) {
+ rc = the_lnet.ln_refcount++;
+ goto out;
+ }
+
+ lnet_get_tunables();
+
+ if (requested_pid == LNET_PID_ANY) {
+ /* Don't instantiate LNET just for me */
+ rc = -ENETDOWN;
+ goto failed0;
+ }
+
+ rc = lnet_prepare(requested_pid);
+ if (rc != 0)
+ goto failed0;
+
+ rc = lnet_startup_lndnis();
+ if (rc != 0)
+ goto failed1;
+
+ rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_check_routes();
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_rtrpools_alloc(im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_acceptor_start();
+ if (rc != 0)
+ goto failed2;
+
+ the_lnet.ln_refcount = 1;
+ /* Now I may use my own API functions... */
+
+ /* NB router checker needs the_lnet.ln_ping_info in
+ * lnet_router_checker -> lnet_update_ni_status_locked */
+ rc = lnet_ping_target_init();
+ if (rc != 0)
+ goto failed3;
+
+ rc = lnet_router_checker_start();
+ if (rc != 0)
+ goto failed4;
+
+ lnet_proc_init();
+ goto out;
+
+ failed4:
+ lnet_ping_target_fini();
+ failed3:
+ the_lnet.ln_refcount = 0;
+ lnet_acceptor_stop();
+ failed2:
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ failed1:
+ lnet_unprepare();
+ failed0:
+ LASSERT (rc < 0);
+ out:
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini()
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (the_lnet.ln_refcount != 1) {
+ the_lnet.ln_refcount--;
+ } else {
+ LASSERT (!the_lnet.ln_niinit_self);
+
+ lnet_proc_fini();
+ lnet_router_checker_stop();
+ lnet_ping_target_fini();
+
+ /* Teardown fns that use my own API functions BEFORE here */
+ the_lnet.ln_refcount = 0;
+
+ lnet_acceptor_stop();
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ lnet_unprepare();
+ }
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ lnet_process_id_t id = {0};
+ lnet_ni_t *ni;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_NI:
+ rc = LNetGetId(data->ioc_count, &id);
+ data->ioc_nid = id.nid;
+ return rc;
+
+ case IOC_LIBCFS_FAIL_NID:
+ return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+ case IOC_LIBCFS_ADD_ROUTE:
+ rc = lnet_add_route(data->ioc_net, data->ioc_count,
+ data->ioc_nid);
+ return (rc != 0) ? rc : lnet_check_routes();
+
+ case IOC_LIBCFS_DEL_ROUTE:
+ return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+ case IOC_LIBCFS_GET_ROUTE:
+ return lnet_get_route(data->ioc_count,
+ &data->ioc_net, &data->ioc_count,
+ &data->ioc_nid, &data->ioc_flags);
+ case IOC_LIBCFS_NOTIFY_ROUTER:
+ return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+ cfs_time_current() -
+ cfs_time_seconds(cfs_time_current_sec() -
+ (time_t)data->ioc_u64[0]));
+
+ case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+ /* This can be removed once lustre stops calling it */
+ return 0;
+
+ case IOC_LIBCFS_LNET_DIST:
+ rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+ if (rc < 0 && rc != -EHOSTUNREACH)
+ return rc;
+
+ data->ioc_u32[0] = rc;
+ return 0;
+
+ case IOC_LIBCFS_TESTPROTOCOMPAT:
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_testprotocompat = data->ioc_flags;
+ lnet_net_unlock(LNET_LOCK_EX);
+ return 0;
+
+ case IOC_LIBCFS_PING:
+ id.nid = data->ioc_nid;
+ id.pid = data->ioc_u32[0];
+ rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+ (lnet_process_id_t *)data->ioc_pbuf1,
+ data->ioc_plen1/sizeof(lnet_process_id_t));
+ if (rc < 0)
+ return rc;
+ data->ioc_count = rc;
+ return 0;
+
+ case IOC_LIBCFS_DEBUG_PEER: {
+ /* CAVEAT EMPTOR: this one designed for calling directly; not
+ * via an ioctl */
+ id = *((lnet_process_id_t *) arg);
+
+ lnet_debug_peer(id.nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(id.nid));
+ if (ni == NULL) {
+ CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+ } else {
+ if (ni->ni_lnd->lnd_ctl == NULL) {
+ CDEBUG(D_WARNING, "No ctl for %s\n",
+ libcfs_id2str(id));
+ } else {
+ (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+ }
+
+ lnet_ni_decref(ni);
+ }
+ return 0;
+ }
+
+ default:
+ ni = lnet_net2ni(data->ioc_net);
+ if (ni == NULL)
+ return -EINVAL;
+
+ if (ni->ni_lnd->lnd_ctl == NULL)
+ rc = -EINVAL;
+ else
+ rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+ lnet_ni_decref(ni);
+ return rc;
+ }
+ /* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that
+ * all interfaces share a same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+ int cpt;
+ int rc = -ENOENT;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ if (index-- != 0)
+ continue;
+
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ id->nid = ni->ni_nid;
+ id->pid = the_lnet.ln_pid;
+ rc = 0;
+ break;
+ }
+
+ lnet_net_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+ snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+ int i;
+ int n;
+ int rc;
+ unsigned int infosz;
+ lnet_ni_t *ni;
+ lnet_process_id_t id;
+ lnet_ping_info_t *pinfo;
+
+ for (n = 0; ; n++) {
+ rc = LNetGetId(n, &id);
+ if (rc == -ENOENT)
+ break;
+
+ LASSERT (rc == 0);
+ }
+
+ infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+ LIBCFS_ALLOC(pinfo, infosz);
+ if (pinfo == NULL) {
+ CERROR("Can't allocate ping info[%d]\n", n);
+ return -ENOMEM;
+ }
+
+ pinfo->pi_nnis = n;
+ pinfo->pi_pid = the_lnet.ln_pid;
+ pinfo->pi_magic = LNET_PROTO_PING_MAGIC;
+ pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+ for (i = 0; i < n; i++) {
+ lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+ rc = LNetGetId(i, &id);
+ LASSERT (rc == 0);
+
+ ns->ns_nid = id.nid;
+ ns->ns_status = LNET_NI_STATUS_UP;
+
+ lnet_net_lock(0);
+
+ ni = lnet_nid2ni_locked(id.nid, 0);
+ LASSERT(ni != NULL);
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status == NULL);
+ ni->ni_status = ns;
+ lnet_ni_unlock(ni);
+
+ lnet_ni_decref_locked(ni, 0);
+ lnet_net_unlock(0);
+ }
+
+ the_lnet.ln_ping_info = pinfo;
+ return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+ struct lnet_ni *ni;
+
+ lnet_net_lock(0);
+
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ lnet_ni_lock(ni);
+ ni->ni_status = NULL;
+ lnet_ni_unlock(ni);
+ }
+
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(the_lnet.ln_ping_info,
+ offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+ the_lnet.ln_ping_info = NULL;
+ return;
+}
+
+int
+lnet_ping_target_init(void)
+{
+ lnet_md_t md = {0};
+ lnet_handle_me_t meh;
+ lnet_process_id_t id;
+ int rc;
+ int rc2;
+ int infosz;
+
+ rc = lnet_create_ping_info();
+ if (rc != 0)
+ return rc;
+
+ /* We can have a tiny EQ since we only need to see the unlink event on
+ * teardown, which by definition is the last one! */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+ if (rc != 0) {
+ CERROR("Can't allocate ping EQ: %d\n", rc);
+ goto failed_0;
+ }
+
+ memset(&id, 0, sizeof(lnet_process_id_t));
+ id.nid = LNET_NID_ANY;
+ id.pid = LNET_PID_ANY;
+
+ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+ LNET_PROTO_PING_MATCHBITS, 0,
+ LNET_UNLINK, LNET_INS_AFTER,
+ &meh);
+ if (rc != 0) {
+ CERROR("Can't create ping ME: %d\n", rc);
+ goto failed_1;
+ }
+
+ /* initialize md content */
+ infosz = offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+ md.start = the_lnet.ln_ping_info;
+ md.length = infosz;
+ md.threshold = LNET_MD_THRESH_INF;
+ md.max_size = 0;
+ md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+ LNET_MD_MANAGE_REMOTE;
+ md.user_ptr = NULL;
+ md.eq_handle = the_lnet.ln_ping_target_eq;
+
+ rc = LNetMDAttach(meh, md,
+ LNET_RETAIN,
+ &the_lnet.ln_ping_target_md);
+ if (rc != 0) {
+ CERROR("Can't attach ping MD: %d\n", rc);
+ goto failed_2;
+ }
+
+ return 0;
+
+ failed_2:
+ rc2 = LNetMEUnlink(meh);
+ LASSERT (rc2 == 0);
+ failed_1:
+ rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT (rc2 == 0);
+ failed_0:
+ lnet_destroy_ping_info();
+ return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+ lnet_event_t event;
+ int rc;
+ int which;
+ int timeout_ms = 1000;
+ sigset_t blocked = cfs_block_allsigs();
+
+ LNetMDUnlink(the_lnet.ln_ping_target_md);
+ /* NB md could be busy; this just starts the unlink */
+
+ for (;;) {
+ rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+ timeout_ms, &event, &which);
+
+ /* I expect overflow... */
+ LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+ if (rc == 0) {
+ /* timed out: provide a diagnostic */
+ CWARN("Still waiting for ping MD to unlink\n");
+ timeout_ms *= 2;
+ continue;
+ }
+
+ /* Got a valid event */
+ if (event.unlinked)
+ break;
+ }
+
+ rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT (rc == 0);
+ lnet_destroy_ping_info();
+ cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+ lnet_handle_eq_t eqh;
+ lnet_handle_md_t mdh;
+ lnet_event_t event;
+ lnet_md_t md = {0};
+ int which;
+ int unlinked = 0;
+ int replied = 0;
+ const int a_long_time = 60000; /* mS */
+ int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+ lnet_ping_info_t *info;
+ lnet_process_id_t tmpid;
+ int i;
+ int nob;
+ int rc;
+ int rc2;
+ sigset_t blocked;
+
+ if (n_ids <= 0 ||
+ id.nid == LNET_NID_ANY ||
+ timeout_ms > 500000 || /* arbitrary limit! */
+ n_ids > 20) /* arbitrary limit! */
+ return -EINVAL;
+
+ if (id.pid == LNET_PID_ANY)
+ id.pid = LUSTRE_SRV_LNET_PID;
+
+ LIBCFS_ALLOC(info, infosz);
+ if (info == NULL)
+ return -ENOMEM;
+
+ /* NB 2 events max (including any unlink event) */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ: %d\n", rc);
+ goto out_0;
+ }
+
+ /* initialize md content */
+ md.start = info;
+ md.length = infosz;
+ md.threshold = 2; /*GET/REPLY*/
+ md.max_size = 0;
+ md.options = LNET_MD_TRUNCATE;
+ md.user_ptr = NULL;
+ md.eq_handle = eqh;
+
+ rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+ if (rc != 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out_1;
+ }
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id,
+ LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ if (rc != 0) {
+ /* Don't CERROR; this could be deliberate! */
+
+ rc2 = LNetMDUnlink(mdh);
+ LASSERT (rc2 == 0);
+
+ /* NB must wait for the UNLINK event below... */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ }
+
+ do {
+ /* MUST block for unlink to complete */
+ if (unlinked)
+ blocked = cfs_block_allsigs();
+
+ rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+ if (unlinked)
+ cfs_restore_sigs(blocked);
+
+ CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+ (rc2 <= 0) ? -1 : event.type,
+ (rc2 <= 0) ? -1 : event.status,
+ (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+ LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */
+
+ if (rc2 <= 0 || event.status != 0) {
+ /* timeout or error */
+ if (!replied && rc == 0)
+ rc = (rc2 < 0) ? rc2 :
+ (rc2 == 0) ? -ETIMEDOUT :
+ event.status;
+
+ if (!unlinked) {
+ /* Ensure completion in finite time... */
+ LNetMDUnlink(mdh);
+ /* No assertion (racing with network) */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ } else if (rc2 == 0) {
+ /* timed out waiting for unlink */
+ CWARN("ping %s: late network completion\n",
+ libcfs_id2str(id));
+ }
+ } else if (event.type == LNET_EVENT_REPLY) {
+ replied = 1;
+ rc = event.mlength;
+ }
+
+ } while (rc2 <= 0 || !event.unlinked);
+
+ if (!replied) {
+ if (rc >= 0)
+ CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+ libcfs_id2str(id));
+ rc = -EIO;
+ goto out_1;
+ }
+
+ nob = rc;
+ LASSERT (nob >= 0 && nob <= infosz);
+
+ rc = -EPROTO; /* if I can't parse... */
+
+ if (nob < 8) {
+ /* can't check magic/version */
+ CERROR("%s: ping info too short %d\n",
+ libcfs_id2str(id), nob);
+ goto out_1;
+ }
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+ lnet_swap_pinginfo(info);
+ } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CERROR("%s: Unexpected magic %08x\n",
+ libcfs_id2str(id), info->pi_magic);
+ goto out_1;
+ }
+
+ if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+ CERROR("%s: ping w/o NI status: 0x%x\n",
+ libcfs_id2str(id), info->pi_features);
+ goto out_1;
+ }
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+ CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+ goto out_1;
+ }
+
+ if (info->pi_nnis < n_ids)
+ n_ids = info->pi_nnis;
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+ CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+ goto out_1;
+ }
+
+ rc = -EFAULT; /* If I SEGV... */
+
+ for (i = 0; i < n_ids; i++) {
+ tmpid.pid = info->pi_pid;
+ tmpid.nid = info->pi_ni[i].ns_nid;
+ if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+ goto out_1;
+ }
+ rc = info->pi_nnis;
+
+ out_1:
+ rc2 = LNetEQFree(eqh);
+ if (rc2 != 0)
+ CERROR("rc2 %d\n", rc2);
+ LASSERT (rc2 == 0);
+
+ out_0:
+ LIBCFS_FREE(info, infosz);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 000000000000..28711e6e8b03
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/config.c
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct { /* tmp struct for parsing routes */
+ struct list_head ltb_list; /* stash on lists */
+ int ltb_size; /* allocated size */
+ char ltb_text[0]; /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0; /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB (4<<10)
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+ static char dots[LNET_SINGLE_TEXTBUF_NOB];
+ static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+ memset(dots, '.', sizeof(dots));
+ dots[sizeof(dots)-1] = 0;
+ memset(dashes, '-', sizeof(dashes));
+ dashes[sizeof(dashes)-1] = 0;
+
+ LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+ LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+ (int)strlen(name), dots, offset, dots,
+ (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+ switch (c) {
+ case '\n':
+ case '\r':
+ case ';':
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ list_for_each (tmp, nilist) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net)
+ return 0;
+ }
+
+ return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+ if (ni->ni_refs != NULL)
+ cfs_percpt_free(ni->ni_refs);
+
+ if (ni->ni_tx_queues != NULL)
+ cfs_percpt_free(ni->ni_tx_queues);
+
+ if (ni->ni_cpts != NULL)
+ cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+ LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+ struct lnet_tx_queue *tq;
+ struct lnet_ni *ni;
+ int rc;
+ int i;
+
+ if (!lnet_net_unique(net, nilist)) {
+ LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ni, sizeof(*ni));
+ if (ni == NULL) {
+ CERROR("Out of memory creating network %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ spin_lock_init(&ni->ni_lock);
+ INIT_LIST_HEAD(&ni->ni_cptlist);
+ ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_refs[0]));
+ if (ni->ni_refs == NULL)
+ goto failed;
+
+ ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_tx_queues[0]));
+ if (ni->ni_tx_queues == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+ INIT_LIST_HEAD(&tq->tq_delayed);
+
+ if (el == NULL) {
+ ni->ni_cpts = NULL;
+ ni->ni_ncpts = LNET_CPT_NUMBER;
+ } else {
+ rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+ if (rc <= 0) {
+ CERROR("Failed to set CPTs for NI %s: %d\n",
+ libcfs_net2str(net), rc);
+ goto failed;
+ }
+
+ LASSERT(rc <= LNET_CPT_NUMBER);
+ if (rc == LNET_CPT_NUMBER) {
+ LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+ ni->ni_cpts = NULL;
+ }
+
+ ni->ni_ncpts = rc;
+ }
+
+ /* LND will fill in the address part of the NID */
+ ni->ni_nid = LNET_MKNID(net, 0);
+ ni->ni_last_alive = cfs_time_current_sec();
+ list_add_tail(&ni->ni_list, nilist);
+ return ni;
+ failed:
+ lnet_ni_free(ni);
+ return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+ struct cfs_expr_list *el = NULL;
+ int tokensize = strlen(networks) + 1;
+ char *tokens;
+ char *str;
+ char *tmp;
+ struct lnet_ni *ni;
+ __u32 net;
+ int nnets = 0;
+
+ if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _WAY_ conservative */
+ LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+ "long\n");
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(tokens, tokensize);
+ if (tokens == NULL) {
+ CERROR("Can't allocate net tokens\n");
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_network_tokens = tokens;
+ the_lnet.ln_network_tokens_nob = tokensize;
+ memcpy (tokens, networks, tokensize);
+ str = tmp = tokens;
+
+ /* Add in the loopback network */
+ ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ while (str != NULL && *str != 0) {
+ char *comma = strchr(str, ',');
+ char *bracket = strchr(str, '(');
+ char *square = strchr(str, '[');
+ char *iface;
+ int niface;
+ int rc;
+
+ /* NB we don't check interface conflicts here; it's the LNDs
+ * responsibility (if it cares at all) */
+
+ if (square != NULL && (comma == NULL || square < comma)) {
+ /* i.e: o2ib0(ib0)[1,2], number between square
+ * brackets are CPTs this NI needs to be bond */
+ if (bracket != NULL && bracket > square) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ tmp = strchr(square, ']');
+ if (tmp == NULL) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ rc = cfs_expr_list_parse(square, tmp - square + 1,
+ 0, LNET_CPT_NUMBER - 1, &el);
+ if (rc != 0) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ while (square <= tmp)
+ *square++ = ' ';
+ }
+
+ if (bracket == NULL ||
+ (comma != NULL && comma < bracket)) {
+
+ /* no interface list specified */
+
+ if (comma != NULL)
+ *comma++ = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+ " type\n");
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+ lnet_ni_alloc(net, el, nilist) == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ str = comma;
+ continue;
+ }
+
+ *bracket = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ nnets++;
+ ni = lnet_ni_alloc(net, el, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ niface = 0;
+ iface = bracket + 1;
+
+ bracket = strchr(iface, ')');
+ if (bracket == NULL) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ *bracket = 0;
+ do {
+ comma = strchr(iface, ',');
+ if (comma != NULL)
+ *comma++ = 0;
+
+ iface = cfs_trimwhite(iface);
+ if (*iface == 0) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ if (niface == LNET_MAX_INTERFACES) {
+ LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+ "for net %s\n",
+ libcfs_net2str(net));
+ goto failed;
+ }
+
+ ni->ni_interfaces[niface++] = iface;
+ iface = comma;
+ } while (iface != NULL);
+
+ str = bracket + 1;
+ comma = strchr(bracket + 1, ',');
+ if (comma != NULL) {
+ *comma = 0;
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ str = comma + 1;
+ continue;
+ }
+
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ }
+
+ LASSERT(!list_empty(nilist));
+ return 0;
+
+ failed_syntax:
+ lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+ while (!list_empty(nilist)) {
+ ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ if (el != NULL)
+ cfs_expr_list_free(el);
+
+ LIBCFS_FREE(tokens, tokensize);
+ the_lnet.ln_network_tokens = NULL;
+
+ return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+ lnet_text_buf_t *ltb;
+ int nob;
+
+ /* NB allocate space for the terminating 0 */
+ nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+ if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _way_ conservative for "route net gateway..." */
+ CERROR("text buffer too big\n");
+ return NULL;
+ }
+
+ if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+ CERROR("Too many text buffers\n");
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ltb, nob);
+ if (ltb == NULL)
+ return NULL;
+
+ ltb->ltb_size = nob;
+ ltb->ltb_text[0] = 0;
+ lnet_tbnob += nob;
+ return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+ lnet_tbnob -= ltb->ltb_size;
+ LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+ lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+ struct list_head *tmp;
+ lnet_text_buf_t *ltb;
+
+ list_for_each (tmp, tbs) {
+ ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+ CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+ }
+
+ CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+ struct list_head pending;
+ char *sep;
+ int nob;
+ int i;
+ lnet_text_buf_t *ltb;
+
+ INIT_LIST_HEAD(&pending);
+
+ /* Split 'str' into separate commands */
+ for (;;) {
+ /* skip leading whitespace */
+ while (cfs_iswhite(*str))
+ str++;
+
+ /* scan for separator or comment */
+ for (sep = str; *sep != 0; sep++)
+ if (lnet_issep(*sep) || *sep == '#')
+ break;
+
+ nob = (int)(sep - str);
+ if (nob > 0) {
+ ltb = lnet_new_text_buf(nob);
+ if (ltb == NULL) {
+ lnet_free_text_bufs(&pending);
+ return -1;
+ }
+
+ for (i = 0; i < nob; i++)
+ if (cfs_iswhite(str[i]))
+ ltb->ltb_text[i] = ' ';
+ else
+ ltb->ltb_text[i] = str[i];
+
+ ltb->ltb_text[nob] = 0;
+
+ list_add_tail(&ltb->ltb_list, &pending);
+ }
+
+ if (*sep == '#') {
+ /* scan for separator */
+ do {
+ sep++;
+ } while (*sep != 0 && !lnet_issep(*sep));
+ }
+
+ if (*sep == 0)
+ break;
+
+ str = sep + 1;
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list,
+ char *str, char *sep1, char *sep2,
+ char *item, int itemlen)
+{
+ int len1 = (int)(sep1 - str);
+ int len2 = strlen(sep2 + 1);
+ lnet_text_buf_t *ltb;
+
+ LASSERT (*sep1 == '[');
+ LASSERT (*sep2 == ']');
+
+ ltb = lnet_new_text_buf(len1 + itemlen + len2);
+ if (ltb == NULL)
+ return -ENOMEM;
+
+ memcpy(ltb->ltb_text, str, len1);
+ memcpy(&ltb->ltb_text[len1], item, itemlen);
+ memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+ ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+ list_add_tail(&ltb->ltb_list, list);
+ return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+ char num[16];
+ struct list_head pending;
+ char *sep;
+ char *sep2;
+ char *parsed;
+ char *enditem;
+ int lo;
+ int hi;
+ int stride;
+ int i;
+ int nob;
+ int scanned;
+
+ INIT_LIST_HEAD(&pending);
+
+ sep = strchr(str, '[');
+ if (sep == NULL) /* nothing to expand */
+ return 0;
+
+ sep2 = strchr(sep, ']');
+ if (sep2 == NULL)
+ goto failed;
+
+ for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+ enditem = ++parsed;
+ while (enditem < sep2 && *enditem != ',')
+ enditem++;
+
+ if (enditem == parsed) /* no empty items */
+ goto failed;
+
+ if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+ if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+ /* simple string enumeration */
+ if (lnet_expand1tb(&pending, str, sep, sep2,
+ parsed, (int)(enditem - parsed)) != 0)
+ goto failed;
+
+ continue;
+ }
+
+ stride = 1;
+ }
+
+ /* range expansion */
+
+ if (enditem != parsed + scanned) /* no trailing junk */
+ goto failed;
+
+ if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+ (hi - lo) % stride != 0)
+ goto failed;
+
+ for (i = lo; i <= hi; i += stride) {
+
+ snprintf(num, sizeof(num), "%d", i);
+ nob = strlen(num);
+ if (nob + 1 == sizeof(num))
+ goto failed;
+
+ if (lnet_expand1tb(&pending, str, sep, sep2,
+ num, nob) != 0)
+ goto failed;
+ }
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 1;
+
+ failed:
+ lnet_free_text_bufs(&pending);
+ return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+ int len = strlen(str);
+ int nob = len;
+
+ return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+ nob == len &&
+ *hops > 0 && *hops < 256);
+}
+
+
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+ /* static scratch buffer OK (single threaded) */
+ static char cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head nets;
+ struct list_head gateways;
+ struct list_head *tmp1;
+ struct list_head *tmp2;
+ __u32 net;
+ lnet_nid_t nid;
+ lnet_text_buf_t *ltb;
+ int rc;
+ char *sep;
+ char *token = str;
+ int ntokens = 0;
+ int myrc = -1;
+ unsigned int hops;
+ int got_hops = 0;
+
+ INIT_LIST_HEAD(&gateways);
+ INIT_LIST_HEAD(&nets);
+
+ /* save a copy of the string for error messages */
+ strncpy(cmd, str, sizeof(cmd) - 1);
+ cmd[sizeof(cmd) - 1] = 0;
+
+ sep = str;
+ for (;;) {
+ /* scan for token start */
+ while (cfs_iswhite(*sep))
+ sep++;
+ if (*sep == 0) {
+ if (ntokens < (got_hops ? 3 : 2))
+ goto token_error;
+ break;
+ }
+
+ ntokens++;
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !cfs_iswhite(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens == 1) {
+ tmp2 = &nets; /* expanding nets */
+ } else if (ntokens == 2 &&
+ lnet_parse_hops(token, &hops)) {
+ got_hops = 1; /* got a hop count */
+ continue;
+ } else {
+ tmp2 = &gateways; /* expanding gateways */
+ }
+
+ ltb = lnet_new_text_buf(strlen(token));
+ if (ltb == NULL)
+ goto out;
+
+ strcpy(ltb->ltb_text, token);
+ tmp1 = &ltb->ltb_list;
+ list_add_tail(tmp1, tmp2);
+
+ while (tmp1 != tmp2) {
+ ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+ rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+ if (rc < 0)
+ goto token_error;
+
+ tmp1 = tmp1->next;
+
+ if (rc > 0) { /* expanded! */
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ continue;
+ }
+
+ if (ntokens == 1) {
+ net = libcfs_str2net(ltb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND)
+ goto token_error;
+ } else {
+ nid = libcfs_str2nid(ltb->ltb_text);
+ if (nid == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ goto token_error;
+ }
+ }
+ }
+
+ if (!got_hops)
+ hops = 1;
+
+ LASSERT (!list_empty(&nets));
+ LASSERT (!list_empty(&gateways));
+
+ list_for_each (tmp1, &nets) {
+ ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+ net = libcfs_str2net(ltb->ltb_text);
+ LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each (tmp2, &gateways) {
+ ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+ nid = libcfs_str2nid(ltb->ltb_text);
+ LASSERT (nid != LNET_NID_ANY);
+
+ if (lnet_islocalnid(nid)) {
+ *im_a_router = 1;
+ continue;
+ }
+
+ rc = lnet_add_route (net, hops, nid);
+ if (rc != 0) {
+ CERROR("Can't create route "
+ "to %s via %s\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid));
+ goto out;
+ }
+ }
+ }
+
+ myrc = 0;
+ goto out;
+
+ token_error:
+ lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+ lnet_free_text_bufs(&nets);
+ lnet_free_text_bufs(&gateways);
+ return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+ lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+ if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+ lnet_free_text_bufs(tbs);
+ return -EINVAL;
+ }
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+
+ return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+ struct list_head tbs;
+ int rc = 0;
+
+ *im_a_router = 0;
+
+ INIT_LIST_HEAD(&tbs);
+
+ if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+ CERROR("Error parsing routes\n");
+ rc = -EINVAL;
+ } else {
+ rc = lnet_parse_route_tbs(&tbs, im_a_router);
+ }
+
+ LASSERT (lnet_tbnob == 0);
+ return rc;
+}
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+ LIST_HEAD (list);
+ int rc;
+ int i;
+
+ rc = cfs_ip_addr_parse(token, len, &list);
+ if (rc != 0)
+ return rc;
+
+ for (rc = i = 0; !rc && i < nip; i++)
+ rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+ cfs_ip_addr_free(&list);
+
+ return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+ static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+ int matched = 0;
+ int ntokens = 0;
+ int len;
+ char *net = NULL;
+ char *sep;
+ char *token;
+ int rc;
+
+ LASSERT (strlen(net_entry) < sizeof(tokens));
+
+ /* work on a copy of the string */
+ strcpy(tokens, net_entry);
+ sep = tokens;
+ for (;;) {
+ /* scan for token start */
+ while (cfs_iswhite(*sep))
+ sep++;
+ if (*sep == 0)
+ break;
+
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !cfs_iswhite(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens++ == 0) {
+ net = token;
+ continue;
+ }
+
+ len = strlen(token);
+
+ rc = lnet_match_network_token(token, len, ipaddrs, nip);
+ if (rc < 0) {
+ lnet_syntax("ip2nets", net_entry,
+ (int)(token - tokens), len);
+ return rc;
+ }
+
+ matched |= (rc != 0);
+ }
+
+ if (!matched)
+ return 0;
+
+ strcpy(net_entry, net); /* replace with matched net */
+ return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+ char *bracket = strchr(netspec, '(');
+ __u32 net;
+
+ if (bracket != NULL)
+ *bracket = 0;
+
+ net = libcfs_str2net(netspec);
+
+ if (bracket != NULL)
+ *bracket = '(';
+
+ return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+ int offset = 0;
+ int offset2;
+ int len;
+ lnet_text_buf_t *tb;
+ lnet_text_buf_t *tb2;
+ struct list_head *t;
+ char *sep;
+ char *bracket;
+ __u32 net;
+
+ LASSERT (!list_empty(nets));
+ LASSERT (nets->next == nets->prev); /* single entry */
+
+ tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+ for (;;) {
+ sep = strchr(tb->ltb_text, ',');
+ bracket = strchr(tb->ltb_text, '(');
+
+ if (sep != NULL &&
+ bracket != NULL &&
+ bracket < sep) {
+ /* netspec lists interfaces... */
+
+ offset2 = offset + (int)(bracket - tb->ltb_text);
+ len = strlen(bracket);
+
+ bracket = strchr(bracket + 1, ')');
+
+ if (bracket == NULL ||
+ !(bracket[1] == ',' || bracket[1] == 0)) {
+ lnet_syntax("ip2nets", source, offset2, len);
+ return -EINVAL;
+ }
+
+ sep = (bracket[1] == 0) ? NULL : bracket + 1;
+ }
+
+ if (sep != NULL)
+ *sep++ = 0;
+
+ net = lnet_netspec2net(tb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+
+ list_for_each(t, nets) {
+ tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+ if (tb2 == tb)
+ continue;
+
+ if (net == lnet_netspec2net(tb2->ltb_text)) {
+ /* duplicate network */
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+ }
+
+ if (sep == NULL)
+ return 0;
+
+ offset += (int)(sep - tb->ltb_text);
+ tb2 = lnet_new_text_buf(strlen(sep));
+ if (tb2 == NULL)
+ return -ENOMEM;
+
+ strcpy(tb2->ltb_text, sep);
+ list_add_tail(&tb2->ltb_list, nets);
+
+ tb = tb2;
+ }
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+ static char networks[LNET_SINGLE_TEXTBUF_NOB];
+ static char source[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head raw_entries;
+ struct list_head matched_nets;
+ struct list_head current_nets;
+ struct list_head *t;
+ struct list_head *t2;
+ lnet_text_buf_t *tb;
+ lnet_text_buf_t *tb2;
+ __u32 net1;
+ __u32 net2;
+ int len;
+ int count;
+ int dup;
+ int rc;
+
+ INIT_LIST_HEAD(&raw_entries);
+ if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+ CERROR("Error parsing ip2nets\n");
+ LASSERT (lnet_tbnob == 0);
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&matched_nets);
+ INIT_LIST_HEAD(&current_nets);
+ networks[0] = 0;
+ count = 0;
+ len = 0;
+ rc = 0;
+
+ while (!list_empty(&raw_entries)) {
+ tb = list_entry(raw_entries.next, lnet_text_buf_t,
+ ltb_list);
+
+ strncpy(source, tb->ltb_text, sizeof(source)-1);
+ source[sizeof(source)-1] = 0;
+
+ /* replace ltb_text with the network(s) add on match */
+ rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+ if (rc < 0)
+ break;
+
+ list_del(&tb->ltb_list);
+
+ if (rc == 0) { /* no match */
+ lnet_free_text_buf(tb);
+ continue;
+ }
+
+ /* split into separate networks */
+ INIT_LIST_HEAD(&current_nets);
+ list_add(&tb->ltb_list, &current_nets);
+ rc = lnet_splitnets(source, &current_nets);
+ if (rc < 0)
+ break;
+
+ dup = 0;
+ list_for_each (t, &current_nets) {
+ tb = list_entry(t, lnet_text_buf_t, ltb_list);
+ net1 = lnet_netspec2net(tb->ltb_text);
+ LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each(t2, &matched_nets) {
+ tb2 = list_entry(t2, lnet_text_buf_t,
+ ltb_list);
+ net2 = lnet_netspec2net(tb2->ltb_text);
+ LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+ if (net1 == net2) {
+ dup = 1;
+ break;
+ }
+ }
+
+ if (dup)
+ break;
+ }
+
+ if (dup) {
+ lnet_free_text_bufs(&current_nets);
+ continue;
+ }
+
+ list_for_each_safe(t, t2, &current_nets) {
+ tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+ list_del(&tb->ltb_list);
+ list_add_tail(&tb->ltb_list, &matched_nets);
+
+ len += snprintf(networks + len, sizeof(networks) - len,
+ "%s%s", (len == 0) ? "" : ",",
+ tb->ltb_text);
+
+ if (len >= sizeof(networks)) {
+ CERROR("Too many matched networks\n");
+ rc = -E2BIG;
+ goto out;
+ }
+ }
+
+ count++;
+ }
+
+ out:
+ lnet_free_text_bufs(&raw_entries);
+ lnet_free_text_bufs(&matched_nets);
+ lnet_free_text_bufs(&current_nets);
+ LASSERT (lnet_tbnob == 0);
+
+ if (rc < 0)
+ return rc;
+
+ *networksp = networks;
+ return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+ LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+ int up;
+ __u32 netmask;
+ __u32 *ipaddrs;
+ __u32 *ipaddrs2;
+ int nip;
+ char **ifnames;
+ int nif = libcfs_ipif_enumerate(&ifnames);
+ int i;
+ int rc;
+
+ if (nif <= 0)
+ return nif;
+
+ LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+ if (ipaddrs == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nif);
+ libcfs_ipif_free_enumeration(ifnames, nif);
+ return -ENOMEM;
+ }
+
+ for (i = nip = 0; i < nif; i++) {
+ if (!strcmp(ifnames[i], "lo"))
+ continue;
+
+ rc = libcfs_ipif_query(ifnames[i], &up,
+ &ipaddrs[nip], &netmask);
+ if (rc != 0) {
+ CWARN("Can't query interface %s: %d\n",
+ ifnames[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Ignoring interface %s: it's down\n",
+ ifnames[i]);
+ continue;
+ }
+
+ nip++;
+ }
+
+ libcfs_ipif_free_enumeration(ifnames, nif);
+
+ if (nip == nif) {
+ *ipaddrsp = ipaddrs;
+ } else {
+ if (nip > 0) {
+ LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+ if (ipaddrs2 == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nip);
+ nip = -ENOMEM;
+ } else {
+ memcpy(ipaddrs2, ipaddrs,
+ nip * sizeof(*ipaddrs));
+ *ipaddrsp = ipaddrs2;
+ rc = nip;
+ }
+ }
+ lnet_ipaddr_free_enumeration(ipaddrs, nif);
+ }
+ return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+ __u32 *ipaddrs;
+ int nip = lnet_ipaddr_enumerate(&ipaddrs);
+ int rc;
+
+ if (nip < 0) {
+ LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+ "interfaces for ip2nets to match\n", nip);
+ return nip;
+ }
+
+ if (nip == 0) {
+ LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+ "for ip2nets to match\n");
+ return -ENOENT;
+ }
+
+ rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+ lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+ if (rc < 0) {
+ LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+ return rc;
+ }
+
+ if (rc == 0) {
+ LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+ "any local IP interfaces\n");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+ __u32 net = LNET_NIDNET(ni->ni_nid);
+ char **names;
+ int n;
+ __u32 ip;
+ __u32 netmask;
+ int up;
+ int i;
+ int rc;
+
+ /* Convenience for LNDs that use the IP address of a local interface as
+ * the local address part of their NID */
+
+ if (ni->ni_interfaces[0] != NULL) {
+
+ CLASSERT (LNET_MAX_INTERFACES > 1);
+
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Net %s doesn't support multiple interfaces\n",
+ libcfs_net2str(net));
+ return -EPERM;
+ }
+
+ rc = libcfs_ipif_query(ni->ni_interfaces[0],
+ &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), ni->ni_interfaces[0], rc);
+ return -EPERM;
+ }
+
+ if (!up) {
+ CERROR("Net %s can't use interface %s: it's down\n",
+ libcfs_net2str(net), ni->ni_interfaces[0]);
+ return -ENETDOWN;
+ }
+
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ n = libcfs_ipif_enumerate(&names);
+ if (n <= 0) {
+ CERROR("Net %s can't enumerate interfaces: %d\n",
+ libcfs_net2str(net), n);
+ return 0;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+ continue;
+
+ rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+ if (rc != 0) {
+ CWARN("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), names[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Net %s ignoring interface %s (down)\n",
+ libcfs_net2str(net), names[i]);
+ continue;
+ }
+
+ libcfs_ipif_free_enumeration(names, n);
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+ libcfs_ipif_free_enumeration(names, n);
+ return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 000000000000..78297a7d94e8
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If an parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+ lnet_handle_eq_t *handle)
+{
+ lnet_eq_t *eq;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+ * overflow, they don't skip entries, so the queue has the same
+ * apparent capacity at all times */
+
+ count = cfs_power2_roundup(count);
+
+ if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+ CWARN("EQ callback is guaranteed to get every event, "
+ "do you still want to set eqcount %d for polling "
+ "event which will have locking overhead? "
+ "Please contact with developer to confirm\n", count);
+ }
+
+ /* count can be 0 if only need callback, we can eliminate
+ * overhead of enqueue event */
+ if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+ return -EINVAL;
+
+ eq = lnet_eq_alloc();
+ if (eq == NULL)
+ return -ENOMEM;
+
+ if (count != 0) {
+ LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+ if (eq->eq_events == NULL)
+ goto failed;
+ /* NB allocator has set all event sequence numbers to 0,
+ * so all them should be earlier than eq_deq_seq */
+ }
+
+ eq->eq_deq_seq = 1;
+ eq->eq_enq_seq = 1;
+ eq->eq_size = count;
+ eq->eq_callback = callback;
+
+ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*eq->eq_refs[0]));
+ if (eq->eq_refs == NULL)
+ goto failed;
+
+ /* MUST hold both exclusive lnet_res_lock */
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_eq2handle(handle, eq);
+ return 0;
+
+failed:
+ if (eq->eq_events != NULL)
+ LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+ if (eq->eq_refs != NULL)
+ cfs_percpt_free(eq->eq_refs);
+
+ lnet_eq_free(eq);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+ struct lnet_eq *eq;
+ lnet_event_t *events = NULL;
+ int **refs = NULL;
+ int *ref;
+ int rc = 0;
+ int size = 0;
+ int i;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ eq = lnet_handle2eq(&eqh);
+ if (eq == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ cfs_percpt_for_each(ref, i, eq->eq_refs) {
+ LASSERT(*ref >= 0);
+ if (*ref == 0)
+ continue;
+
+ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+ i, *ref);
+ rc = -EBUSY;
+ goto out;
+ }
+
+ /* stash for free after lock dropped */
+ events = eq->eq_events;
+ size = eq->eq_size;
+ refs = eq->eq_refs;
+
+ lnet_res_lh_invalidate(&eq->eq_lh);
+ list_del(&eq->eq_list);
+ lnet_eq_free_locked(eq);
+ out:
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ if (events != NULL)
+ LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+ if (refs != NULL)
+ cfs_percpt_free(refs);
+
+ return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */
+ int index;
+
+ if (eq->eq_size == 0) {
+ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+ eq->eq_callback(ev);
+ return;
+ }
+
+ lnet_eq_wait_lock();
+ ev->sequence = eq->eq_enq_seq++;
+
+ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+ index = ev->sequence & (eq->eq_size - 1);
+
+ eq->eq_events[index] = *ev;
+
+ if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+ eq->eq_callback(ev);
+
+ /* Wake anyone waiting in LNetEQPoll() */
+ if (waitqueue_active(&the_lnet.ln_eq_waitq))
+ wake_up_all(&the_lnet.ln_eq_waitq);
+ lnet_eq_wait_unlock();
+}
+
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ int new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+ lnet_event_t *new_event = &eq->eq_events[new_index];
+ int rc;
+ ENTRY;
+
+ /* must called with lnet_eq_wait_lock hold */
+ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+ RETURN(0);
+
+ /* We've got a new event... */
+ *ev = *new_event;
+
+ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+ new_event, eq->eq_deq_seq, eq->eq_size);
+
+ /* ...but did it overwrite an event we've not seen yet? */
+ if (eq->eq_deq_seq == new_event->sequence) {
+ rc = 1;
+ } else {
+ /* don't complain with CERROR: some EQs are sized small
+ * anyway; if it's important, the caller should complain */
+ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+ eq->eq_deq_seq, new_event->sequence);
+ rc = -EOVERFLOW;
+ }
+
+ eq->eq_deq_seq = new_event->sequence + 1;
+ RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0 No pending event in the EQ.
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, 0,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+ int tms = *timeout_ms;
+ int wait;
+ wait_queue_t wl;
+ cfs_time_t now;
+
+ if (tms == 0)
+ return -1; /* don't want to wait and no new event */
+
+ init_waitqueue_entry_current(&wl);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ lnet_eq_wait_unlock();
+
+ if (tms < 0) {
+ waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+ } else {
+ struct timeval tv;
+
+ now = cfs_time_current();
+ waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+ cfs_time_seconds(tms) / 1000);
+ cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+ tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+ if (tms < 0) /* no more wait but may have new event */
+ tms = 0;
+ }
+
+ wait = tms != 0; /* might need to call here again */
+ *timeout_ms = tms;
+
+ lnet_eq_wait_lock();
+ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0 No pending event in the EQs after timeout.
+ * \retval 1 Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+ lnet_event_t *event, int *which)
+{
+ int wait = 1;
+ int rc;
+ int i;
+ ENTRY;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (neq < 1)
+ RETURN(-ENOENT);
+
+ lnet_eq_wait_lock();
+
+ for (;;) {
+ for (i = 0; i < neq; i++) {
+ lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+ if (eq == NULL) {
+ lnet_eq_wait_unlock();
+ RETURN(-ENOENT);
+ }
+
+ rc = lnet_eq_dequeue_event(eq, event);
+ if (rc != 0) {
+ lnet_eq_wait_unlock();
+ *which = i;
+ RETURN(rc);
+ }
+ }
+
+ if (wait == 0)
+ break;
+
+ /*
+ * return value of lnet_eq_wait_locked:
+ * -1 : did nothing and it's sure no new event
+ * 1 : sleep inside and wait until new event
+ * 0 : don't want to wait anymore, but might have new event
+ * so need to call dequeue again
+ */
+ wait = lnet_eq_wait_locked(&timeout_ms);
+ if (wait < 0) /* no new event */
+ break;
+ }
+
+ lnet_eq_wait_unlock();
+ RETURN(0);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 000000000000..ae643f26933b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+ if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+ /* first unlink attempt... */
+ lnet_me_t *me = md->md_me;
+
+ md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+ /* Disassociate from ME (if any), and unlink it if it was created
+ * with LNET_UNLINK */
+ if (me != NULL) {
+ /* detach MD from portal */
+ lnet_ptl_detach_md(me, md);
+ if (me->me_unlink == LNET_UNLINK)
+ lnet_me_unlink(me);
+ }
+
+ /* ensure all future handle lookups fail */
+ lnet_res_lh_invalidate(&md->md_lh);
+ }
+
+ if (md->md_refcount != 0) {
+ CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+ return;
+ }
+
+ CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+ if (md->md_eq != NULL) {
+ int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+ LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+ (*md->md_eq->eq_refs[cpt])--;
+ }
+
+ LASSERT(!list_empty(&md->md_list));
+ list_del_init(&md->md_list);
+ lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+ int i;
+ unsigned int niov;
+ int total_length = 0;
+
+ lmd->md_me = NULL;
+ lmd->md_start = umd->start;
+ lmd->md_offset = 0;
+ lmd->md_max_size = umd->max_size;
+ lmd->md_options = umd->options;
+ lmd->md_user_ptr = umd->user_ptr;
+ lmd->md_eq = NULL;
+ lmd->md_threshold = umd->threshold;
+ lmd->md_refcount = 0;
+ lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+ if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+ if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+ return -EINVAL;
+
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.iov, umd->start,
+ niov * sizeof (lmd->md_iov.iov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the base address on trust */
+ if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+ return -EINVAL;
+
+ total_length += lmd->md_iov.iov[i].iov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) // illegal max_size
+ return -EINVAL;
+
+ } else if ((umd->options & LNET_MD_KIOV) != 0) {
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.kiov, umd->start,
+ niov * sizeof (lmd->md_iov.kiov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the page pointer on trust */
+ if (lmd->md_iov.kiov[i].kiov_offset +
+ lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE )
+ return -EINVAL; /* invalid length */
+
+ total_length += lmd->md_iov.kiov[i].kiov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) // illegal max_size
+ return -EINVAL;
+ } else { /* contiguous */
+ lmd->md_length = umd->length;
+ lmd->md_niov = niov = 1;
+ lmd->md_iov.iov[0].iov_base = umd->start;
+ lmd->md_iov.iov[0].iov_len = umd->length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > (int)umd->length)) // illegal max_size
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+ struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+ /* NB we are passed an allocated, but inactive md.
+ * if we return success, caller may lnet_md_unlink() it.
+ * otherwise caller may only lnet_md_free() it.
+ */
+ /* This implementation doesn't know how to create START events or
+ * disable END events. Best to LASSERT our caller is compliant so
+ * we find out quickly... */
+ /* TODO - reevaluate what should be here in light of
+ * the removal of the start and end events
+ * maybe there we shouldn't even allow LNET_EQ_NONE!)
+ * LASSERT (eq == NULL);
+ */
+ if (!LNetHandleIsInvalid(eq_handle)) {
+ md->md_eq = lnet_handle2eq(&eq_handle);
+
+ if (md->md_eq == NULL)
+ return -ENOENT;
+
+ (*md->md_eq->eq_refs[cpt])++;
+ }
+
+ lnet_res_lh_initialize(container, &md->md_lh);
+
+ LASSERT(list_empty(&md->md_list));
+ list_add(&md->md_list, &container->rec_active);
+
+ return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+ /* NB this doesn't copy out all the iov entries so when a
+ * discontiguous MD is copied out, the target gets to know the
+ * original iov pointer (in start) and the number of entries it had
+ * and that's all.
+ */
+ umd->start = lmd->md_start;
+ umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+ lmd->md_length : lmd->md_niov;
+ umd->threshold = lmd->md_threshold;
+ umd->max_size = lmd->md_max_size;
+ umd->options = lmd->md_options;
+ umd->user_ptr = lmd->md_user_ptr;
+ lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+ if (umd->start == NULL && umd->length != 0) {
+ CERROR("MD start pointer can not be NULL with length %u\n",
+ umd->length);
+ return -EINVAL;
+ }
+
+ if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+ umd->length > LNET_MAX_IOV) {
+ CERROR("Invalid option: too many fragments %u, %d max\n",
+ umd->length, LNET_MAX_IOV);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+ lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ LIST_HEAD (matches);
+ LIST_HEAD (drops);
+ struct lnet_me *me;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+ CERROR("Invalid option: no MD_OP set\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+
+ lnet_res_lock(cpt);
+ if (rc != 0)
+ goto failed;
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL)
+ rc = -ENOENT;
+ else if (me->me_md != NULL)
+ rc = -EBUSY;
+ else
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+ if (rc != 0)
+ goto failed;
+
+ /* attach this MD to portal of ME and check if it matches any
+ * blocked msgs on this portal */
+ lnet_ptl_attach_md(me, md, &matches, &drops);
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+
+ lnet_drop_delayed_msg_list(&drops, "Bad match");
+ lnet_recv_delayed_msg_list(&matches);
+
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ lnet_libmd_t *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+ CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+
+ cpt = lnet_res_lock_current();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+ if (rc != 0)
+ goto failed;
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ * and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ * returns, and the unlinked event will be piggybacked on the event of
+ * the completion of the last operation by setting the unlinked field of
+ * the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+ lnet_event_t ev;
+ lnet_libmd_t *md;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+ * when the NAL is done, the completion event flags that the MD was
+ * unlinked. Otherwise, we enqueue an event now... */
+
+ if (md->md_eq != NULL &&
+ md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+
+ lnet_md_unlink(md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 000000000000..0081075cabee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-me.c
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_me *me;
+ struct list_head *head;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if ((int)portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ mtable = lnet_mt_of_attach(portal, match_id,
+ match_bits, ignore_bits, pos);
+ if (mtable == NULL) /* can't match portal type */
+ return -EPERM;
+
+ me = lnet_me_alloc();
+ if (me == NULL)
+ return -ENOMEM;
+
+ lnet_res_lock(mtable->mt_cpt);
+
+ me->me_portal = portal;
+ me->me_match_id = match_id;
+ me->me_match_bits = match_bits;
+ me->me_ignore_bits = ignore_bits;
+ me->me_unlink = unlink;
+ me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+ &me->me_lh);
+ if (ignore_bits != 0)
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+ me->me_pos = head - &mtable->mt_mhash[0];
+ if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+ list_add_tail(&me->me_list, head);
+ else
+ list_add(&me->me_list, head);
+
+ lnet_me2handle(handle, me);
+
+ lnet_res_unlock(mtable->mt_cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create and a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0 On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_me *current_me;
+ struct lnet_me *new_me;
+ struct lnet_portal *ptl;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (pos == LNET_INS_LOCAL)
+ return -EPERM;
+
+ new_me = lnet_me_alloc();
+ if (new_me == NULL)
+ return -ENOMEM;
+
+ cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+ lnet_res_lock(cpt);
+
+ current_me = lnet_handle2me(&current_meh);
+ if (current_me == NULL) {
+ lnet_me_free_locked(new_me);
+
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+ ptl = the_lnet.ln_portals[current_me->me_portal];
+ if (lnet_ptl_is_unique(ptl)) {
+ /* nosense to insertion on unique portal */
+ lnet_me_free_locked(new_me);
+ lnet_res_unlock(cpt);
+ return -EPERM;
+ }
+
+ new_me->me_pos = current_me->me_pos;
+ new_me->me_portal = current_me->me_portal;
+ new_me->me_match_id = match_id;
+ new_me->me_match_bits = match_bits;
+ new_me->me_ignore_bits = ignore_bits;
+ new_me->me_unlink = unlink;
+ new_me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+ if (pos == LNET_INS_AFTER)
+ list_add(&new_me->me_list, &current_me->me_list);
+ else
+ list_add_tail(&new_me->me_list, &current_me->me_list);
+
+ lnet_me2handle(handle, new_me);
+
+ lnet_res_unlock(cpt);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+ lnet_me_t *me;
+ lnet_libmd_t *md;
+ lnet_event_t ev;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+ lnet_res_lock(cpt);
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ md = me->me_md;
+ if (md != NULL &&
+ md->md_eq != NULL &&
+ md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+
+ lnet_me_unlink(me);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+ list_del(&me->me_list);
+
+ if (me->me_md != NULL) {
+ lnet_libmd_t *md = me->me_md;
+
+ /* detach MD from portal of this ME */
+ lnet_ptl_detach_md(me, md);
+ lnet_md_unlink(md);
+ }
+
+ lnet_res_lh_invalidate(&me->me_lh);
+ lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+ CWARN("Match Entry %p ("LPX64")\n", me,
+ me->me_lh.lh_cookie);
+
+ CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+ me->me_match_bits, me->me_ignore_bits);
+
+ CWARN("\tMD\t= %p\n", me->md);
+ CWARN("\tprev\t= %p\n",
+ list_entry(me->me_list.prev, lnet_me_t, me_list));
+ CWARN("\tnext\t= %p\n",
+ list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 000000000000..49b0f1287a69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+ "Reserved");
+
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+
+ LASSERT (the_lnet.ln_init);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ if (threshold != 0) {
+ /* Adding a new entry */
+ LIBCFS_ALLOC(tp, sizeof(*tp));
+ if (tp == NULL)
+ return -ENOMEM;
+
+ tp->tp_nid = nid;
+ tp->tp_threshold = threshold;
+
+ lnet_net_lock(0);
+ list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+ lnet_net_unlock(0);
+ return 0;
+ }
+
+ /* removing entries */
+ INIT_LIST_HEAD(&cull);
+
+ lnet_net_lock(0);
+
+ list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0 || /* needs culling anyway */
+ nid == LNET_NID_ANY || /* removing all entries */
+ tp->tp_nid == nid) /* matched this one */
+ {
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty (&cull)) {
+ tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+ list_del (&tp->tp_list);
+ LIBCFS_FREE(tp, sizeof (*tp));
+ }
+ return 0;
+}
+
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+ int fail = 0;
+
+ INIT_LIST_HEAD (&cull);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ lnet_net_lock(0);
+
+ list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0) {
+ /* zombie entry */
+ if (outgoing) {
+ /* only cull zombies on outgoing tests,
+ * since we may be at interrupt priority on
+ * incoming messages. */
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ continue;
+ }
+
+ if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+ nid == tp->tp_nid) { /* fail this peer */
+ fail = 1;
+
+ if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+ tp->tp_threshold--;
+ if (outgoing &&
+ tp->tp_threshold == 0) {
+ /* see above */
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ }
+ break;
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty (&cull)) {
+ tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+ list_del (&tp->tp_list);
+
+ LIBCFS_FREE(tp, sizeof (*tp));
+ }
+
+ return (fail);
+}
+
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (iov++)->iov_len;
+
+ return (nob);
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+ unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+
+ if (nob == 0)
+ return;
+
+ /* skip complete frags before 'doffset' */
+ LASSERT (ndiov > 0);
+ while (doffset >= diov->iov_len) {
+ doffset -= diov->iov_len;
+ diov++;
+ ndiov--;
+ LASSERT (ndiov > 0);
+ }
+
+ /* skip complete frags before 'soffset' */
+ LASSERT (nsiov > 0);
+ while (soffset >= siov->iov_len) {
+ soffset -= siov->iov_len;
+ siov++;
+ nsiov--;
+ LASSERT (nsiov > 0);
+ }
+
+ do {
+ LASSERT (ndiov > 0);
+ LASSERT (nsiov > 0);
+ this_nob = MIN(diov->iov_len - doffset,
+ siov->iov_len - soffset);
+ this_nob = MIN(this_nob, nob);
+
+ memcpy ((char *)diov->iov_base + doffset,
+ (char *)siov->iov_base + soffset, this_nob);
+ nob -= this_nob;
+
+ if (diov->iov_len > doffset + this_nob) {
+ doffset += this_nob;
+ } else {
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->iov_len > soffset + this_nob) {
+ soffset += this_nob;
+ } else {
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return (0); /* no frags */
+
+ LASSERT (src_niov > 0);
+ while (offset >= src->iov_len) { /* skip initial frags */
+ offset -= src->iov_len;
+ src_niov--;
+ src++;
+ LASSERT (src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT (src_niov > 0);
+ LASSERT ((int)niov <= dst_niov);
+
+ frag_len = src->iov_len - offset;
+ dst->iov_base = ((char *)src->iov_base) + offset;
+
+ if (len <= frag_len) {
+ dst->iov_len = len;
+ return (niov);
+ }
+
+ dst->iov_len = frag_len;
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (kiov++)->kiov_len;
+
+ return (nob);
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+ unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+ char *daddr = NULL;
+ char *saddr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (ndiov > 0);
+ while (doffset >= diov->kiov_len) {
+ doffset -= diov->kiov_len;
+ diov++;
+ ndiov--;
+ LASSERT (ndiov > 0);
+ }
+
+ LASSERT (nsiov > 0);
+ while (soffset >= siov->kiov_len) {
+ soffset -= siov->kiov_len;
+ siov++;
+ nsiov--;
+ LASSERT (nsiov > 0);
+ }
+
+ do {
+ LASSERT (ndiov > 0);
+ LASSERT (nsiov > 0);
+ this_nob = MIN(diov->kiov_len - doffset,
+ siov->kiov_len - soffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (daddr == NULL)
+ daddr = ((char *)kmap(diov->kiov_page)) +
+ diov->kiov_offset + doffset;
+ if (saddr == NULL)
+ saddr = ((char *)kmap(siov->kiov_page)) +
+ siov->kiov_offset + soffset;
+
+ /* Vanishing risk of kmap deadlock when mapping 2 pages.
+ * However in practice at least one of the kiovs will be mapped
+ * kernel pages and the map/unmap will be NOOPs */
+
+ memcpy (daddr, saddr, this_nob);
+ nob -= this_nob;
+
+ if (diov->kiov_len > doffset + this_nob) {
+ daddr += this_nob;
+ doffset += this_nob;
+ } else {
+ kunmap(diov->kiov_page);
+ daddr = NULL;
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->kiov_len > soffset + this_nob) {
+ saddr += this_nob;
+ soffset += this_nob;
+ } else {
+ kunmap(siov->kiov_page);
+ saddr = NULL;
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+
+ if (daddr != NULL)
+ kunmap(diov->kiov_page);
+ if (saddr != NULL)
+ kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+ unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+ unsigned int nob)
+{
+ /* NB iov, kiov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ LASSERT (nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+ }
+
+ do {
+ LASSERT (niov > 0);
+ LASSERT (nkiov > 0);
+ this_nob = MIN(iov->iov_len - iovoffset,
+ kiov->kiov_len - kiovoffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+ nob -= this_nob;
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+ unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+ unsigned int nob)
+{
+ /* NB kiov, iov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+ }
+
+ LASSERT (niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
+ LASSERT (nkiov > 0);
+ LASSERT (niov > 0);
+ this_nob = MIN(kiov->kiov_len - kiovoffset,
+ iov->iov_len - iovoffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+ nob -= this_nob;
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+ int src_niov, lnet_kiov_t *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return (0); /* no frags */
+
+ LASSERT (src_niov > 0);
+ while (offset >= src->kiov_len) { /* skip initial frags */
+ offset -= src->kiov_len;
+ src_niov--;
+ src++;
+ LASSERT (src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT (src_niov > 0);
+ LASSERT ((int)niov <= dst_niov);
+
+ frag_len = src->kiov_len - offset;
+ dst->kiov_page = src->kiov_page;
+ dst->kiov_offset = src->kiov_offset + offset;
+
+ if (len <= frag_len) {
+ dst->kiov_len = len;
+ LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+ return (niov);
+ }
+
+ dst->kiov_len = frag_len;
+ LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ unsigned int niov = 0;
+ struct iovec *iov = NULL;
+ lnet_kiov_t *kiov = NULL;
+ int rc;
+
+ LASSERT (!in_interrupt ());
+ LASSERT (mlen == 0 || msg != NULL);
+
+ if (msg != NULL) {
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_sending);
+ LASSERT(rlen == msg->msg_len);
+ LASSERT(mlen <= msg->msg_len);
+ LASSERT(msg->msg_offset == offset);
+ LASSERT(msg->msg_wanted == mlen);
+
+ msg->msg_receiving = 0;
+
+ if (mlen != 0) {
+ niov = msg->msg_niov;
+ iov = msg->msg_iov;
+ kiov = msg->msg_kiov;
+
+ LASSERT (niov > 0);
+ LASSERT ((iov == NULL) != (kiov == NULL));
+ }
+ }
+
+ rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+ niov, iov, kiov, offset, mlen, rlen);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+ lnet_libmd_t *md = msg->msg_md;
+
+ LASSERT (msg->msg_len > 0);
+ LASSERT (!msg->msg_routing);
+ LASSERT (md != NULL);
+ LASSERT (msg->msg_niov == 0);
+ LASSERT (msg->msg_iov == NULL);
+ LASSERT (msg->msg_kiov == NULL);
+
+ msg->msg_niov = md->md_niov;
+ if ((md->md_options & LNET_MD_KIOV) != 0)
+ msg->msg_kiov = md->md_iov.kiov;
+ else
+ msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+ unsigned int offset, unsigned int len)
+{
+ msg->msg_type = type;
+ msg->msg_target = target;
+ msg->msg_len = len;
+ msg->msg_offset = offset;
+
+ if (len != 0)
+ lnet_setpayloadbuffer(msg);
+
+ memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+ msg->msg_hdr.type = cpu_to_le32(type);
+ msg->msg_hdr.dest_nid = cpu_to_le64(target.nid);
+ msg->msg_hdr.dest_pid = cpu_to_le32(target.pid);
+ /* src_nid will be set later */
+ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid);
+ msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *priv = msg->msg_private;
+ int rc;
+
+ LASSERT (!in_interrupt ());
+ LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+ (msg->msg_txcredit && msg->msg_peertxcredit));
+
+ rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc;
+
+ LASSERT(!msg->msg_sending);
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_rx_ready_delay);
+ LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+ msg->msg_rx_ready_delay = 1;
+ rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+ &msg->msg_private);
+ if (rc != 0) {
+ CERROR("recv from %s / send to %s aborted: "
+ "eager_recv failed %d\n",
+ libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+ libcfs_id2str(msg->msg_target), rc);
+ LASSERT(rc < 0); /* required by my callers */
+ }
+
+ return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ cfs_time_t last_alive = 0;
+
+ LASSERT(lnet_peer_aliveness_enabled(lp));
+ LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+ lnet_net_unlock(lp->lp_cpt);
+ (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+ lnet_net_lock(lp->lp_cpt);
+
+ lp->lp_last_query = cfs_time_current();
+
+ if (last_alive != 0) /* NI has updated timestamp */
+ lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+ int alive;
+ cfs_time_t deadline;
+
+ LASSERT (lnet_peer_aliveness_enabled(lp));
+
+ /* Trust lnet_notify() if it has more recent aliveness news, but
+ * ignore the initial assumed death (see lnet_peers_start_down()).
+ */
+ if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+ cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+ return 0;
+
+ deadline = cfs_time_add(lp->lp_last_alive,
+ cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+ alive = cfs_time_after(deadline, now);
+
+ /* Update obsolete lp_alive except for routers assumed to be dead
+ * initially, because router checker would update aliveness in this
+ * case, and moreover lp_last_alive at peer creation is assumed.
+ */
+ if (alive && !lp->lp_alive &&
+ !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+ lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+ return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ * may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+ cfs_time_t now = cfs_time_current();
+
+ if (!lnet_peer_aliveness_enabled(lp))
+ return -ENODEV;
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ /* Peer appears dead, but we should avoid frequent NI queries (at
+ * most once per lnet_queryinterval seconds). */
+ if (lp->lp_last_query != 0) {
+ static const int lnet_queryinterval = 1;
+
+ cfs_time_t next_query =
+ cfs_time_add(lp->lp_last_query,
+ cfs_time_seconds(lnet_queryinterval));
+
+ if (cfs_time_before(now, next_query)) {
+ if (lp->lp_alive)
+ CWARN("Unexpected aliveness of peer %s: "
+ "%d < %d (%d/%d)\n",
+ libcfs_nid2str(lp->lp_nid),
+ (int)now, (int)next_query,
+ lnet_queryinterval,
+ lp->lp_ni->ni_peertimeout);
+ return 0;
+ }
+ }
+
+ /* query NI for latest aliveness news */
+ lnet_ni_query_locked(lp->lp_ni, lp);
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+ return 0;
+}
+
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+ /* lnet_send is going to lnet_net_unlock immediately after this,
+ * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+ * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+ * appears dead, and 0 if sent or OK to send */
+ struct lnet_peer *lp = msg->msg_txpeer;
+ struct lnet_ni *ni = lp->lp_ni;
+ struct lnet_tx_queue *tq;
+ int cpt;
+
+ /* non-lnet_send() callers have checked before */
+ LASSERT(!do_send || msg->msg_tx_delayed);
+ LASSERT(!msg->msg_receiving);
+ LASSERT(msg->msg_tx_committed);
+
+ cpt = msg->msg_tx_cpt;
+ tq = ni->ni_tx_queues[cpt];
+
+ /* NB 'lp' is always the next hop */
+ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+ lnet_peer_alive_locked(lp) == 0) {
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+ lnet_net_unlock(cpt);
+
+ CNETERR("Dropping message for %s: peer not alive\n",
+ libcfs_id2str(msg->msg_target));
+ if (do_send)
+ lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+ lnet_net_lock(cpt);
+ return EHOSTUNREACH;
+ }
+
+ if (!msg->msg_peertxcredit) {
+ LASSERT ((lp->lp_txcredits < 0) ==
+ !list_empty(&lp->lp_txq));
+
+ msg->msg_peertxcredit = 1;
+ lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+ lp->lp_txcredits--;
+
+ if (lp->lp_txcredits < lp->lp_mintxcredits)
+ lp->lp_mintxcredits = lp->lp_txcredits;
+
+ if (lp->lp_txcredits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_txq);
+ return EAGAIN;
+ }
+ }
+
+ if (!msg->msg_txcredit) {
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ msg->msg_txcredit = 1;
+ tq->tq_credits--;
+
+ if (tq->tq_credits < tq->tq_credits_min)
+ tq->tq_credits_min = tq->tq_credits;
+
+ if (tq->tq_credits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &tq->tq_delayed);
+ return EAGAIN;
+ }
+ }
+
+ if (do_send) {
+ lnet_net_unlock(cpt);
+ lnet_ni_send(ni, msg);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+ lnet_rtrbufpool_t *rbp;
+ int cpt;
+
+ LASSERT(msg->msg_rx_committed);
+
+ cpt = msg->msg_rx_cpt;
+ rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+ LASSERT(msg->msg_len <= LNET_MTU);
+ while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+ rbp++;
+ LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+ }
+
+ return rbp;
+}
+
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+ /* lnet_parse is going to lnet_net_unlock immediately after this, so it
+ * sets do_recv FALSE and I don't do the unlock/send/lock bit. I
+ * return EAGAIN if msg blocked and 0 if received or OK to receive */
+ lnet_peer_t *lp = msg->msg_rxpeer;
+ lnet_rtrbufpool_t *rbp;
+ lnet_rtrbuf_t *rb;
+
+ LASSERT (msg->msg_iov == NULL);
+ LASSERT (msg->msg_kiov == NULL);
+ LASSERT (msg->msg_niov == 0);
+ LASSERT (msg->msg_routing);
+ LASSERT (msg->msg_receiving);
+ LASSERT (!msg->msg_sending);
+
+ /* non-lnet_parse callers only receive delayed messages */
+ LASSERT(!do_recv || msg->msg_rx_delayed);
+
+ if (!msg->msg_peerrtrcredit) {
+ LASSERT ((lp->lp_rtrcredits < 0) ==
+ !list_empty(&lp->lp_rtrq));
+
+ msg->msg_peerrtrcredit = 1;
+ lp->lp_rtrcredits--;
+ if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+ lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+ if (lp->lp_rtrcredits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+ return EAGAIN;
+ }
+ }
+
+ rbp = lnet_msg2bufpool(msg);
+
+ if (!msg->msg_rtrcredit) {
+ LASSERT ((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+
+ msg->msg_rtrcredit = 1;
+ rbp->rbp_credits--;
+ if (rbp->rbp_credits < rbp->rbp_mincredits)
+ rbp->rbp_mincredits = rbp->rbp_credits;
+
+ if (rbp->rbp_credits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+ return EAGAIN;
+ }
+ }
+
+ LASSERT (!list_empty(&rbp->rbp_bufs));
+ rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+
+ msg->msg_niov = rbp->rbp_npages;
+ msg->msg_kiov = &rb->rb_kiov[0];
+
+ if (do_recv) {
+ int cpt = msg->msg_rx_cpt;
+
+ lnet_net_unlock(cpt);
+ lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+ 0, msg->msg_len, msg->msg_len);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *txpeer = msg->msg_txpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_txcredit) {
+ struct lnet_ni *ni = txpeer->lp_ni;
+ struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+ /* give back NI txcredits */
+ msg->msg_txcredit = 0;
+
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ tq->tq_credits++;
+ if (tq->tq_credits <= 0) {
+ msg2 = list_entry(tq->tq_delayed.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer->lp_ni == ni);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peertxcredit) {
+ /* give back peer txcredits */
+ msg->msg_peertxcredit = 0;
+
+ LASSERT((txpeer->lp_txcredits < 0) ==
+ !list_empty(&txpeer->lp_txq));
+
+ txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+ LASSERT (txpeer->lp_txqnob >= 0);
+
+ txpeer->lp_txcredits++;
+ if (txpeer->lp_txcredits <= 0) {
+ msg2 = list_entry(txpeer->lp_txq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer == txpeer);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (txpeer != NULL) {
+ msg->msg_txpeer = NULL;
+ lnet_peer_decref_locked(txpeer);
+ }
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *rxpeer = msg->msg_rxpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_rtrcredit) {
+ /* give back global router credits */
+ lnet_rtrbuf_t *rb;
+ lnet_rtrbufpool_t *rbp;
+
+ /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+ * there until it gets one allocated, or aborts the wait
+ * itself */
+ LASSERT (msg->msg_kiov != NULL);
+
+ rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+ rbp = rb->rb_pool;
+ LASSERT (rbp == lnet_msg2bufpool(msg));
+
+ msg->msg_kiov = NULL;
+ msg->msg_rtrcredit = 0;
+
+ LASSERT((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+ LASSERT((rbp->rbp_credits > 0) ==
+ !list_empty(&rbp->rbp_bufs));
+
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+ rbp->rbp_credits++;
+ if (rbp->rbp_credits <= 0) {
+ msg2 = list_entry(rbp->rbp_msgs.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peerrtrcredit) {
+ /* give back peer router credits */
+ msg->msg_peerrtrcredit = 0;
+
+ LASSERT((rxpeer->lp_rtrcredits < 0) ==
+ !list_empty(&rxpeer->lp_rtrq));
+
+ rxpeer->lp_rtrcredits++;
+ if (rxpeer->lp_rtrcredits <= 0) {
+ msg2 = list_entry(rxpeer->lp_rtrq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+ if (rxpeer != NULL) {
+ msg->msg_rxpeer = NULL;
+ lnet_peer_decref_locked(rxpeer);
+ }
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+ lnet_peer_t *p1 = r1->lr_gateway;
+ lnet_peer_t *p2 = r2->lr_gateway;
+
+ if (r1->lr_hops < r2->lr_hops)
+ return 1;
+
+ if (r1->lr_hops > r2->lr_hops)
+ return -1;
+
+ if (p1->lp_txqnob < p2->lp_txqnob)
+ return 1;
+
+ if (p1->lp_txqnob > p2->lp_txqnob)
+ return -1;
+
+ if (p1->lp_txcredits > p2->lp_txcredits)
+ return 1;
+
+ if (p1->lp_txcredits < p2->lp_txcredits)
+ return -1;
+
+ if (r1->lr_seq - r2->lr_seq <= 0)
+ return 1;
+
+ return -1;
+}
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *rtr;
+ lnet_route_t *rtr_best;
+ lnet_route_t *rtr_last;
+ struct lnet_peer *lp_best;
+ struct lnet_peer *lp;
+ int rc;
+
+ /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+ * rtr_nid nid, otherwise find the best gateway I can use */
+
+ rnet = lnet_find_net_locked(LNET_NIDNET(target));
+ if (rnet == NULL)
+ return NULL;
+
+ lp_best = NULL;
+ rtr_best = rtr_last = NULL;
+ list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+ lp = rtr->lr_gateway;
+
+ if (!lp->lp_alive || /* gateway is down */
+ ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+ rtr->lr_downis != 0)) /* NI to target is down */
+ continue;
+
+ if (ni != NULL && lp->lp_ni != ni)
+ continue;
+
+ if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+ return lp;
+
+ if (lp_best == NULL) {
+ rtr_best = rtr_last = rtr;
+ lp_best = lp;
+ continue;
+ }
+
+ /* no protection on below fields, but it's harmless */
+ if (rtr_last->lr_seq - rtr->lr_seq < 0)
+ rtr_last = rtr;
+
+ rc = lnet_compare_routes(rtr, rtr_best);
+ if (rc < 0)
+ continue;
+
+ rtr_best = rtr;
+ lp_best = lp;
+ }
+
+ /* set sequence number on the best router to the latest sequence + 1
+ * so we can round-robin all routers, it's race and inaccurate but
+ * harmless and functional */
+ if (rtr_best != NULL)
+ rtr_best->lr_seq = rtr_last->lr_seq + 1;
+ return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+ lnet_nid_t dst_nid = msg->msg_target.nid;
+ struct lnet_ni *src_ni;
+ struct lnet_ni *local_ni;
+ struct lnet_peer *lp;
+ int cpt;
+ int cpt2;
+ int rc;
+
+ /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+ * but we might want to use pre-determined router for ACK/REPLY
+ * in the future */
+ /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+ LASSERT (msg->msg_txpeer == NULL);
+ LASSERT (!msg->msg_sending);
+ LASSERT (!msg->msg_target_is_router);
+ LASSERT (!msg->msg_receiving);
+
+ msg->msg_sending = 1;
+
+ LASSERT(!msg->msg_tx_committed);
+ cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ if (src_nid == LNET_NID_ANY) {
+ src_ni = NULL;
+ } else {
+ src_ni = lnet_nid2ni_locked(src_nid, cpt);
+ if (src_ni == NULL) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Can't send to %s: src %s is not a "
+ "local nid\n", libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+ LASSERT (!msg->msg_routing);
+ }
+
+ /* Is this for someone on a local network? */
+ local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+ if (local_ni != NULL) {
+ if (src_ni == NULL) {
+ src_ni = local_ni;
+ src_nid = src_ni->ni_nid;
+ } else if (src_ni == local_ni) {
+ lnet_ni_decref_locked(local_ni, cpt);
+ } else {
+ lnet_ni_decref_locked(local_ni, cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("No route to %s via from %s\n",
+ libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing)
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+ if (src_ni == the_lnet.ln_loni) {
+ /* No send credit hassles with LOLND */
+ lnet_net_unlock(cpt);
+ lnet_ni_send(src_ni, msg);
+
+ lnet_net_lock(cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+
+ rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+ /* lp has ref on src_ni; lose mine */
+ lnet_ni_decref_locked(src_ni, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+ libcfs_nid2str(dst_nid));
+ /* ENOMEM or shutting down */
+ return rc;
+ }
+ LASSERT (lp->lp_ni == src_ni);
+ } else {
+ /* sending to a remote network */
+ lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+ if (lp == NULL) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ LCONSOLE_WARN("No route to %s via %s "
+ "(all routers down)\n",
+ libcfs_id2str(msg->msg_target),
+ libcfs_nid2str(src_nid));
+ return -EHOSTUNREACH;
+ }
+
+ /* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+ * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+ * pre-determined router, this can happen if router table
+ * was changed when we release the lock */
+ if (rtr_nid != lp->lp_nid) {
+ cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+ if (cpt2 != cpt) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ rtr_nid = lp->lp_nid;
+ cpt = cpt2;
+ goto again;
+ }
+ }
+
+ CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+ libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+ lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+ if (src_ni == NULL) {
+ src_ni = lp->lp_ni;
+ src_nid = src_ni->ni_nid;
+ } else {
+ LASSERT (src_ni == lp->lp_ni);
+ lnet_ni_decref_locked(src_ni, cpt);
+ }
+
+ lnet_peer_addref_locked(lp);
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing) {
+ /* I'm the source and now I know which NI to send on */
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+ }
+
+ msg->msg_target_is_router = 1;
+ msg->msg_target.nid = lp->lp_nid;
+ msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+ }
+
+ /* 'lp' is our best choice of peer */
+
+ LASSERT (!msg->msg_peertxcredit);
+ LASSERT (!msg->msg_txcredit);
+ LASSERT (msg->msg_txpeer == NULL);
+
+ msg->msg_txpeer = lp; /* msg takes my ref on lp */
+
+ rc = lnet_post_send_locked(msg, 0);
+ lnet_net_unlock(cpt);
+
+ if (rc == EHOSTUNREACH)
+ return -EHOSTUNREACH;
+
+ if (rc == 0)
+ lnet_ni_send(src_ni, msg);
+
+ return 0;
+}
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += nob;
+ lnet_net_unlock(cpt);
+
+ lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+
+ if (msg->msg_wanted != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+ /* Must I ACK? If so I'll grab the ack_wmd out of the header and put
+ * it back into the ACK during lnet_finalize() */
+ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+ (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+ msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ struct lnet_match_info info;
+ int rc;
+
+ /* Convert put fields to host byte order */
+ hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+ rc = lnet_ptl_match_md(&info, msg);
+ switch (rc) {
+ default:
+ LBUG();
+
+ case LNET_MATCHMD_OK:
+ lnet_recv_put(ni, msg);
+ return 0;
+
+ case LNET_MATCHMD_NONE:
+ if (msg->msg_rx_delayed) /* attached on delayed list */
+ return 0;
+
+ rc = lnet_ni_eager_recv(ni, msg);
+ if (rc == 0)
+ goto again;
+ /* fall through */
+
+ case LNET_MATCHMD_DROP:
+ CNETERR("Dropping PUT from %s portal %d match "LPU64
+ " offset %d length %d: %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+ return ENOENT; /* +ve: OK but no match */
+ }
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_handle_wire_t reply_wmd;
+ int rc;
+
+ /* Convert get fields to host byte order */
+ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
+ hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_GET;
+ info.mi_portal = hdr->msg.get.ptl_index;
+ info.mi_rlength = hdr->msg.get.sink_length;
+ info.mi_roffset = hdr->msg.get.src_offset;
+ info.mi_mbits = hdr->msg.get.match_bits;
+
+ rc = lnet_ptl_match_md(&info, msg);
+ if (rc == LNET_MATCHMD_DROP) {
+ CNETERR("Dropping GET from %s portal %d match "LPU64
+ " offset %d length %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT(rc == LNET_MATCHMD_OK);
+
+ lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+ reply_wmd = hdr->msg.get.return_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+ msg->msg_offset, msg->msg_wanted);
+
+ msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+ if (rdma_get) {
+ /* The LND completes the REPLY from her recv procedure */
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ msg->msg_offset, msg->msg_len, msg->msg_len);
+ return 0;
+ }
+
+ lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+ msg->msg_receiving = 0;
+
+ rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ /* didn't get as far as lnet_ni_send() */
+ CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+ libcfs_nid2str(ni->ni_nid),
+ libcfs_id2str(info.mi_id), rc);
+
+ lnet_finalize(ni, msg, rc);
+ }
+
+ return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *private = msg->msg_private;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int rlength;
+ int mlength;
+ int cpt;
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CNETERR("%s: Dropping REPLY from %s for %s "
+ "MD "LPX64"."LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT (md->md_offset == 0);
+
+ rlength = hdr->payload_length;
+ mlength = MIN(rlength, (int)md->md_length);
+
+ if (mlength < rlength &&
+ (md->md_options & LNET_MD_TRUNCATE) == 0) {
+ CNETERR("%s: Dropping REPLY from %s length %d "
+ "for MD "LPX64" would overflow (%d)\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+ mlength);
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, mlength);
+
+ if (mlength != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+ return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int cpt;
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* Convert ack fields to host byte order */
+ hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+ hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ /* Don't moan; this is expected */
+ CDEBUG(D_NET,
+ "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve! */
+ }
+
+ CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+ return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc = 0;
+
+ if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+ lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+ if (ni->ni_lnd->lnd_eager_recv == NULL) {
+ msg->msg_rx_ready_delay = 1;
+ } else {
+ lnet_net_unlock(msg->msg_rx_cpt);
+ rc = lnet_ni_eager_recv(ni, msg);
+ lnet_net_lock(msg->msg_rx_cpt);
+ }
+ }
+
+ if (rc == 0)
+ rc = lnet_post_routed_recv_locked(msg, 0);
+ return rc;
+}
+
+char *
+lnet_msgtyp2str (int type)
+{
+ switch (type) {
+ case LNET_MSG_ACK:
+ return ("ACK");
+ case LNET_MSG_PUT:
+ return ("PUT");
+ case LNET_MSG_GET:
+ return ("GET");
+ case LNET_MSG_REPLY:
+ return ("REPLY");
+ case LNET_MSG_HELLO:
+ return ("HELLO");
+ default:
+ return ("<UNKNOWN>");
+ }
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
+{
+ lnet_process_id_t src = {0};
+ lnet_process_id_t dst = {0};
+ char *type_str = lnet_msgtyp2str (hdr->type);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ dst.nid = hdr->dest_nid;
+ dst.pid = hdr->dest_pid;
+
+ CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+ CWARN(" From %s\n", libcfs_id2str(src));
+ CWARN(" To %s\n", libcfs_id2str(dst));
+
+ switch (hdr->type) {
+ default:
+ break;
+
+ case LNET_MSG_PUT:
+ CWARN(" Ptl index %d, ack md "LPX64"."LPX64", "
+ "match bits "LPU64"\n",
+ hdr->msg.put.ptl_index,
+ hdr->msg.put.ack_wmd.wh_interface_cookie,
+ hdr->msg.put.ack_wmd.wh_object_cookie,
+ hdr->msg.put.match_bits);
+ CWARN(" Length %d, offset %d, hdr data "LPX64"\n",
+ hdr->payload_length, hdr->msg.put.offset,
+ hdr->msg.put.hdr_data);
+ break;
+
+ case LNET_MSG_GET:
+ CWARN(" Ptl index %d, return md "LPX64"."LPX64", "
+ "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+ hdr->msg.get.return_wmd.wh_interface_cookie,
+ hdr->msg.get.return_wmd.wh_object_cookie,
+ hdr->msg.get.match_bits);
+ CWARN(" Length %d, src offset %d\n",
+ hdr->msg.get.sink_length,
+ hdr->msg.get.src_offset);
+ break;
+
+ case LNET_MSG_ACK:
+ CWARN(" dst md "LPX64"."LPX64", "
+ "manipulated length %d\n",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie,
+ hdr->msg.ack.mlength);
+ break;
+
+ case LNET_MSG_REPLY:
+ CWARN(" dst md "LPX64"."LPX64", "
+ "length %d\n",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie,
+ hdr->payload_length);
+ }
+
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+ void *private, int rdma_req)
+{
+ int rc = 0;
+ int cpt;
+ int for_me;
+ struct lnet_msg *msg;
+ lnet_pid_t dest_pid;
+ lnet_nid_t dest_nid;
+ lnet_nid_t src_nid;
+ __u32 payload_length;
+ __u32 type;
+
+ LASSERT (!in_interrupt ());
+
+ type = le32_to_cpu(hdr->type);
+ src_nid = le64_to_cpu(hdr->src_nid);
+ dest_nid = le64_to_cpu(hdr->dest_nid);
+ dest_pid = le32_to_cpu(hdr->dest_pid);
+ payload_length = le32_to_cpu(hdr->payload_length);
+
+ for_me = (ni->ni_nid == dest_nid);
+ cpt = lnet_cpt_of_nid(from_nid);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ case LNET_MSG_GET:
+ if (payload_length > 0) {
+ CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), payload_length);
+ return -EPROTO;
+ }
+ break;
+
+ case LNET_MSG_PUT:
+ case LNET_MSG_REPLY:
+ if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+ CERROR("%s, src %s: bad %s payload %d "
+ "(%d max expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type),
+ payload_length,
+ for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+ return -EPROTO;
+ }
+ break;
+
+ default:
+ CERROR("%s, src %s: Bad message type 0x%x\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid), type);
+ return -EPROTO;
+ }
+
+ if (the_lnet.ln_routing &&
+ ni->ni_last_alive != cfs_time_current_sec()) {
+ lnet_ni_lock(ni);
+
+ /* NB: so far here is the only place to set NI status to "up */
+ ni->ni_last_alive = cfs_time_current_sec();
+ if (ni->ni_status != NULL &&
+ ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+ ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+ lnet_ni_unlock(ni);
+ }
+
+ /* Regard a bad destination NID as a protocol error. Senders should
+ * know what they're doing; if they don't they're misconfigured, buggy
+ * or malicious so we chop them off at the knees :) */
+
+ if (!for_me) {
+ if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+ /* should have gone direct */
+ CERROR ("%s, src %s: Bad dest nid %s "
+ "(should have been sent direct)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (lnet_islocalnid(dest_nid)) {
+ /* dest is another local NI; sender should have used
+ * this node's NID on its own network */
+ CERROR ("%s, src %s: Bad dest nid %s "
+ "(it's my nid but on a different network)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (rdma_req && type == LNET_MSG_GET) {
+ CERROR ("%s, src %s: Bad optimized GET for %s "
+ "(final destination must be me)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (!the_lnet.ln_routing) {
+ CERROR ("%s, src %s: Dropping message for %s "
+ "(routing not enabled)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ goto drop;
+ }
+ }
+
+ /* Message looks OK; we're not going to return an error, so we MUST
+ * call back lnd_recv() come what may... */
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (src_nid, 0)) /* shall we now? */
+ {
+ CERROR("%s, src %s: Dropping %s to simulate failure\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("%s, src %s: Dropping %s (out of memory)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+ msg->msg_type = type;
+ msg->msg_private = private;
+ msg->msg_receiving = 1;
+ msg->msg_len = msg->msg_wanted = payload_length;
+ msg->msg_offset = 0;
+ msg->msg_hdr = *hdr;
+ /* for building message event */
+ msg->msg_from = from_nid;
+ if (!for_me) {
+ msg->msg_target.pid = dest_pid;
+ msg->msg_target.nid = dest_nid;
+ msg->msg_routing = 1;
+
+ } else {
+ /* convert common msg->hdr fields to host byteorder */
+ msg->msg_hdr.type = type;
+ msg->msg_hdr.src_nid = src_nid;
+ msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
+ msg->msg_hdr.dest_nid = dest_nid;
+ msg->msg_hdr.dest_pid = dest_pid;
+ msg->msg_hdr.payload_length = payload_length;
+ }
+
+ lnet_net_lock(cpt);
+ rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CERROR("%s, src %s: Dropping %s "
+ "(error %d looking up sender)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), rc);
+ lnet_msg_free(msg);
+ goto drop;
+ }
+
+ lnet_msg_commit(msg, cpt);
+
+ if (!for_me) {
+ rc = lnet_parse_forward_locked(ni, msg);
+ lnet_net_unlock(cpt);
+
+ if (rc < 0)
+ goto free_drop;
+ if (rc == 0) {
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ 0, payload_length, payload_length);
+ }
+ return 0;
+ }
+
+ lnet_net_unlock(cpt);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ rc = lnet_parse_ack(ni, msg);
+ break;
+ case LNET_MSG_PUT:
+ rc = lnet_parse_put(ni, msg);
+ break;
+ case LNET_MSG_GET:
+ rc = lnet_parse_get(ni, msg, rdma_req);
+ break;
+ case LNET_MSG_REPLY:
+ rc = lnet_parse_reply(ni, msg);
+ break;
+ default:
+ LASSERT(0);
+ rc = -EPROTO;
+ goto free_drop; /* prevent an unused label if !kernel */
+ }
+
+ if (rc == 0)
+ return 0;
+
+ LASSERT (rc == ENOENT);
+
+ free_drop:
+ LASSERT(msg->msg_md == NULL);
+ lnet_finalize(ni, msg, rc);
+
+ drop:
+ lnet_drop_message(ni, cpt, private, payload_length);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+ while (!list_empty(head)) {
+ lnet_process_id_t id = {0};
+ lnet_msg_t *msg;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_md == NULL);
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+ " offset %d length %d: %s\n",
+ libcfs_id2str(id),
+ msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length, reason);
+
+ /* NB I can't drop msg's ref on msg_rxpeer until after I've
+ * called lnet_drop_message(), so I just hang onto msg as well
+ * until that's done */
+
+ lnet_drop_message(msg->msg_rxpeer->lp_ni,
+ msg->msg_rxpeer->lp_cpt,
+ msg->msg_private, msg->msg_len);
+ /*
+ * NB: message will not generate event because w/o attached MD,
+ * but we still should give error code so lnet_msg_decommit()
+ * can skip counters operations and other checks.
+ */
+ lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+ }
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ lnet_msg_t *msg;
+ lnet_process_id_t id;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ /* md won't disappear under me, since each msg
+ * holds a ref on it */
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_md != NULL);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+ "match "LPU64" offset %d length %d.\n",
+ libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length);
+
+ lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+ }
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset,
+ __u64 hdr_data)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (target.nid, 1)) /* shall we now? */
+ {
+ CERROR("Dropping PUT to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+ msg->msg_vmflush = !!memory_pressure_get();
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+ msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+ /* NB handles only looked up by creator (no flips) */
+ if (ack == LNET_ACK_REQ) {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+ } else {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ }
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc != 0) {
+ CNETERR( "Error sending PUT to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize (NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+ /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This
+ * returns a msg for the LND to pass to lnet_finalize() when the sink
+ * data has been received.
+ *
+ * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+ * lnet_finalize() is called on it, so the LND must call this first */
+
+ struct lnet_msg *msg = lnet_msg_alloc();
+ struct lnet_libmd *getmd = getmsg->msg_md;
+ lnet_process_id_t peer_id = getmsg->msg_target;
+ int cpt;
+
+ LASSERT(!getmsg->msg_target_is_router);
+ LASSERT(!getmsg->msg_routing);
+
+ cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+ lnet_res_lock(cpt);
+
+ LASSERT (getmd->md_refcount > 0);
+
+ if (msg == NULL) {
+ CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+ goto drop;
+ }
+
+ if (getmd->md_threshold == 0) {
+ CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+ getmd);
+ lnet_res_unlock(cpt);
+ goto drop;
+ }
+
+ LASSERT(getmd->md_offset == 0);
+
+ CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+ /* setup information for lnet_build_msg_event */
+ msg->msg_from = peer_id.nid;
+ msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+ msg->msg_hdr.src_nid = peer_id.nid;
+ msg->msg_hdr.payload_length = getmd->md_length;
+ msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+ lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+ lnet_res_unlock(cpt);
+
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ lnet_msg_commit(msg, cpt);
+ lnet_net_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ return msg;
+
+ drop:
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+ lnet_net_unlock(cpt);
+
+ if (msg != NULL)
+ lnet_msg_free(msg);
+
+ return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+ /* Set the REPLY length, now the RDMA that elides the REPLY message has
+ * completed and I know it. */
+ LASSERT (reply != NULL);
+ LASSERT (reply->msg_type == LNET_MSG_GET);
+ LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+ /* NB I trusted my peer to RDMA. If she tells me she's written beyond
+ * the end of my buffer, I might as well be dead. */
+ LASSERT (len <= reply->msg_ev.mlength);
+
+ reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (target.nid, 1)) /* shall we now? */
+ {
+ CERROR("Dropping GET to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+ msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+ /* NB handles only looked up by creator (no flips) */
+ msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ CNETERR( "Error sending GET to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize (NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+ struct list_head *e;
+ struct lnet_ni *ni;
+ lnet_remotenet_t *rnet;
+ __u32 dstnet = LNET_NIDNET(dstnid);
+ int hops;
+ int cpt;
+ __u32 order = 2;
+ struct list_head *rn_list;
+
+ /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+ * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+ * keep order 0 free for 0@lo and order 1 free for a local NID
+ * match */
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each (e, &the_lnet.ln_nis) {
+ ni = list_entry(e, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == dstnid) {
+ if (srcnidp != NULL)
+ *srcnidp = dstnid;
+ if (orderp != NULL) {
+ if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+ *orderp = 0;
+ else
+ *orderp = 1;
+ }
+ lnet_net_unlock(cpt);
+
+ return local_nid_dist_zero ? 0 : 1;
+ }
+
+ if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+ if (srcnidp != NULL)
+ *srcnidp = ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return 1;
+ }
+
+ order++;
+ }
+
+ rn_list = lnet_net2rnethash(dstnet);
+ list_for_each(e, rn_list) {
+ rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == dstnet) {
+ lnet_route_t *route;
+ lnet_route_t *shortest = NULL;
+
+ LASSERT (!list_empty(&rnet->lrn_routes));
+
+ list_for_each_entry(route, &rnet->lrn_routes,
+ lr_list) {
+ if (shortest == NULL ||
+ route->lr_hops < shortest->lr_hops)
+ shortest = route;
+ }
+
+ LASSERT (shortest != NULL);
+ hops = shortest->lr_hops;
+ if (srcnidp != NULL)
+ *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return hops + 1;
+ }
+ order++;
+ }
+
+ lnet_net_unlock(cpt);
+ return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when user process is not running in
+ * LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 000000000000..8f3a50bd5f69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev)
+{
+ ENTRY;
+
+ memset(ev, 0, sizeof(*ev));
+
+ ev->status = 0;
+ ev->unlinked = 1;
+ ev->type = LNET_EVENT_UNLINK;
+ lnet_md_deconstruct(md, &ev->md);
+ lnet_md2handle(&ev->md_handle, md);
+ EXIT;
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_routing);
+
+ ev->type = ev_type;
+
+ if (ev_type == LNET_EVENT_SEND) {
+ /* event for active message */
+ ev->target.nid = le64_to_cpu(hdr->dest_nid);
+ ev->target.pid = le32_to_cpu(hdr->dest_pid);
+ ev->initiator.nid = LNET_NID_ANY;
+ ev->initiator.pid = the_lnet.ln_pid;
+ ev->sender = LNET_NID_ANY;
+
+ } else {
+ /* event for passive message */
+ ev->target.pid = hdr->dest_pid;
+ ev->target.nid = hdr->dest_nid;
+ ev->initiator.pid = hdr->src_pid;
+ ev->initiator.nid = hdr->src_nid;
+ ev->rlength = hdr->payload_length;
+ ev->sender = msg->msg_from;
+ ev->mlength = msg->msg_wanted;
+ ev->offset = msg->msg_offset;
+ }
+
+ switch (ev_type) {
+ default:
+ LBUG();
+
+ case LNET_EVENT_PUT: /* passive PUT */
+ ev->pt_index = hdr->msg.put.ptl_index;
+ ev->match_bits = hdr->msg.put.match_bits;
+ ev->hdr_data = hdr->msg.put.hdr_data;
+ return;
+
+ case LNET_EVENT_GET: /* passive GET */
+ ev->pt_index = hdr->msg.get.ptl_index;
+ ev->match_bits = hdr->msg.get.match_bits;
+ ev->hdr_data = 0;
+ return;
+
+ case LNET_EVENT_ACK: /* ACK */
+ ev->match_bits = hdr->msg.ack.match_bits;
+ ev->mlength = hdr->msg.ack.mlength;
+ return;
+
+ case LNET_EVENT_REPLY: /* REPLY */
+ return;
+
+ case LNET_EVENT_SEND: /* active message */
+ if (msg->msg_type == LNET_MSG_PUT) {
+ ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ ev->offset = le32_to_cpu(hdr->msg.put.offset);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->payload_length);
+ ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data);
+
+ } else {
+ LASSERT(msg->msg_type == LNET_MSG_GET);
+ ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->msg.get.sink_length);
+ ev->offset = le32_to_cpu(hdr->msg.get.src_offset);
+ ev->hdr_data = 0;
+ }
+ return;
+ }
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+ struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+ lnet_counters_t *counters = the_lnet.ln_counters[cpt];
+
+ /* routed message can be committed for both receiving and sending */
+ LASSERT(!msg->msg_tx_committed);
+
+ if (msg->msg_sending) {
+ LASSERT(!msg->msg_receiving);
+
+ msg->msg_tx_cpt = cpt;
+ msg->msg_tx_committed = 1;
+ if (msg->msg_rx_committed) { /* routed message REPLY */
+ LASSERT(msg->msg_onactivelist);
+ return;
+ }
+ } else {
+ LASSERT(!msg->msg_sending);
+ msg->msg_rx_cpt = cpt;
+ msg->msg_rx_committed = 1;
+ }
+
+ LASSERT(!msg->msg_onactivelist);
+ msg->msg_onactivelist = 1;
+ list_add(&msg->msg_activelist, &container->msc_active);
+
+ counters->msgs_alloc++;
+ if (counters->msgs_alloc > counters->msgs_max)
+ counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(msg->msg_tx_committed);
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+ switch (ev->type) {
+ default: /* routed message */
+ LASSERT(msg->msg_routing);
+ LASSERT(msg->msg_rx_committed);
+ LASSERT(ev->type == 0);
+
+ counters->route_length += msg->msg_len;
+ counters->route_count++;
+ goto out;
+
+ case LNET_EVENT_PUT:
+ /* should have been decommitted */
+ LASSERT(!msg->msg_rx_committed);
+ /* overwritten while sending ACK */
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ msg->msg_type = LNET_MSG_PUT; /* fix type */
+ break;
+
+ case LNET_EVENT_SEND:
+ LASSERT(!msg->msg_rx_committed);
+ if (msg->msg_type == LNET_MSG_PUT)
+ counters->send_length += msg->msg_len;
+ break;
+
+ case LNET_EVENT_GET:
+ LASSERT(msg->msg_rx_committed);
+ /* overwritten while sending reply, we should never be
+ * here for optimized GET */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY);
+ msg->msg_type = LNET_MSG_GET; /* fix type */
+ break;
+ }
+
+ counters->send_count++;
+ out:
+ lnet_return_tx_credits_locked(msg);
+ msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+ LASSERT(msg->msg_rx_committed);
+
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+ switch (ev->type) {
+ default:
+ LASSERT(ev->type == 0);
+ LASSERT(msg->msg_routing);
+ goto out;
+
+ case LNET_EVENT_ACK:
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ break;
+
+ case LNET_EVENT_GET:
+ /* type is "REPLY" if it's an optimized GET on passive side,
+ * because optimized GET will never be committed for sending,
+ * so message type wouldn't be changed back to "GET" by
+ * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+ msg->msg_type == LNET_MSG_GET);
+ counters->send_length += msg->msg_wanted;
+ break;
+
+ case LNET_EVENT_PUT:
+ LASSERT(msg->msg_type == LNET_MSG_PUT);
+ break;
+
+ case LNET_EVENT_REPLY:
+ /* type is "GET" if it's an optimized GET on active side,
+ * see details in lnet_create_reply_msg() */
+ LASSERT(msg->msg_type == LNET_MSG_GET ||
+ msg->msg_type == LNET_MSG_REPLY);
+ break;
+ }
+
+ counters->recv_count++;
+ if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+ counters->recv_length += msg->msg_wanted;
+
+ out:
+ lnet_return_rx_credits_locked(msg);
+ msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+ int cpt2 = cpt;
+
+ LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+ LASSERT(msg->msg_onactivelist);
+
+ if (msg->msg_tx_committed) { /* always decommit for sending first */
+ LASSERT(cpt == msg->msg_tx_cpt);
+ lnet_msg_decommit_tx(msg, status);
+ }
+
+ if (msg->msg_rx_committed) {
+ /* forwarding msg committed for both receiving and sending */
+ if (cpt != msg->msg_rx_cpt) {
+ lnet_net_unlock(cpt);
+ cpt2 = msg->msg_rx_cpt;
+ lnet_net_lock(cpt2);
+ }
+ lnet_msg_decommit_rx(msg, status);
+ }
+
+ list_del(&msg->msg_activelist);
+ msg->msg_onactivelist = 0;
+
+ the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+ if (cpt2 != cpt) {
+ lnet_net_unlock(cpt2);
+ lnet_net_lock(cpt);
+ }
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+ unsigned int offset, unsigned int mlen)
+{
+ /* NB: @offset and @len are only useful for receiving */
+ /* Here, we attach the MD on lnet_msg and mark it busy and
+ * decrementing its threshold. Come what may, the lnet_msg "owns"
+ * the MD until a call to lnet_msg_detach_md or lnet_finalize()
+ * signals completion. */
+ LASSERT(!msg->msg_routing);
+
+ msg->msg_md = md;
+ if (msg->msg_receiving) { /* commited for receiving */
+ msg->msg_offset = offset;
+ msg->msg_wanted = mlen;
+ }
+
+ md->md_refcount++;
+ if (md->md_threshold != LNET_MD_THRESH_INF) {
+ LASSERT(md->md_threshold > 0);
+ md->md_threshold--;
+ }
+
+ /* build umd in event */
+ lnet_md2handle(&msg->msg_ev.md_handle, md);
+ lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+ lnet_libmd_t *md = msg->msg_md;
+ int unlink;
+
+ /* Now it's safe to drop my caller's ref */
+ md->md_refcount--;
+ LASSERT(md->md_refcount >= 0);
+
+ unlink = lnet_md_unlinkable(md);
+ if (md->md_eq != NULL) {
+ msg->msg_ev.status = status;
+ msg->msg_ev.unlinked = unlink;
+ lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+ }
+
+ if (unlink)
+ lnet_md_unlink(md);
+
+ msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+ lnet_handle_wire_t ack_wmd;
+ int rc;
+ int status = msg->msg_ev.status;
+
+ LASSERT (msg->msg_onactivelist);
+
+ if (status == 0 && msg->msg_ack) {
+ /* Only send an ACK if the PUT completed successfully */
+
+ lnet_msg_decommit(msg, cpt, 0);
+
+ msg->msg_ack = 0;
+ lnet_net_unlock(cpt);
+
+ LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+ LASSERT(!msg->msg_routing);
+
+ ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+ msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+ msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+ msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+ /* NB: we probably want to use NID of msg::msg_from as 3rd
+ * parameter (router NID) if it's routed message */
+ rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is commited for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either
+ * because CPT for sending can be different with CPT for
+ * receiving, so we should return back to lnet_finalize()
+ * to make sure we are locking the correct partition.
+ */
+ return rc;
+
+ } else if (status == 0 && /* OK so far */
+ (msg->msg_routing && !msg->msg_sending)) {
+ /* not forwarded */
+ LASSERT(!msg->msg_receiving); /* called back recv already */
+ lnet_net_unlock(cpt);
+
+ rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is commited for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either:
+ * - The rule is message must decommit for sending first if
+ * the it's committed for both sending and receiving
+ * - CPT for sending can be different with CPT for receiving,
+ * so we should return back to lnet_finalize() to make
+ * sure we are locking the correct partition.
+ */
+ return rc;
+ }
+
+ lnet_msg_decommit(msg, cpt, status);
+ lnet_msg_free_locked(msg);
+ return 0;
+}
+
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+ struct lnet_msg_container *container;
+ int my_slot;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT (!in_interrupt ());
+
+ if (msg == NULL)
+ return;
+#if 0
+ CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+ lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+ msg->msg_target_is_router ? "t" : "",
+ msg->msg_routing ? "X" : "",
+ msg->msg_ack ? "A" : "",
+ msg->msg_sending ? "S" : "",
+ msg->msg_receiving ? "R" : "",
+ msg->msg_delayed ? "d" : "",
+ msg->msg_txcredit ? "C" : "",
+ msg->msg_peertxcredit ? "c" : "",
+ msg->msg_rtrcredit ? "F" : "",
+ msg->msg_peerrtrcredit ? "f" : "",
+ msg->msg_onactivelist ? "!" : "",
+ msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+ msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+ msg->msg_ev.status = status;
+
+ if (msg->msg_md != NULL) {
+ cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+ lnet_res_lock(cpt);
+ lnet_msg_detach_md(msg, status);
+ lnet_res_unlock(cpt);
+ }
+
+ again:
+ rc = 0;
+ if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+ /* not commited to network yet */
+ LASSERT(!msg->msg_onactivelist);
+ lnet_msg_free(msg);
+ return;
+ }
+
+ /*
+ * NB: routed message can be commited for both receiving and sending,
+ * we should finalize in LIFO order and keep counters correct.
+ * (finalize sending first then finalize receiving)
+ */
+ cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+ lnet_net_lock(cpt);
+
+ container = the_lnet.ln_msg_containers[cpt];
+ list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+ /* Recursion breaker. Don't complete the message here if I am (or
+ * enough other threads are) already completing messages */
+
+ my_slot = -1;
+ for (i = 0; i < container->msc_nfinalizers; i++) {
+ if (container->msc_finalizers[i] == current)
+ break;
+
+ if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+ my_slot = i;
+ }
+
+ if (i < container->msc_nfinalizers || my_slot < 0) {
+ lnet_net_unlock(cpt);
+ return;
+ }
+
+ container->msc_finalizers[my_slot] = current;
+
+ while (!list_empty(&container->msc_finalizing)) {
+ msg = list_entry(container->msc_finalizing.next,
+ lnet_msg_t, msg_list);
+
+ list_del(&msg->msg_list);
+
+ /* NB drops and regains the lnet lock if it actually does
+ * anything, so my finalizing friends can chomp along too */
+ rc = lnet_complete_msg_locked(msg, cpt);
+ if (rc != 0)
+ break;
+ }
+
+ container->msc_finalizers[my_slot] = NULL;
+ lnet_net_unlock(cpt);
+
+ if (rc != 0)
+ goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+ int count = 0;
+
+ if (container->msc_init == 0)
+ return;
+
+ while (!list_empty(&container->msc_active)) {
+ lnet_msg_t *msg = list_entry(container->msc_active.next,
+ lnet_msg_t, msg_activelist);
+
+ LASSERT(msg->msg_onactivelist);
+ msg->msg_onactivelist = 0;
+ list_del(&msg->msg_activelist);
+ lnet_msg_free(msg);
+ count++;
+ }
+
+ if (count > 0)
+ CERROR("%d active msg on exit\n", count);
+
+ if (container->msc_finalizers != NULL) {
+ LIBCFS_FREE(container->msc_finalizers,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+ container->msc_finalizers = NULL;
+ }
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&container->msc_freelist);
+#endif
+ container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+ int rc;
+
+ container->msc_init = 1;
+
+ INIT_LIST_HEAD(&container->msc_active);
+ INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+ rc = lnet_freelist_init(&container->msc_freelist,
+ LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+ if (rc != 0) {
+ CERROR("Failed to init freelist for message container\n");
+ lnet_msg_container_cleanup(container);
+ return rc;
+ }
+#else
+ rc = 0;
+#endif
+ /* number of CPUs */
+ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+ LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+
+ if (container->msc_finalizers == NULL) {
+ CERROR("Failed to allocate message finalizers\n");
+ lnet_msg_container_cleanup(container);
+ return -ENOMEM;
+ }
+
+ return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+ struct lnet_msg_container *container;
+ int i;
+
+ if (the_lnet.ln_msg_containers == NULL)
+ return;
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+ lnet_msg_container_cleanup(container);
+
+ cfs_percpt_free(the_lnet.ln_msg_containers);
+ the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+ struct lnet_msg_container *container;
+ int rc;
+ int i;
+
+ the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*container));
+
+ if (the_lnet.ln_msg_containers == NULL) {
+ CERROR("Failed to allocate cpu-partition data for network\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+ rc = lnet_msg_container_setup(container, i);
+ if (rc != 0) {
+ lnet_msg_containers_destroy();
+ return rc;
+ }
+ }
+
+ return 0;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 000000000000..9b9e7d3139b0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+int portal_rotor = LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+ "redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+ __u64 mbits, __u64 ignore_bits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[index];
+ int unique;
+
+ unique = ignore_bits == 0 &&
+ match_id.nid != LNET_NID_ANY &&
+ match_id.pid != LNET_PID_ANY;
+
+ LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+ /* prefer to check w/o any lock */
+ if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+ goto match;
+
+ /* unset, new portal */
+ lnet_ptl_lock(ptl);
+ /* check again with lock */
+ if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+ lnet_ptl_unlock(ptl);
+ goto match;
+ }
+
+ /* still not set */
+ if (unique)
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+ else
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+ lnet_ptl_unlock(ptl);
+
+ return 1;
+
+ match:
+ if ((lnet_ptl_is_unique(ptl) && !unique) ||
+ (lnet_ptl_is_wildcard(ptl) && unique))
+ return 0;
+ return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ mtable->mt_enabled = 1;
+
+ ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+ for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+ LASSERT(ptl->ptl_mt_maps[i] != cpt);
+ if (ptl->ptl_mt_maps[i] < cpt)
+ break;
+
+ /* swap to order */
+ ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+ ptl->ptl_mt_maps[i] = cpt;
+ }
+
+ ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ if (LNET_CPT_NUMBER == 1)
+ return; /* never disable the only match-table */
+
+ mtable->mt_enabled = 0;
+
+ LASSERT(ptl->ptl_mt_nmaps > 0 &&
+ ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+ /* remove it from mt_maps */
+ ptl->ptl_mt_nmaps--;
+ for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+ if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+ ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+ }
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+ * lnet_match_blocked_msg() relies on this to avoid races */
+ unsigned int offset;
+ unsigned int mlength;
+ lnet_me_t *me = md->md_me;
+
+ /* MD exhausted */
+ if (lnet_md_exhausted(md))
+ return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+ /* mismatched MD op */
+ if ((md->md_options & info->mi_opc) == 0)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME nid/pid? */
+ if (me->me_match_id.nid != LNET_NID_ANY &&
+ me->me_match_id.nid != info->mi_id.nid)
+ return LNET_MATCHMD_NONE;
+
+ if (me->me_match_id.pid != LNET_PID_ANY &&
+ me->me_match_id.pid != info->mi_id.pid)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME matchbits? */
+ if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+ return LNET_MATCHMD_NONE;
+
+ /* Hurrah! This _is_ a match; check it out... */
+
+ if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+ offset = md->md_offset;
+ else
+ offset = info->mi_roffset;
+
+ if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+ mlength = md->md_max_size;
+ LASSERT(md->md_offset + mlength <= md->md_length);
+ } else {
+ mlength = md->md_length - offset;
+ }
+
+ if (info->mi_rlength <= mlength) { /* fits in allowed space */
+ mlength = info->mi_rlength;
+ } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+ /* this packet _really_ is too big */
+ CERROR("Matching packet from %s, match "LPU64
+ " length %d too big: %d left, %d allowed\n",
+ libcfs_id2str(info->mi_id), info->mi_mbits,
+ info->mi_rlength, md->md_length - offset, mlength);
+
+ return LNET_MATCHMD_DROP;
+ }
+
+ /* Commit to this ME/MD */
+ CDEBUG(D_NET, "Incoming %s index %x from %s of "
+ "length %d/%d into md "LPX64" [%d] + %d\n",
+ (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+ info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+ info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+ lnet_msg_attach_md(msg, md, offset, mlength);
+ md->md_offset = offset + mlength;
+
+ if (!lnet_md_exhausted(md))
+ return LNET_MATCHMD_OK;
+
+ /* Auto-unlink NOW, so the ME gets unlinked if required.
+ * We bumped md->md_refcount above so the MD just gets flagged
+ * for unlink when it is finalized. */
+ if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+ lnet_md_unlink(md);
+
+ return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+ if (LNET_CPT_NUMBER == 1)
+ return ptl->ptl_mtables[0]; /* the only one */
+
+ /* if it's a unique portal, return match-table hashed by NID */
+ return lnet_ptl_is_unique(ptl) ?
+ ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+ __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+ struct lnet_portal *ptl;
+ struct lnet_match_table *mtable;
+
+ /* NB: called w/o lock */
+ LASSERT(index < the_lnet.ln_nportals);
+
+ if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+ return NULL;
+
+ ptl = the_lnet.ln_portals[index];
+
+ mtable = lnet_match2mt(ptl, id, mbits);
+ if (mtable != NULL) /* unique portal or only one match-table */
+ return mtable;
+
+ /* it's a wildcard portal */
+ switch (pos) {
+ default:
+ return NULL;
+ case LNET_INS_BEFORE:
+ case LNET_INS_AFTER:
+ /* posted by no affinity thread, always hash to specific
+ * match-table to avoid buffer stealing which is heavy */
+ return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+ case LNET_INS_LOCAL:
+ /* posted by cpu-affinity thread */
+ return ptl->ptl_mtables[lnet_cpt_current()];
+ }
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ int nmaps;
+ int rotor;
+ int routed;
+ int cpt;
+
+ /* NB: called w/o lock */
+ LASSERT(info->mi_portal < the_lnet.ln_nportals);
+ ptl = the_lnet.ln_portals[info->mi_portal];
+
+ LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+ mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+ if (mtable != NULL)
+ return mtable;
+
+ /* it's a wildcard portal */
+ routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+ LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+ if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+ (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+ cpt = lnet_cpt_current();
+ if (ptl->ptl_mtables[cpt]->mt_enabled)
+ return ptl->ptl_mtables[cpt];
+ }
+
+ rotor = ptl->ptl_rotor++; /* get round-robin factor */
+ if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+ cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+ else
+ cpt = rotor % LNET_CPT_NUMBER;
+
+ if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+ /* is there any active entry for this portal? */
+ nmaps = ptl->ptl_mt_nmaps;
+ /* map to an active mtable to avoid heavy "stealing" */
+ if (nmaps != 0) {
+ /* NB: there is possibility that ptl_mt_maps is being
+ * changed because we are not under protection of
+ * lnet_ptl_lock, but it shouldn't hurt anything */
+ cpt = ptl->ptl_mt_maps[rotor % nmaps];
+ }
+ }
+
+ return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+ __u64 *bmap;
+ int i;
+
+ if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ return 0;
+
+ if (pos < 0) { /* check all bits */
+ for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+ if (mtable->mt_exhausted[i] != (__u64)(-1))
+ return 0;
+ }
+ return 1;
+ }
+
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+ /* mtable::mt_mhash[pos] is marked as exhausted or not */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+ __u64 *bmap;
+
+ LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+ /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ if (!exhausted)
+ *bmap &= ~(1ULL << pos);
+ else
+ *bmap |= 1ULL << pos;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+ lnet_process_id_t id, __u64 mbits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+ if (lnet_ptl_is_wildcard(ptl)) {
+ return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+ } else {
+ unsigned long hash = mbits + id.nid + id.pid;
+
+ LASSERT(lnet_ptl_is_unique(ptl));
+ hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+ return &mtable->mt_mhash[hash];
+ }
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct list_head *head;
+ lnet_me_t *me;
+ lnet_me_t *tmp;
+ int exhausted = 0;
+ int rc;
+
+ /* any ME with ignore bits? */
+ if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+ /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+ if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ exhausted = LNET_MATCHMD_EXHAUSTED;
+
+ list_for_each_entry_safe(me, tmp, head, me_list) {
+ /* ME attached but MD not attached yet */
+ if (me->me_md == NULL)
+ continue;
+
+ LASSERT(me == me->me_md->md_me);
+
+ rc = lnet_try_match_md(me->me_md, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+ exhausted = 0; /* mlist is not empty */
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) {
+ /* don't return EXHAUSTED bit because we don't know
+ * whether the mlist is empty or not */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+ }
+ }
+
+ if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+ lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+ if (!lnet_mt_test_exhausted(mtable, -1))
+ exhausted = 0;
+ }
+
+ if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ goto again; /* re-check MEs w/o ignore-bits */
+ }
+
+ if (info->mi_opc == LNET_MD_OP_GET ||
+ !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+ return LNET_MATCHMD_DROP | exhausted;
+
+ return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+ int rc;
+
+ /* message arrived before any buffer posting on this portal,
+ * simply delay or drop this message */
+ if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+ return 0;
+
+ lnet_ptl_lock(ptl);
+ /* check it again with hold of lock */
+ if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+ lnet_ptl_unlock(ptl);
+ return 0;
+ }
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ if (msg->msg_rx_ready_delay) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ }
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+
+ lnet_ptl_unlock(ptl);
+ return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ int first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+ int rc = 0;
+ int i;
+
+ /* steal buffer from other CPTs, and delay it if nothing to steal,
+ * this function is more expensive than a regular match, but we
+ * don't expect it can happen a lot */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ for (i = 0; i < LNET_CPT_NUMBER; i++) {
+ struct lnet_match_table *mtable;
+ int cpt;
+
+ cpt = (first + i) % LNET_CPT_NUMBER;
+ mtable = ptl->ptl_mtables[cpt];
+ if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+ continue;
+
+ lnet_res_lock(cpt);
+ lnet_ptl_lock(ptl);
+
+ if (i == 0) { /* the first try, attach on stealing list */
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_stealing);
+ }
+
+ if (!list_empty(&msg->msg_list)) { /* on stealing list */
+ rc = lnet_mt_match_md(mtable, info, msg);
+
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+ mtable->mt_enabled)
+ lnet_ptl_disable_mt(ptl, cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0)
+ list_del_init(&msg->msg_list);
+
+ } else {
+ /* could be matched by lnet_ptl_attach_md()
+ * which is called by another thread */
+ rc = msg->msg_md == NULL ?
+ LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+ }
+
+ if (!list_empty(&msg->msg_list) && /* not matched yet */
+ (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+ ptl->ptl_mt_nmaps == 0 || /* no active CPT */
+ (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */
+ ptl->ptl_mt_maps[0] == cpt))) {
+ /* nothing to steal, delay or drop */
+ list_del_init(&msg->msg_list);
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+ }
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+ break;
+ }
+
+ return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ int rc;
+
+ CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+ "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+ info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+ if (info->mi_portal >= the_lnet.ln_nportals) {
+ CERROR("Invalid portal %d not in [0-%d]\n",
+ info->mi_portal, the_lnet.ln_nportals);
+ return LNET_MATCHMD_DROP;
+ }
+
+ ptl = the_lnet.ln_portals[info->mi_portal];
+ rc = lnet_ptl_match_early(ptl, msg);
+ if (rc != 0) /* matched or delayed early message */
+ return rc;
+
+ mtable = lnet_mt_of_match(info, msg);
+ lnet_res_lock(mtable->mt_cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = LNET_MATCHMD_DROP;
+ goto out1;
+ }
+
+ rc = lnet_mt_match_md(mtable, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+ lnet_ptl_lock(ptl);
+ lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+ lnet_ptl_unlock(ptl);
+ }
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */
+ goto out1;
+
+ if (!msg->msg_rx_ready_delay)
+ goto out1;
+
+ LASSERT(lnet_ptl_is_lazy(ptl));
+ LASSERT(!msg->msg_rx_delayed);
+
+ /* NB: we don't expect "delay" can happen a lot */
+ if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+ lnet_ptl_lock(ptl);
+
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(mtable->mt_cpt);
+
+ } else {
+ lnet_res_unlock(mtable->mt_cpt);
+ rc = lnet_ptl_match_delay(ptl, info, msg);
+ }
+
+ if (msg->msg_rx_delayed) {
+ CDEBUG(D_NET,
+ "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+ info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+ libcfs_id2str(info->mi_id), info->mi_portal,
+ info->mi_mbits, info->mi_roffset, info->mi_rlength);
+ }
+ goto out0;
+ out1:
+ lnet_res_unlock(mtable->mt_cpt);
+ out0:
+ /* EXHAUSTED bit is only meaningful for internal functions */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+ LASSERT(me->me_md == md && md->md_me == me);
+
+ me->me_md = NULL;
+ md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+ struct list_head *matches, struct list_head *drops)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal];
+ struct lnet_match_table *mtable;
+ struct list_head *head;
+ lnet_msg_t *tmp;
+ lnet_msg_t *msg;
+ int exhausted = 0;
+ int cpt;
+
+ LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+ me->me_md = md;
+ md->md_me = me;
+
+ cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+ mtable = ptl->ptl_mtables[cpt];
+
+ if (list_empty(&ptl->ptl_msg_stealing) &&
+ list_empty(&ptl->ptl_msg_delayed) &&
+ !lnet_mt_test_exhausted(mtable, me->me_pos))
+ return;
+
+ lnet_ptl_lock(ptl);
+ head = &ptl->ptl_msg_stealing;
+ again:
+ list_for_each_entry_safe(msg, tmp, head, msg_list) {
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr;
+ int rc;
+
+ LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+ hdr = &msg->msg_hdr;
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ rc = lnet_try_match_md(md, &info, msg);
+
+ exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+ if ((rc & LNET_MATCHMD_NONE) != 0) {
+ if (exhausted)
+ break;
+ continue;
+ }
+
+ /* Hurrah! This _is_ a match */
+ LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+ list_del_init(&msg->msg_list);
+
+ if (head == &ptl->ptl_msg_stealing) {
+ if (exhausted)
+ break;
+ /* stealing thread will handle the message */
+ continue;
+ }
+
+ if ((rc & LNET_MATCHMD_OK) != 0) {
+ list_add_tail(&msg->msg_list, matches);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+ "match "LPU64" offset %d length %d.\n",
+ libcfs_id2str(info.mi_id),
+ info.mi_portal, info.mi_mbits,
+ info.mi_roffset, info.mi_rlength);
+ } else {
+ list_add_tail(&msg->msg_list, drops);
+ }
+
+ if (exhausted)
+ break;
+ }
+
+ if (!exhausted && head == &ptl->ptl_msg_stealing) {
+ head = &ptl->ptl_msg_delayed;
+ goto again;
+ }
+
+ if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+ lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+ if (!mtable->mt_enabled)
+ lnet_ptl_enable_mt(ptl, cpt);
+ }
+
+ lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+ struct lnet_match_table *mtable;
+ int i;
+
+ if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+ return;
+
+ LASSERT(list_empty(&ptl->ptl_msg_delayed));
+ LASSERT(list_empty(&ptl->ptl_msg_stealing));
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ struct list_head *mhash;
+ lnet_me_t *me;
+ int j;
+
+ if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+ continue;
+
+ mhash = mtable->mt_mhash;
+ /* cleanup ME */
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+ while (!list_empty(&mhash[j])) {
+ me = list_entry(mhash[j].next,
+ lnet_me_t, me_list);
+ CERROR("Active ME %p on exit\n", me);
+ list_del(&me->me_list);
+ lnet_me_free(me);
+ }
+ }
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ }
+
+ cfs_percpt_free(ptl->ptl_mtables);
+ ptl->ptl_mtables = NULL;
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+ struct lnet_match_table *mtable;
+ struct list_head *mhash;
+ int i;
+ int j;
+
+ ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(struct lnet_match_table));
+ if (ptl->ptl_mtables == NULL) {
+ CERROR("Failed to create match table for portal %d\n", index);
+ return -ENOMEM;
+ }
+
+ ptl->ptl_index = index;
+ INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+ INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+ spin_lock_init(&ptl->ptl_lock);
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+ sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ if (mhash == NULL) {
+ CERROR("Failed to create match hash for portal %d\n",
+ index);
+ goto failed;
+ }
+
+ memset(&mtable->mt_exhausted[0], -1,
+ sizeof(mtable->mt_exhausted[0]) *
+ LNET_MT_EXHAUSTED_BMAP);
+ mtable->mt_mhash = mhash;
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+ INIT_LIST_HEAD(&mhash[j]);
+
+ mtable->mt_portal = index;
+ mtable->mt_cpt = i;
+ }
+
+ return 0;
+ failed:
+ lnet_ptl_cleanup(ptl);
+ return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+ int i;
+
+ if (the_lnet.ln_portals == NULL)
+ return;
+
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+ cfs_array_free(the_lnet.ln_portals);
+ the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+ int size;
+ int i;
+
+ size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+ the_lnet.ln_nportals = MAX_PORTALS;
+ the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+ if (the_lnet.ln_portals == NULL) {
+ CERROR("Failed to allocate portals table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < the_lnet.ln_nportals; i++) {
+ if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+ lnet_portals_destroy();
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+ LIST_HEAD (zombies);
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ if (!lnet_ptl_is_lazy(ptl)) {
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+ return 0;
+ }
+
+ if (the_lnet.ln_shutdown)
+ CWARN("Active lazy portal %d on exit\n", portal);
+ else
+ CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+ /* grab all the blocked messages atomically */
+ list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+ lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 000000000000..670dae34107c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lo.c
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ LASSERT (!lntmsg->msg_routing);
+ LASSERT (!lntmsg->msg_target_is_router);
+
+ return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ lnet_msg_t *sendmsg = private;
+
+ if (lntmsg != NULL) { /* not discarding */
+ if (sendmsg->msg_iov != NULL) {
+ if (iov != NULL)
+ lnet_copy_iov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_iov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ } else {
+ if (iov != NULL)
+ lnet_copy_kiov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_kiov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ }
+
+ lnet_finalize(ni, lntmsg, 0);
+ }
+
+ lnet_finalize(ni, sendmsg, 0);
+ return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+ CDEBUG (D_NET, "shutdown\n");
+ LASSERT (lolnd_instanced);
+
+ lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+ LASSERT (ni->ni_lnd == &the_lolnd);
+ LASSERT (!lolnd_instanced);
+ lolnd_instanced = 1;
+
+ return (0);
+}
+
+lnd_t the_lolnd = {
+ /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+ /* .lnd_refcount = */ 0,
+ /* .lnd_type = */ LOLND,
+ /* .lnd_startup = */ lolnd_startup,
+ /* .lnd_shutdown = */ lolnd_shutdown,
+ /* .lnt_ctl = */ NULL,
+ /* .lnd_send = */ lolnd_send,
+ /* .lnd_recv = */ lolnd_recv,
+ /* .lnd_eager_recv = */ NULL,
+ /* .lnd_notify = */ NULL,
+ /* .lnd_accept = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 000000000000..c8323854580a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/module.c
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+ "configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
+int
+lnet_configure (void *arg)
+{
+ /* 'arg' only there so I can be passed to cfs_create_thread() */
+ int rc = 0;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (!the_lnet.ln_niinit_self) {
+ rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+ if (rc >= 0) {
+ the_lnet.ln_niinit_self = 1;
+ rc = 0;
+ }
+ }
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return rc;
+}
+
+int
+lnet_unconfigure (void)
+{
+ int refcount;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (the_lnet.ln_niinit_self) {
+ the_lnet.ln_niinit_self = 0;
+ LNetNIFini();
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+ refcount = the_lnet.ln_refcount;
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return (refcount == 0) ? 0 : -EBUSY;
+}
+
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+ int rc;
+
+ switch (cmd) {
+ case IOC_LIBCFS_CONFIGURE:
+ return lnet_configure(NULL);
+
+ case IOC_LIBCFS_UNCONFIGURE:
+ return lnet_unconfigure();
+
+ default:
+ /* Passing LNET_PID_ANY only gives me a ref if the net is up
+ * already; I'll need it to ensure the net can't go down while
+ * I'm called into it */
+ rc = LNetNIInit(LNET_PID_ANY);
+ if (rc >= 0) {
+ rc = LNetCtl(cmd, data);
+ LNetNIFini();
+ }
+ return rc;
+ }
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{
+ int rc;
+ ENTRY;
+
+ mutex_init(&lnet_config_mutex);
+
+ rc = LNetInit();
+ if (rc != 0) {
+ CERROR("LNetInit: error %d\n", rc);
+ RETURN(rc);
+ }
+
+ rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+ LASSERT (rc == 0);
+
+ if (config_on_load) {
+ /* Have to schedule a separate thread to avoid deadlocking
+ * in modload */
+ (void) kthread_run(lnet_configure, NULL, "lnet_initd");
+ }
+
+ RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+ int rc;
+
+ rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+ LASSERT (rc == 0);
+
+ LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 000000000000..286977691393
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+int
+lnet_peer_tables_create(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ptable));
+ if (the_lnet.ln_peer_tables == NULL) {
+ CERROR("Failed to allocate cpu-partition peer tables\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+ LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+ LNET_PEER_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create peer hash table\n");
+ lnet_peer_tables_destroy();
+ return -ENOMEM;
+ }
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ INIT_LIST_HEAD(&hash[j]);
+ ptable->pt_hash = hash; /* sign of initialization */
+ }
+
+ return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ if (the_lnet.ln_peer_tables == NULL)
+ return;
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ hash = ptable->pt_hash;
+ if (hash == NULL) /* not intialized */
+ break;
+
+ LASSERT(list_empty(&ptable->pt_deathrow));
+
+ ptable->pt_hash = NULL;
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ LASSERT(list_empty(&hash[j]));
+
+ LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+ }
+
+ cfs_percpt_free(the_lnet.ln_peer_tables);
+ the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+ struct lnet_peer_table *ptable;
+ int i;
+ int j;
+
+ LASSERT(the_lnet.ln_shutdown); /* i.e. no new peers */
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ lnet_net_lock(i);
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+ struct list_head *peers = &ptable->pt_hash[j];
+
+ while (!list_empty(peers)) {
+ lnet_peer_t *lp = list_entry(peers->next,
+ lnet_peer_t,
+ lp_hashlist);
+ list_del_init(&lp->lp_hashlist);
+ /* lose hash table's ref */
+ lnet_peer_decref_locked(lp);
+ }
+ }
+
+ lnet_net_unlock(i);
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ LIST_HEAD (deathrow);
+ lnet_peer_t *lp;
+
+ lnet_net_lock(i);
+
+ for (j = 3; ptable->pt_number != 0; j++) {
+ lnet_net_unlock(i);
+
+ if ((j & (j - 1)) == 0) {
+ CDEBUG(D_WARNING,
+ "Waiting for %d peers on peer table\n",
+ ptable->pt_number);
+ }
+ cfs_pause(cfs_time_seconds(1) / 2);
+ lnet_net_lock(i);
+ }
+ list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+ lnet_net_unlock(i);
+
+ while (!list_empty(&deathrow)) {
+ lp = list_entry(deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ LIBCFS_FREE(lp, sizeof(*lp));
+ }
+ }
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+ struct lnet_peer_table *ptable;
+
+ LASSERT(lp->lp_refcount == 0);
+ LASSERT(lp->lp_rtr_refcount == 0);
+ LASSERT(list_empty(&lp->lp_txq));
+ LASSERT(list_empty(&lp->lp_hashlist));
+ LASSERT(lp->lp_txqnob == 0);
+
+ ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+ LASSERT(ptable->pt_number > 0);
+ ptable->pt_number--;
+
+ lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+ lp->lp_ni = NULL;
+
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+ struct list_head *peers;
+ lnet_peer_t *lp;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+ list_for_each_entry(lp, peers, lp_hashlist) {
+ if (lp->lp_nid == nid) {
+ lnet_peer_addref_locked(lp);
+ return lp;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+ struct lnet_peer_table *ptable;
+ lnet_peer_t *lp = NULL;
+ lnet_peer_t *lp2;
+ int cpt2;
+ int rc = 0;
+
+ *lpp = NULL;
+ if (the_lnet.ln_shutdown) /* it's shutting down */
+ return -ESHUTDOWN;
+
+ /* cpt can be LNET_LOCK_EX if it's called from router functions */
+ cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+ ptable = the_lnet.ln_peer_tables[cpt2];
+ lp = lnet_find_peer_locked(ptable, nid);
+ if (lp != NULL) {
+ *lpp = lp;
+ return 0;
+ }
+
+ if (!list_empty(&ptable->pt_deathrow)) {
+ lp = list_entry(ptable->pt_deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ }
+
+ /*
+ * take extra refcount in case another thread has shutdown LNet
+ * and destroyed locks and peer-table before I finish the allocation
+ */
+ ptable->pt_number++;
+ lnet_net_unlock(cpt);
+
+ if (lp != NULL)
+ memset(lp, 0, sizeof(*lp));
+ else
+ LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+ if (lp == NULL) {
+ rc = -ENOMEM;
+ lnet_net_lock(cpt);
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&lp->lp_txq);
+ INIT_LIST_HEAD(&lp->lp_rtrq);
+ INIT_LIST_HEAD(&lp->lp_routes);
+
+ lp->lp_notify = 0;
+ lp->lp_notifylnd = 0;
+ lp->lp_notifying = 0;
+ lp->lp_alive_count = 0;
+ lp->lp_timestamp = 0;
+ lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+ lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+ lp->lp_last_query = 0; /* haven't asked NI yet */
+ lp->lp_ping_timestamp = 0;
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ lp->lp_nid = nid;
+ lp->lp_cpt = cpt2;
+ lp->lp_refcount = 2; /* 1 for caller; 1 for hash */
+ lp->lp_rtr_refcount = 0;
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
+
+ lp2 = lnet_find_peer_locked(ptable, nid);
+ if (lp2 != NULL) {
+ *lpp = lp2;
+ goto out;
+ }
+
+ lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+ if (lp->lp_ni == NULL) {
+ rc = -EHOSTUNREACH;
+ goto out;
+ }
+
+ lp->lp_txcredits =
+ lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+ lp->lp_rtrcredits =
+ lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+ list_add_tail(&lp->lp_hashlist,
+ &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+ ptable->pt_version++;
+ *lpp = lp;
+
+ return 0;
+out:
+ if (lp != NULL)
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+ ptable->pt_number--;
+ return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+ char *aliveness = "NA";
+ lnet_peer_t *lp;
+ int rc;
+ int cpt;
+
+ cpt = lnet_cpt_of_nid(nid);
+ lnet_net_lock(cpt);
+
+ rc = lnet_nid2peer_locked(&lp, nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+ return;
+ }
+
+ if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+ aliveness = lp->lp_alive ? "up" : "down";
+
+ CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+ libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+ aliveness, lp->lp_ni->ni_peertxcredits,
+ lp->lp_rtrcredits, lp->lp_minrtrcredits,
+ lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 000000000000..c5ff97aaacc3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -0,0 +1,1693 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */
+#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */
+#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */
+#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+ "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+ "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+ "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+ "# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+ "# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+ "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ /* NI option overrides LNet default */
+ if (ni->ni_peerrtrcredits > 0)
+ return ni->ni_peerrtrcredits;
+ if (peer_buffer_credits > 0)
+ return peer_buffer_credits;
+
+ /* As an approximation, allow this peer the same number of router
+ * buffers as it is allowed outstanding sends */
+ return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ return 0;
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+ "Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+ "Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+ "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+ "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+ "Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+ return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+ if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+ CDEBUG(D_NET, "Out of date\n");
+ return;
+ }
+
+ lp->lp_timestamp = when; /* update timestamp */
+ lp->lp_ping_deadline = 0; /* disable ping timeout */
+
+ if (lp->lp_alive_count != 0 && /* got old news */
+ (!lp->lp_alive) == (!alive)) { /* new date for old news */
+ CDEBUG(D_NET, "Old news\n");
+ return;
+ }
+
+ /* Flag that notification is outstanding */
+
+ lp->lp_alive_count++;
+ lp->lp_alive = !(!alive); /* 1 bit! */
+ lp->lp_notify = 1;
+ lp->lp_notifylnd |= notifylnd;
+ if (lp->lp_alive)
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+ CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ int alive;
+ int notifylnd;
+
+ /* Notify only in 1 thread at any time to ensure ordered notification.
+ * NB individual events can be missed; the only guarantee is that you
+ * always get the most recent news */
+
+ if (lp->lp_notifying)
+ return;
+
+ lp->lp_notifying = 1;
+
+ while (lp->lp_notify) {
+ alive = lp->lp_alive;
+ notifylnd = lp->lp_notifylnd;
+
+ lp->lp_notifylnd = 0;
+ lp->lp_notify = 0;
+
+ if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+ lnet_net_unlock(lp->lp_cpt);
+
+ /* A new notification could happen now; I'll handle it
+ * when control returns to me */
+
+ (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+ lnet_net_lock(lp->lp_cpt);
+ }
+ }
+
+ lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount >= 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount++;
+ if (lp->lp_rtr_refcount == 1) {
+ struct list_head *pos;
+
+ /* a simple insertion sort */
+ list_for_each_prev(pos, &the_lnet.ln_routers) {
+ lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+ lp_rtr_list);
+
+ if (rtr->lp_nid < lp->lp_nid)
+ break;
+ }
+
+ list_add(&lp->lp_rtr_list, pos);
+ /* addref for the_lnet.ln_routers */
+ lnet_peer_addref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount > 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount--;
+ if (lp->lp_rtr_refcount == 0) {
+ LASSERT(list_empty(&lp->lp_routes));
+
+ if (lp->lp_rcd != NULL) {
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+
+ list_del(&lp->lp_rtr_list);
+ /* decref for the_lnet.ln_routers */
+ lnet_peer_decref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+ lnet_remotenet_t *rnet;
+ struct list_head *tmp;
+ struct list_head *rn_list;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ rn_list = lnet_net2rnethash(net);
+ list_for_each(tmp, rn_list) {
+ rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == net)
+ return rnet;
+ }
+ return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+ static int seeded = 0;
+ int lnd_type, seed[2];
+ struct timeval tv;
+ lnet_ni_t *ni;
+ struct list_head *tmp;
+
+ if (seeded)
+ return;
+
+ cfs_get_random_bytes(seed, sizeof(seed));
+
+ /* Nodes with small feet have little entropy
+ * the NID for this node gives the most entropy in the low bits */
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ if (lnd_type != LOLND)
+ seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+ }
+
+ do_gettimeofday(&tv);
+ cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+ seeded = 1;
+ return;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+ unsigned int len = 0;
+ unsigned int offset = 0;
+ struct list_head *e;
+
+ lnet_shuffle_seed();
+
+ list_for_each (e, &rnet->lrn_routes) {
+ len++;
+ }
+
+ /* len+1 positions to add a new entry, also prevents division by 0 */
+ offset = cfs_rand() % (len + 1);
+ list_for_each (e, &rnet->lrn_routes) {
+ if (offset == 0)
+ break;
+ offset--;
+ }
+ list_add(&route->lr_list, e);
+ list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+ the_lnet.ln_remote_nets_version++;
+ lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+ struct list_head *e;
+ lnet_remotenet_t *rnet;
+ lnet_remotenet_t *rnet2;
+ lnet_route_t *route;
+ lnet_ni_t *ni;
+ int add_route;
+ int rc;
+
+ CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+ if (gateway == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+ net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND ||
+ LNET_NIDNET(gateway) == net ||
+ hops < 1 || hops > 255)
+ return (-EINVAL);
+
+ if (lnet_islocalnet(net)) /* it's a local network */
+ return 0; /* ignore the route entry */
+
+ /* Assume net, route, all new */
+ LIBCFS_ALLOC(route, sizeof(*route));
+ LIBCFS_ALLOC(rnet, sizeof(*rnet));
+ if (route == NULL || rnet == NULL) {
+ CERROR("Out of memory creating route %s %d %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+ if (route != NULL)
+ LIBCFS_FREE(route, sizeof(*route));
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&rnet->lrn_routes);
+ rnet->lrn_net = net;
+ route->lr_hops = hops;
+ route->lr_net = net;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+ if (rc != 0) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+ return 0; /* ignore the route entry */
+ } else {
+ CERROR("Error %d creating route %s %d %s\n", rc,
+ libcfs_net2str(net), hops,
+ libcfs_nid2str(gateway));
+ }
+ return rc;
+ }
+
+ LASSERT (!the_lnet.ln_shutdown);
+
+ rnet2 = lnet_find_net_locked(net);
+ if (rnet2 == NULL) {
+ /* new network */
+ list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+ rnet2 = rnet;
+ }
+
+ /* Search for a duplicate route (it's a NOOP if it is) */
+ add_route = 1;
+ list_for_each (e, &rnet2->lrn_routes) {
+ lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+ if (route2->lr_gateway == route->lr_gateway) {
+ add_route = 0;
+ break;
+ }
+
+ /* our lookups must be true */
+ LASSERT (route2->lr_gateway->lp_nid != gateway);
+ }
+
+ if (add_route) {
+ lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+ lnet_add_route_to_rnet(rnet2, route);
+
+ ni = route->lr_gateway->lp_ni;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* XXX Assume alive */
+ if (ni->ni_lnd->lnd_notify != NULL)
+ (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ /* -1 for notify or !add_route */
+ lnet_peer_decref_locked(route->lr_gateway);
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (!add_route)
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != rnet2)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ return 0;
+}
+
+int
+lnet_check_routes(void)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ lnet_route_t *route2;
+ struct list_head *e1;
+ struct list_head *e2;
+ int cpt;
+ struct list_head *rn_list;
+ int i;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ route2 = NULL;
+ list_for_each(e2, &rnet->lrn_routes) {
+ lnet_nid_t nid1;
+ lnet_nid_t nid2;
+ int net;
+
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (route2 == NULL) {
+ route2 = route;
+ continue;
+ }
+
+ if (route->lr_gateway->lp_ni ==
+ route2->lr_gateway->lp_ni)
+ continue;
+
+ nid1 = route->lr_gateway->lp_nid;
+ nid2 = route2->lr_gateway->lp_nid;
+ net = rnet->lrn_net;
+
+ lnet_net_unlock(cpt);
+
+ CERROR("Routes to %s via %s and %s not "
+ "supported\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid1),
+ libcfs_nid2str(nid2));
+ return -EINVAL;
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+ struct lnet_peer *gateway;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ struct list_head *e1;
+ struct list_head *e2;
+ int rc = -ENOENT;
+ struct list_head *rn_list;
+ int idx = 0;
+
+ CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+ libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+ /* NB Caller may specify either all routes via the given gateway
+ * or a specific route entry actual NIDs) */
+
+ lnet_net_lock(LNET_LOCK_EX);
+ if (net == LNET_NIDNET(LNET_NID_ANY))
+ rn_list = &the_lnet.ln_remote_nets_hash[0];
+ else
+ rn_list = lnet_net2rnethash(net);
+
+ again:
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+ net == rnet->lrn_net))
+ continue;
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t, lr_list);
+
+ gateway = route->lr_gateway;
+ if (!(gw_nid == LNET_NID_ANY ||
+ gw_nid == gateway->lp_nid))
+ continue;
+
+ list_del(&route->lr_list);
+ list_del(&route->lr_gwlist);
+ the_lnet.ln_remote_nets_version++;
+
+ if (list_empty(&rnet->lrn_routes))
+ list_del(&rnet->lrn_list);
+ else
+ rnet = NULL;
+
+ lnet_rtr_decref_locked(gateway);
+ lnet_peer_decref_locked(gateway);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ rc = 0;
+ lnet_net_lock(LNET_LOCK_EX);
+ goto again;
+ }
+ }
+
+ if (net == LNET_NIDNET(LNET_NID_ANY) &&
+ ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+ rn_list = &the_lnet.ln_remote_nets_hash[idx];
+ goto again;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{
+ lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+ lnet_nid_t *gateway, __u32 *alive)
+{
+ struct list_head *e1;
+ struct list_head *e2;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ int cpt;
+ int i;
+ struct list_head *rn_list;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (idx-- == 0) {
+ *net = rnet->lrn_net;
+ *hops = route->lr_hops;
+ *gateway = route->lr_gateway->lp_nid;
+ *alive = route->lr_gateway->lp_alive;
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+ int i;
+ lnet_ni_status_t *stat;
+
+ __swab32s(&info->pi_magic);
+ __swab32s(&info->pi_features);
+ __swab32s(&info->pi_pid);
+ __swab32s(&info->pi_nnis);
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ stat = &info->pi_ni[i];
+ __swab64s(&stat->ns_nid);
+ __swab32s(&stat->ns_status);
+ }
+ return;
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+ lnet_ping_info_t *info = rcd->rcd_pinginfo;
+ struct lnet_peer *gw = rcd->rcd_gateway;
+ lnet_route_t *rtr;
+
+ if (!gw->lp_alive)
+ return;
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+ lnet_swap_pinginfo(info);
+
+ /* NB always racing with network! */
+ if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+ libcfs_nid2str(gw->lp_nid), info->pi_magic);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ gw->lp_ping_feats = info->pi_features;
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+ CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+ return; /* nothing I can understand */
+ }
+
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+ return; /* can't carry NI status info */
+
+ list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+ int ptl_status = LNET_NI_STATUS_INVALID;
+ int down = 0;
+ int up = 0;
+ int i;
+
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ lnet_ni_status_t *stat = &info->pi_ni[i];
+ lnet_nid_t nid = stat->ns_nid;
+
+ if (nid == LNET_NID_ANY) {
+ CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+ libcfs_nid2str(gw->lp_nid));
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ continue;
+
+ if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+ if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+ down++;
+ else if (ptl_status != LNET_NI_STATUS_UP)
+ ptl_status = LNET_NI_STATUS_DOWN;
+ continue;
+ }
+
+ if (stat->ns_status == LNET_NI_STATUS_UP) {
+ if (LNET_NIDNET(nid) == rtr->lr_net) {
+ up = 1;
+ break;
+ }
+ /* ptl NIs are considered down only when
+ * they're all down */
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+ ptl_status = LNET_NI_STATUS_UP;
+ continue;
+ }
+
+ CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), stat->ns_status);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (up) { /* ignore downed NIs if NI for dest network is up */
+ rtr->lr_downis = 0;
+ continue;
+ }
+ rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+ }
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+ lnet_rc_data_t *rcd = event->md.user_ptr;
+ struct lnet_peer *lp;
+
+ LASSERT(rcd != NULL);
+
+ if (event->unlinked) {
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ return;
+ }
+
+ LASSERT(event->type == LNET_EVENT_SEND ||
+ event->type == LNET_EVENT_REPLY);
+
+ lp = rcd->rcd_gateway;
+ LASSERT(lp != NULL);
+
+ /* NB: it's called with holding lnet_res_lock, we have a few
+ * places need to hold both locks at the same time, please take
+ * care of lock ordering */
+ lnet_net_lock(lp->lp_cpt);
+ if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+ /* ignore if no longer a router or rcd is replaced */
+ goto out;
+ }
+
+ if (event->type == LNET_EVENT_SEND) {
+ lp->lp_ping_notsent = 0;
+ if (event->status == 0)
+ goto out;
+ }
+
+ /* LNET_EVENT_REPLY */
+ /* A successful REPLY means the router is up. If _any_ comms
+ * to the router fail I assume it's down (this will happen if
+ * we ping alive routers to try to detect router death before
+ * apps get burned). */
+
+ lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+ /* The router checker will wake up very shortly and do the
+ * actual notification.
+ * XXX If 'lp' stops being a router before then, it will still
+ * have the notification pending!!! */
+
+ if (avoid_asym_router_failure && event->status == 0)
+ lnet_parse_rc_info(rcd);
+
+ out:
+ lnet_net_unlock(lp->lp_cpt);
+}
+
+void
+lnet_wait_known_routerstate(void)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+ int all_known;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ for (;;) {
+ int cpt = lnet_net_lock_current();
+
+ all_known = 1;
+ list_for_each (entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ if (rtr->lp_alive_count == 0) {
+ all_known = 0;
+ break;
+ }
+ }
+
+ lnet_net_unlock(cpt);
+
+ if (all_known)
+ return;
+
+ cfs_pause(cfs_time_seconds(1));
+ }
+}
+
+void
+lnet_update_ni_status_locked(void)
+{
+ lnet_ni_t *ni;
+ long now;
+ int timeout;
+
+ LASSERT(the_lnet.ln_routing);
+
+ timeout = router_ping_timeout +
+ MAX(live_router_check_interval, dead_router_check_interval);
+
+ now = cfs_time_current_sec();
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ if (ni->ni_lnd->lnd_type == LOLND)
+ continue;
+
+ if (now < ni->ni_last_alive + timeout)
+ continue;
+
+ lnet_ni_lock(ni);
+ /* re-check with lock */
+ if (now < ni->ni_last_alive + timeout) {
+ lnet_ni_unlock(ni);
+ continue;
+ }
+
+ LASSERT(ni->ni_status != NULL);
+
+ if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+ CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+ libcfs_nid2str(ni->ni_nid), timeout);
+ /* NB: so far, this is the only place to set
+ * NI status to "down" */
+ ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+ }
+ lnet_ni_unlock(ni);
+ }
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+ LASSERT(list_empty(&rcd->rcd_list));
+ /* detached from network */
+ LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+ if (rcd->rcd_gateway != NULL) {
+ int cpt = rcd->rcd_gateway->lp_cpt;
+
+ lnet_net_lock(cpt);
+ lnet_peer_decref_locked(rcd->rcd_gateway);
+ lnet_net_unlock(cpt);
+ }
+
+ if (rcd->rcd_pinginfo != NULL)
+ LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+ LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+ lnet_rc_data_t *rcd = NULL;
+ lnet_ping_info_t *pi;
+ int rc;
+ int i;
+
+ lnet_net_unlock(gateway->lp_cpt);
+
+ LIBCFS_ALLOC(rcd, sizeof(*rcd));
+ if (rcd == NULL)
+ goto out;
+
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ INIT_LIST_HEAD(&rcd->rcd_list);
+
+ LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+ if (pi == NULL)
+ goto out;
+
+ memset(pi, 0, LNET_PINGINFO_SIZE);
+ for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+ pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+ pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+ }
+ rcd->rcd_pinginfo = pi;
+
+ LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+ rc = LNetMDBind((lnet_md_t){.start = pi,
+ .user_ptr = rcd,
+ .length = LNET_PINGINFO_SIZE,
+ .threshold = LNET_MD_THRESH_INF,
+ .options = LNET_MD_TRUNCATE,
+ .eq_handle = the_lnet.ln_rc_eqh},
+ LNET_UNLINK,
+ &rcd->rcd_mdh);
+ if (rc < 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out;
+ }
+ LASSERT(rc == 0);
+
+ lnet_net_lock(gateway->lp_cpt);
+ /* router table changed or someone has created rcd for this gateway */
+ if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+ lnet_net_unlock(gateway->lp_cpt);
+ goto out;
+ }
+
+ lnet_peer_addref_locked(gateway);
+ rcd->rcd_gateway = gateway;
+ gateway->lp_rcd = rcd;
+ gateway->lp_ping_notsent = 0;
+
+ return rcd;
+
+ out:
+ if (rcd != NULL) {
+ if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+ rc = LNetMDUnlink(rcd->rcd_mdh);
+ LASSERT(rc == 0);
+ }
+ lnet_destroy_rc_data(rcd);
+ }
+
+ lnet_net_lock(gateway->lp_cpt);
+ return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+ int secs;
+
+ secs = rtr->lp_alive ? live_router_check_interval :
+ dead_router_check_interval;
+ if (secs < 0)
+ secs = 0;
+
+ return secs;
+}
+
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+ lnet_rc_data_t *rcd = NULL;
+ cfs_time_t now = cfs_time_current();
+ int secs;
+
+ lnet_peer_addref_locked(rtr);
+
+ if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+ cfs_time_after(now, rtr->lp_ping_deadline))
+ lnet_notify_locked(rtr, 1, 0, now);
+
+ /* Run any outstanding notifications */
+ lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+ if (!lnet_isrouter(rtr) ||
+ the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router table changed or router checker is shutting down */
+ lnet_peer_decref_locked(rtr);
+ return;
+ }
+
+ rcd = rtr->lp_rcd != NULL ?
+ rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+ if (rcd == NULL)
+ return;
+
+ secs = lnet_router_check_interval(rtr);
+
+ CDEBUG(D_NET,
+ "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+ "alive_count %d lp_ping_timestamp %lu\n",
+ libcfs_nid2str(rtr->lp_nid), secs,
+ rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+ rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+ if (secs != 0 && !rtr->lp_ping_notsent &&
+ cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+ cfs_time_seconds(secs)))) {
+ int rc;
+ lnet_process_id_t id;
+ lnet_handle_md_t mdh;
+
+ id.nid = rtr->lp_nid;
+ id.pid = LUSTRE_SRV_LNET_PID;
+ CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+ rtr->lp_ping_notsent = 1;
+ rtr->lp_ping_timestamp = now;
+
+ mdh = rcd->rcd_mdh;
+
+ if (rtr->lp_ping_deadline == 0) {
+ rtr->lp_ping_deadline =
+ cfs_time_shift(router_ping_timeout);
+ }
+
+ lnet_net_unlock(rtr->lp_cpt);
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ lnet_net_lock(rtr->lp_cpt);
+ if (rc != 0)
+ rtr->lp_ping_notsent = 0; /* no event pending */
+ }
+
+ lnet_peer_decref_locked(rtr);
+ return;
+}
+
+int
+lnet_router_checker_start(void)
+{
+ int rc;
+ int eqsz;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ if (check_routers_before_use &&
+ dead_router_check_interval <= 0) {
+ LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
+ " set if 'check_routers_before_use' is set"
+ "\n");
+ return -EINVAL;
+ }
+
+ if (!the_lnet.ln_routing &&
+ live_router_check_interval <= 0 &&
+ dead_router_check_interval <= 0)
+ return 0;
+
+ sema_init(&the_lnet.ln_rc_signal, 0);
+ /* EQ size doesn't matter; the callback is guaranteed to get every
+ * event */
+ eqsz = 0;
+ rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+ &the_lnet.ln_rc_eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+ rc = PTR_ERR(kthread_run(lnet_router_checker,
+ NULL, "router_checker"));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("Can't start router checker thread: %d\n", rc);
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT(rc == 0);
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ return -ENOMEM;
+ }
+
+ if (check_routers_before_use) {
+ /* Note that a helpful side-effect of pinging all known routers
+ * at startup is that it makes them drop stale connections they
+ * may have to a previous instance of me. */
+ lnet_wait_known_routerstate();
+ }
+
+ return 0;
+}
+
+void
+lnet_router_checker_stop (void)
+{
+ int rc;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+ return;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+ the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT (rc == 0);
+ return;
+}
+
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+ lnet_rc_data_t *rcd;
+ lnet_rc_data_t *tmp;
+ lnet_peer_t *lp;
+ struct list_head head;
+ int i = 2;
+
+ if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+ list_empty(&the_lnet.ln_rcd_deathrow) &&
+ list_empty(&the_lnet.ln_rcd_zombie)))
+ return;
+
+ INIT_LIST_HEAD(&head);
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router checker is stopping, prune all */
+ list_for_each_entry(lp, &the_lnet.ln_routers,
+ lp_rtr_list) {
+ if (lp->lp_rcd == NULL)
+ continue;
+
+ LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+ }
+
+ /* unlink all RCDs on deathrow list */
+ list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+ if (!list_empty(&head)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ list_for_each_entry(rcd, &head, rcd_list)
+ LNetMDUnlink(rcd->rcd_mdh);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+ /* release all zombie RCDs */
+ while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+ list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+ rcd_list) {
+ if (LNetHandleIsInvalid(rcd->rcd_mdh))
+ list_move(&rcd->rcd_list, &head);
+ }
+
+ wait_unlink = wait_unlink &&
+ !list_empty(&the_lnet.ln_rcd_zombie);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ while (!list_empty(&head)) {
+ rcd = list_entry(head.next,
+ lnet_rc_data_t, rcd_list);
+ list_del_init(&rcd->rcd_list);
+ lnet_destroy_rc_data(rcd);
+ }
+
+ if (!wait_unlink)
+ return;
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for rc buffers to unlink\n");
+ cfs_pause(cfs_time_seconds(1) / 4);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if defined(LNET_ROUTER)
+
+static int
+lnet_router_checker(void *arg)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+
+ cfs_block_allsigs();
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+ __u64 version;
+ int cpt;
+ int cpt2;
+
+ cpt = lnet_net_lock_current();
+rescan:
+ version = the_lnet.ln_routers_version;
+
+ list_for_each(entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+ if (cpt != cpt2) {
+ lnet_net_unlock(cpt);
+ cpt = cpt2;
+ lnet_net_lock(cpt);
+ /* the routers list has changed */
+ if (version != the_lnet.ln_routers_version)
+ goto rescan;
+ }
+
+ lnet_ping_router_locked(rtr);
+
+ /* NB dropped lock */
+ if (version != the_lnet.ln_routers_version) {
+ /* the routers list has changed */
+ goto rescan;
+ }
+ }
+
+ if (the_lnet.ln_routing)
+ lnet_update_ni_status_locked();
+
+ lnet_net_unlock(cpt);
+
+ lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+ /* Call cfs_pause() here always adds 1 to load average
+ * because kernel counts # active tasks as nr_running
+ * + nr_uninterruptible. */
+ schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+ cfs_time_seconds(1));
+ }
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+ lnet_prune_rc_data(1); /* wait for UNLINK */
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ up(&the_lnet.ln_rc_signal);
+ /* The unlink event callback will signal final completion */
+ return 0;
+}
+
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+ while (--npages >= 0)
+ __free_page(rb->rb_kiov[npages].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+}
+
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+ int npages = rbp->rbp_npages;
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+ struct page *page;
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+ if (rb == NULL)
+ return NULL;
+
+ rb->rb_pool = rbp;
+
+ for (i = 0; i < npages; i++) {
+ page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+ __GFP_ZERO | GFP_IOFS);
+ if (page == NULL) {
+ while (--i >= 0)
+ __free_page(rb->rb_kiov[i].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+ return NULL;
+ }
+
+ rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+ rb->rb_kiov[i].kiov_offset = 0;
+ rb->rb_kiov[i].kiov_page = page;
+ }
+
+ return rb;
+}
+
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+ int npages = rbp->rbp_npages;
+ int nbuffers = 0;
+ lnet_rtrbuf_t *rb;
+
+ if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+ return;
+
+ LASSERT (list_empty(&rbp->rbp_msgs));
+ LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+ while (!list_empty(&rbp->rbp_bufs)) {
+ LASSERT (rbp->rbp_credits > 0);
+
+ rb = list_entry(rbp->rbp_bufs.next,
+ lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+ lnet_destroy_rtrbuf(rb, npages);
+ nbuffers++;
+ }
+
+ LASSERT (rbp->rbp_nbuffers == nbuffers);
+ LASSERT (rbp->rbp_credits == nbuffers);
+
+ rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ if (rbp->rbp_nbuffers != 0) {
+ LASSERT (rbp->rbp_nbuffers == nbufs);
+ return 0;
+ }
+
+ for (i = 0; i < nbufs; i++) {
+ rb = lnet_new_rtrbuf(rbp, cpt);
+
+ if (rb == NULL) {
+ CERROR("Failed to allocate %d router bufs of %d pages\n",
+ nbufs, rbp->rbp_npages);
+ return -ENOMEM;
+ }
+
+ rbp->rbp_nbuffers++;
+ rbp->rbp_credits++;
+ rbp->rbp_mincredits++;
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+ /* No allocation "under fire" */
+ /* Otherwise we'd need code to schedule blocked msgs etc */
+ LASSERT (!the_lnet.ln_routing);
+ }
+
+ LASSERT (rbp->rbp_credits == nbufs);
+ return 0;
+}
+
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+ INIT_LIST_HEAD(&rbp->rbp_msgs);
+ INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+ rbp->rbp_npages = npages;
+ rbp->rbp_credits = 0;
+ rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int i;
+
+ if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+ return;
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_free_bufs(&rtrp[0]);
+ lnet_rtrpool_free_bufs(&rtrp[1]);
+ lnet_rtrpool_free_bufs(&rtrp[2]);
+ }
+
+ cfs_percpt_free(the_lnet.ln_rtrpools);
+ the_lnet.ln_rtrpools = NULL;
+}
+
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+ int nrbs = LNET_NRB_TINY;
+
+ if (tiny_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "tiny_router_buffers=%d invalid when "
+ "routing enabled\n", tiny_router_buffers);
+ return -1;
+ }
+
+ if (tiny_router_buffers > 0)
+ nrbs = tiny_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+ int nrbs = LNET_NRB_SMALL;
+
+ if (small_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "small_router_buffers=%d invalid when "
+ "routing enabled\n", small_router_buffers);
+ return -1;
+ }
+
+ if (small_router_buffers > 0)
+ nrbs = small_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+ int nrbs = LNET_NRB_LARGE;
+
+ if (large_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "large_router_buffers=%d invalid when "
+ "routing enabled\n", large_router_buffers);
+ return -1;
+ }
+
+ if (large_router_buffers > 0)
+ nrbs = large_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int small_pages = 1;
+ int nrb_tiny;
+ int nrb_small;
+ int nrb_large;
+ int rc;
+ int i;
+
+ if (!strcmp(forwarding, "")) {
+ /* not set either way */
+ if (!im_a_router)
+ return 0;
+ } else if (!strcmp(forwarding, "disabled")) {
+ /* explicitly disabled */
+ return 0;
+ } else if (!strcmp(forwarding, "enabled")) {
+ /* explicitly enabled */
+ } else {
+ LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+ "'enabled' or 'disabled'\n");
+ return -EINVAL;
+ }
+
+ nrb_tiny = lnet_nrb_tiny_calculate(0);
+ if (nrb_tiny < 0)
+ return -EINVAL;
+
+ nrb_small = lnet_nrb_small_calculate(small_pages);
+ if (nrb_small < 0)
+ return -EINVAL;
+
+ nrb_large = lnet_nrb_large_calculate(large_pages);
+ if (nrb_large < 0)
+ return -EINVAL;
+
+ the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+ LNET_NRBPOOLS *
+ sizeof(lnet_rtrbufpool_t));
+ if (the_lnet.ln_rtrpools == NULL) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "Failed to initialize router buffe pool\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_init(&rtrp[0], 0);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[1], small_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[2], large_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+ if (rc != 0)
+ goto failed;
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_routing = 1;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return 0;
+
+ failed:
+ lnet_rtrpools_free();
+ return rc;
+}
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+ struct lnet_peer *lp = NULL;
+ cfs_time_t now = cfs_time_current();
+ int cpt = lnet_cpt_of_nid(nid);
+
+ LASSERT (!in_interrupt ());
+
+ CDEBUG (D_NET, "%s notifying %s: %s\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid),
+ alive ? "up" : "down");
+
+ if (ni != NULL &&
+ LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+ CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+ libcfs_nid2str(nid), alive ? "birth" : "death",
+ libcfs_nid2str(ni->ni_nid));
+ return -EINVAL;
+ }
+
+ /* can't do predictions... */
+ if (cfs_time_after(when, now)) {
+ CWARN ("Ignoring prediction from %s of %s %s "
+ "%ld seconds in the future\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid), alive ? "up" : "down",
+ cfs_duration_sec(cfs_time_sub(when, now)));
+ return -EINVAL;
+ }
+
+ if (ni != NULL && !alive && /* LND telling me she's down */
+ !auto_down) { /* auto-down disabled */
+ CDEBUG(D_NET, "Auto-down disabled\n");
+ return 0;
+ }
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+ if (lp == NULL) {
+ /* nid not found */
+ lnet_net_unlock(cpt);
+ CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+ return 0;
+ }
+
+ /* We can't fully trust LND on reporting exact peer last_alive
+ * if he notifies us about dead peer. For example ksocklnd can
+ * call us with when == _time_when_the_node_was_booted_ if
+ * no connections were successfully established */
+ if (ni != NULL && !alive && when < lp->lp_last_alive)
+ when = lp->lp_last_alive;
+
+ lnet_notify_locked(lp, ni == NULL, alive, when);
+
+ lnet_ni_notify_locked(ni, lp);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables (void)
+{
+ return;
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+ return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker (void)
+{
+ static time_t last = 0;
+ static int running = 0;
+
+ time_t now = cfs_time_current_sec();
+ int interval = now - last;
+ int rc;
+ __u64 version;
+ lnet_peer_t *rtr;
+
+ /* It's no use to call me again within a sec - all intervals and
+ * timeouts are measured in seconds */
+ if (last != 0 && interval < 2)
+ return;
+
+ if (last != 0 &&
+ interval > MAX(live_router_check_interval,
+ dead_router_check_interval))
+ CNETERR("Checker(%d/%d) not called for %d seconds\n",
+ live_router_check_interval, dead_router_check_interval,
+ interval);
+
+ LASSERT(LNET_CPT_NUMBER == 1);
+
+ lnet_net_lock(0);
+ LASSERT(!running); /* recursion check */
+ running = 1;
+ lnet_net_unlock(0);
+
+ last = now;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+ lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+ /* consume all pending events */
+ while (1) {
+ int i;
+ lnet_event_t ev;
+
+ /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+ * recursion breaker in LNetEQPoll would fail */
+ rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+ if (rc == 0) /* no event pending */
+ break;
+
+ /* NB a lost SENT prevents me from pinging a router again */
+ if (rc == -EOVERFLOW) {
+ CERROR("Dropped an event!!!\n");
+ abort();
+ }
+
+ LASSERT (rc == 1);
+
+ lnet_router_checker_event(&ev);
+ }
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+ lnet_prune_rc_data(1); /* release rcd */
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ running = 0;
+ return;
+ }
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ lnet_net_lock(0);
+
+ version = the_lnet.ln_routers_version;
+ list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+ lnet_ping_router_locked(rtr);
+ LASSERT (version == the_lnet.ln_routers_version);
+ }
+
+ lnet_net_unlock(0);
+
+ running = 0; /* lock only needed for the recursion check */
+ return;
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables (void)
+{
+ char *s;
+
+ s = getenv("LNET_ROUTER_PING_TIMEOUT");
+ if (s != NULL) router_ping_timeout = atoi(s);
+
+ s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+ if (s != NULL) live_router_check_interval = atoi(s);
+
+ s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+ if (s != NULL) dead_router_check_interval = atoi(s);
+
+ /* This replaces old lnd_notify mechanism */
+ check_routers_before_use = 1;
+ if (dead_router_check_interval <= 0)
+ dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+ return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 000000000000..3084b0c75983
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET (0x100)
+enum {
+ PSDEV_LNET_STATS = 100,
+ PSDEV_LNET_ROUTES,
+ PSDEV_LNET_ROUTERS,
+ PSDEV_LNET_PEERS,
+ PSDEV_LNET_BUFFERS,
+ PSDEV_LNET_NIS,
+ PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS (sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \
+ LNET_PROC_CPT_BITS - \
+ LNET_PROC_VER_BITS - \
+ LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos) \
+ (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos) \
+ (int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \
+ (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \
+ ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \
+ ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+ ((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+static int __proc_lnet_stats(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ int rc;
+ lnet_counters_t *ctrs;
+ int len;
+ char *tmpstr;
+ const int tmpsiz = 256; /* 7 %u and 4 LPU64 */
+
+ if (write) {
+ lnet_counters_reset();
+ return 0;
+ }
+
+ /* read */
+
+ LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+ if (ctrs == NULL)
+ return -ENOMEM;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL) {
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return -ENOMEM;
+ }
+
+ lnet_counters_get(ctrs);
+
+ len = snprintf(tmpstr, tmpsiz,
+ "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+ LPU64" "LPU64,
+ ctrs->msgs_alloc, ctrs->msgs_max,
+ ctrs->errors,
+ ctrs->send_count, ctrs->recv_count,
+ ctrs->route_count, ctrs->drop_count,
+ ctrs->send_length, ctrs->recv_length,
+ ctrs->route_length, ctrs->drop_length);
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, "\n");
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+ const int tmpsiz = 256;
+ char *tmpstr;
+ char *s;
+ int rc = 0;
+ int len;
+ int ver;
+ int off;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ CLASSERT(sizeof(loff_t) >= 4);
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+ the_lnet.ln_routing ? "enabled" : "disabled");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+ "net", "hops", "state", "router");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_remote_nets_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *n;
+ struct list_head *r;
+ lnet_route_t *route = NULL;
+ lnet_remotenet_t *rnet = NULL;
+ int skip = off - 1;
+ struct list_head *rn_list;
+ int i;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+ lnet_net_unlock(0);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+ i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+ n = rn_list->next;
+
+ while (n != rn_list && route == NULL) {
+ rnet = list_entry(n, lnet_remotenet_t,
+ lrn_list);
+
+ r = rnet->lrn_routes.next;
+
+ while (r != &rnet->lrn_routes) {
+ lnet_route_t *re =
+ list_entry(r, lnet_route_t,
+ lr_list);
+ if (skip == 0) {
+ route = re;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ n = n->next;
+ }
+ }
+
+ if (route != NULL) {
+ __u32 net = rnet->lrn_net;
+ unsigned int hops = route->lr_hops;
+ lnet_nid_t nid = route->lr_gateway->lp_nid;
+ int alive = route->lr_gateway->lp_alive;
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-8s %4u %7s %s\n",
+ libcfs_net2str(net), hops,
+ alive ? "up" : "down",
+ libcfs_nid2str(nid));
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ const int tmpsiz = 256;
+ int len;
+ int ver;
+ int off;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+ "ref", "rtr_ref", "alive_cnt", "state",
+ "last_ping", "ping_sent", "deadline",
+ "down_ni", "router");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_routers_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *r;
+ struct lnet_peer *peer = NULL;
+ int skip = off - 1;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ r = the_lnet.ln_routers.next;
+
+ while (r != &the_lnet.ln_routers) {
+ lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+ lp_rtr_list);
+
+ if (skip == 0) {
+ peer = lp;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ cfs_time_t now = cfs_time_current();
+ cfs_time_t deadline = peer->lp_ping_deadline;
+ int nrefs = peer->lp_refcount;
+ int nrtrrefs = peer->lp_rtr_refcount;
+ int alive_cnt = peer->lp_alive_count;
+ int alive = peer->lp_alive;
+ int pingsent = !peer->lp_ping_notsent;
+ int last_ping = cfs_duration_sec(cfs_time_sub(now,
+ peer->lp_ping_timestamp));
+ int down_ni = 0;
+ lnet_route_t *rtr;
+
+ if ((peer->lp_ping_feats &
+ LNET_PING_FEAT_NI_STATUS) != 0) {
+ list_for_each_entry(rtr, &peer->lp_routes,
+ lr_gwlist) {
+ /* downis on any route should be the
+ * number of downis on the gateway */
+ if (rtr->lr_downis != 0) {
+ down_ni = rtr->lr_downis;
+ break;
+ }
+ }
+ }
+
+ if (deadline == 0)
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent, "NA", down_ni,
+ libcfs_nid2str(nid));
+ else
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent,
+ cfs_duration_sec(cfs_time_sub(deadline, now)),
+ down_ni, libcfs_nid2str(nid));
+ LASSERT (tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+ const int tmpsiz = 256;
+ struct lnet_peer_table *ptable;
+ char *tmpstr;
+ char *s;
+ int cpt = LNET_PROC_CPT_GET(*ppos);
+ int ver = LNET_PROC_VER_GET(*ppos);
+ int hash = LNET_PROC_HASH_GET(*ppos);
+ int hoff = LNET_PROC_HOFF_GET(*ppos);
+ int rc = 0;
+ int len;
+
+ CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ if (cpt >= LNET_CPT_NUMBER) {
+ *lenp = 0;
+ return 0;
+ }
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+ "nid", "refs", "state", "last", "max",
+ "rtr", "min", "tx", "min", "queue");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ hoff++;
+ } else {
+ struct lnet_peer *peer;
+ struct list_head *p;
+ int skip;
+ again:
+ p = NULL;
+ peer = NULL;
+ skip = hoff - 1;
+
+ lnet_net_lock(cpt);
+ ptable = the_lnet.ln_peer_tables[cpt];
+ if (hoff == 1)
+ ver = LNET_PROC_VERSION(ptable->pt_version);
+
+ if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+ lnet_net_unlock(cpt);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ while (hash < LNET_PEER_HASH_SIZE) {
+ if (p == NULL)
+ p = ptable->pt_hash[hash].next;
+
+ while (p != &ptable->pt_hash[hash]) {
+ lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+ lp_hashlist);
+ if (skip == 0) {
+ peer = lp;
+
+ /* minor optimization: start from idx+1
+ * on next iteration if we've just
+ * drained lp_hashlist */
+ if (lp->lp_hashlist.next ==
+ &ptable->pt_hash[hash]) {
+ hoff = 1;
+ hash++;
+ } else {
+ hoff++;
+ }
+
+ break;
+ }
+
+ skip--;
+ p = lp->lp_hashlist.next;
+ }
+
+ if (peer != NULL)
+ break;
+
+ p = NULL;
+ hoff = 1;
+ hash++;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ int nrefs = peer->lp_refcount;
+ int lastalive = -1;
+ char *aliveness = "NA";
+ int maxcr = peer->lp_ni->ni_peertxcredits;
+ int txcr = peer->lp_txcredits;
+ int mintxcr = peer->lp_mintxcredits;
+ int rtrcr = peer->lp_rtrcredits;
+ int minrtrcr = peer->lp_minrtrcredits;
+ int txqnob = peer->lp_txqnob;
+
+ if (lnet_isrouter(peer) ||
+ lnet_peer_aliveness_enabled(peer))
+ aliveness = peer->lp_alive ? "up" : "down";
+
+ if (lnet_peer_aliveness_enabled(peer)) {
+ cfs_time_t now = cfs_time_current();
+ cfs_duration_t delta;
+
+ delta = cfs_time_sub(now, peer->lp_last_alive);
+ lastalive = cfs_duration_sec(delta);
+
+ /* No need to mess up peers contents with
+ * arbitrarily long integers - it suffices to
+ * know that lastalive is more than 10000s old
+ */
+ if (lastalive >= 10000)
+ lastalive = 9999;
+ }
+
+ lnet_net_unlock(cpt);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+ libcfs_nid2str(nid), nrefs, aliveness,
+ lastalive, maxcr, rtrcr, minrtrcr, txcr,
+ mintxcr, txqnob);
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ } else { /* peer is NULL */
+ lnet_net_unlock(cpt);
+ }
+
+ if (hash == LNET_PEER_HASH_SIZE) {
+ cpt++;
+ hash = 0;
+ hoff = 1;
+ if (peer == NULL && cpt < LNET_CPT_NUMBER)
+ goto again;
+ }
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ char *s;
+ char *tmpstr;
+ int tmpsiz;
+ int idx;
+ int len;
+ int rc;
+ int i;
+
+ LASSERT(!write);
+
+ /* (4 %d) * 4 * LNET_CPT_NUMBER */
+ tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5s %5s %7s %7s\n",
+ "pages", "count", "credits", "min");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ if (the_lnet.ln_rtrpools == NULL)
+ goto out; /* I'm not a router */
+
+ for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+ lnet_rtrbufpool_t *rbp;
+
+ lnet_net_lock(LNET_LOCK_EX);
+ cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5d %5d %7d %7d\n",
+ rbp[idx].rbp_npages,
+ rbp[idx].rbp_nbuffers,
+ rbp[idx].rbp_credits,
+ rbp[idx].rbp_mincredits);
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ out:
+ len = s - tmpstr;
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, NULL);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+ int tmpsiz = 128 * LNET_CPT_NUMBER;
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ int len;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+ "nid", "status", "alive", "refs", "peer",
+ "rtr", "max", "tx", "min");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+ } else {
+ struct list_head *n;
+ lnet_ni_t *ni = NULL;
+ int skip = *ppos - 1;
+
+ lnet_net_lock(0);
+
+ n = the_lnet.ln_nis.next;
+
+ while (n != &the_lnet.ln_nis) {
+ lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+ if (skip == 0) {
+ ni = a_ni;
+ break;
+ }
+
+ skip--;
+ n = n->next;
+ }
+
+ if (ni != NULL) {
+ struct lnet_tx_queue *tq;
+ char *stat;
+ long now = cfs_time_current_sec();
+ int last_alive = -1;
+ int i;
+ int j;
+
+ if (the_lnet.ln_routing)
+ last_alive = now - ni->ni_last_alive;
+
+ /* @lo forever alive */
+ if (ni->ni_lnd->lnd_type == LOLND)
+ last_alive = 0;
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status != NULL);
+ stat = (ni->ni_status->ns_status ==
+ LNET_NI_STATUS_UP) ? "up" : "down";
+ lnet_ni_unlock(ni);
+
+ /* we actually output credits information for
+ * TX queue of each partition */
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ for (j = 0; ni->ni_cpts != NULL &&
+ j < ni->ni_ncpts; j++) {
+ if (i == ni->ni_cpts[j])
+ break;
+ }
+
+ if (j == ni->ni_ncpts)
+ continue;
+
+ if (i != 0)
+ lnet_net_lock(i);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+ libcfs_nid2str(ni->ni_nid), stat,
+ last_alive, *ni->ni_refs[i],
+ ni->ni_peertxcredits,
+ ni->ni_peerrtrcredits,
+ tq->tq_credits_max,
+ tq->tq_credits, tq->tq_credits_min);
+ if (i != 0)
+ lnet_net_unlock(i);
+ }
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos += 1;
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+struct lnet_portal_rotors {
+ int pr_value;
+ const char *pr_name;
+ const char *pr_desc;
+};
+
+static struct lnet_portal_rotors portal_rotors[] = {
+ {
+ .pr_value = LNET_PTL_ROTOR_OFF,
+ .pr_name = "OFF",
+ .pr_desc = "Turn off message rotor for wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_ON,
+ .pr_name = "ON",
+ .pr_desc = "round-robin dispatch all PUT messages for "
+ "wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_RR_RT,
+ .pr_name = "RR_RT",
+ .pr_desc = "round-robin dispatch routed PUT message for "
+ "wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_HASH_RT,
+ .pr_name = "HASH_RT",
+ .pr_desc = "dispatch routed PUT message by hashing source "
+ "NID for wildcard portals"
+ },
+ {
+ .pr_value = -1,
+ .pr_name = NULL,
+ .pr_desc = NULL
+ },
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ const int buf_len = 128;
+ char *buf;
+ char *tmp;
+ int rc;
+ int i;
+
+ LIBCFS_ALLOC(buf, buf_len);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ if (!write) {
+ lnet_res_lock(0);
+
+ for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+ if (portal_rotors[i].pr_value == portal_rotor)
+ break;
+ }
+
+ LASSERT(portal_rotors[i].pr_value == portal_rotor);
+ lnet_res_unlock(0);
+
+ rc = snprintf(buf, buf_len,
+ "{\n\tportals: all\n"
+ "\trotor: %s\n\tdescription: %s\n}",
+ portal_rotors[i].pr_name,
+ portal_rotors[i].pr_desc);
+
+ if (pos >= min_t(int, rc, buf_len)) {
+ rc = 0;
+ } else {
+ rc = cfs_trace_copyout_string(buffer, nob,
+ buf + pos, "\n");
+ }
+ goto out;
+ }
+
+ rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+ if (rc < 0)
+ goto out;
+
+ tmp = cfs_trimwhite(buf);
+
+ rc = -EINVAL;
+ lnet_res_lock(0);
+ for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+ if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+ strlen(portal_rotors[i].pr_name)) == 0) {
+ portal_rotor = portal_rotors[i].pr_value;
+ rc = 0;
+ break;
+ }
+ }
+ lnet_res_unlock(0);
+out:
+ LIBCFS_FREE(buf, buf_len);
+ return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+static ctl_table_t lnet_table[] = {
+ /*
+ * NB No .strategy entries have been provided since sysctl(8) prefers
+ * to go via /proc for portability.
+ */
+ {
+ INIT_CTL_NAME(PSDEV_LNET_STATS)
+ .procname = "stats",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_stats,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+ .procname = "routes",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routes,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+ .procname = "routers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PEERS)
+ .procname = "peers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_peers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PEERS)
+ .procname = "buffers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_buffers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_NIS)
+ .procname = "nis",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_nis,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+ .procname = "portal_rotor",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_portal_rotor,
+ },
+ {
+ INIT_CTL_NAME(0)
+ }
+};
+
+static ctl_table_t top_table[] = {
+ {
+ INIT_CTL_NAME(CTL_LNET)
+ .procname = "lnet",
+ .mode = 0555,
+ .data = NULL,
+ .maxlen = 0,
+ .child = lnet_table,
+ },
+ {
+ INIT_CTL_NAME(0)
+ }
+};
+
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (lnet_table_header == NULL)
+ lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (lnet_table_header != NULL)
+ unregister_sysctl_table(lnet_table_header);
+
+ lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644
index 000000000000..1e40aeea2962
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+ module.o ping_test.o brw_test.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644
index 000000000000..3bb6fbe23f78
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/brw_test.c
@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+ "# data errors to inject randomly, zero by default");
+
+static void
+brw_client_fini (sfw_test_instance_t *tsi)
+{
+ srpc_bulk_t *bulk;
+ sfw_test_unit_t *tsu;
+
+ LASSERT (tsi->tsi_is_client);
+
+ list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+ bulk = tsu->tsu_private;
+ if (bulk == NULL) continue;
+
+ srpc_free_bulk(bulk);
+ tsu->tsu_private = NULL;
+ }
+}
+
+int
+brw_client_init (sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ int flags;
+ int npg;
+ int len;
+ int opc;
+ srpc_bulk_t *bulk;
+ sfw_test_unit_t *tsu;
+
+ LASSERT(sn != NULL);
+ LASSERT(tsi->tsi_is_client);
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ npg = breq->blk_npg;
+ /* NB: this is not going to work for variable page size,
+ * but we have to keep it for compatibility */
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
+
+ /* I should never get this step if it's unknown feature
+ * because make_session will reject unknown feature */
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ len = breq->blk_len;
+ npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ if (npg > LNET_MAX_IOV || npg <= 0)
+ return -EINVAL;
+
+ if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+ return -EINVAL;
+
+ if (flags != LST_BRW_CHECK_NONE &&
+ flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+ return -EINVAL;
+
+ list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+ bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+ npg, len, opc == LST_BRW_READ);
+ if (bulk == NULL) {
+ brw_client_fini(tsi);
+ return -ENOMEM;
+ }
+
+ tsu->tsu_private = bulk;
+ }
+
+ return 0;
+}
+
+#define BRW_POISON 0xbeefbeefbeefbeefULL
+#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE sizeof(__u64)
+
+int
+brw_inject_one_error (void)
+{
+ struct timeval tv;
+
+ if (brw_inject_errors <= 0) return 0;
+
+ do_gettimeofday(&tv);
+
+ if ((tv.tv_usec & 1) == 0) return 0;
+
+ return brw_inject_errors--;
+}
+
+void
+brw_fill_page (struct page *pg, int pattern, __u64 magic)
+{
+ char *addr = page_address(pg);
+ int i;
+
+ LASSERT (addr != NULL);
+
+ if (pattern == LST_BRW_CHECK_NONE) return;
+
+ if (magic == BRW_MAGIC)
+ magic += brw_inject_one_error();
+
+ if (pattern == LST_BRW_CHECK_SIMPLE) {
+ memcpy(addr, &magic, BRW_MSIZE);
+ addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+ memcpy(addr, &magic, BRW_MSIZE);
+ return;
+ }
+
+ if (pattern == LST_BRW_CHECK_FULL) {
+ for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+ memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
+ return;
+ }
+
+ LBUG ();
+ return;
+}
+
+int
+brw_check_page (struct page *pg, int pattern, __u64 magic)
+{
+ char *addr = page_address(pg);
+ __u64 data = 0; /* make compiler happy */
+ int i;
+
+ LASSERT (addr != NULL);
+
+ if (pattern == LST_BRW_CHECK_NONE)
+ return 0;
+
+ if (pattern == LST_BRW_CHECK_SIMPLE) {
+ data = *((__u64 *) addr);
+ if (data != magic) goto bad_data;
+
+ addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+ data = *((__u64 *) addr);
+ if (data != magic) goto bad_data;
+
+ return 0;
+ }
+
+ if (pattern == LST_BRW_CHECK_FULL) {
+ for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+ data = *(((__u64 *) addr) + i);
+ if (data != magic) goto bad_data;
+ }
+
+ return 0;
+ }
+
+ LBUG ();
+
+bad_data:
+ CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n",
+ pg, data, magic);
+ return 1;
+}
+
+void
+brw_fill_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ brw_fill_page(pg, pattern, magic);
+ }
+}
+
+int
+brw_check_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ if (brw_check_page(pg, pattern, magic) != 0) {
+ CERROR ("Bulk page %p (%d/%d) is corrupted!\n",
+ pg, i, bk->bk_niov);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+brw_client_prep_rpc (sfw_test_unit_t *tsu,
+ lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+ srpc_bulk_t *bulk = tsu->tsu_private;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_client_rpc_t *rpc;
+ srpc_brw_reqst_t *req;
+ int flags;
+ int npg;
+ int len;
+ int opc;
+ int rc;
+
+ LASSERT(sn != NULL);
+ LASSERT(bulk != NULL);
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ npg = breq->blk_npg;
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
+
+ /* I should never get this step if it's unknown feature
+ * because make_session will reject unknown feature */
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ opc = breq->blk_opc;
+ flags = breq->blk_flags;
+ len = breq->blk_len;
+ npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+ if (rc != 0)
+ return rc;
+
+ memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+ if (opc == LST_BRW_WRITE)
+ brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+ else
+ brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+ req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+ req->brw_flags = flags;
+ req->brw_rw = opc;
+ req->brw_len = len;
+
+ *rpcpp = rpc;
+ return 0;
+}
+
+static void
+brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+ __u64 magic = BRW_MAGIC;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_msg_t *msg = &rpc->crpc_replymsg;
+ srpc_brw_reply_t *reply = &msg->msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+ LASSERT (sn != NULL);
+
+ if (rpc->crpc_status != 0) {
+ CERROR ("BRW RPC to %s failed with %d\n",
+ libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+ if (!tsi->tsi_stopping) /* rpc could have been aborted */
+ atomic_inc(&sn->sn_brw_errors);
+ goto out;
+ }
+
+ if (msg->msg_magic != SRPC_MSG_MAGIC) {
+ __swab64s(&magic);
+ __swab32s(&reply->brw_status);
+ }
+
+ CDEBUG (reply->brw_status ? D_WARNING : D_NET,
+ "BRW RPC to %s finished with brw_status: %d\n",
+ libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+ if (reply->brw_status != 0) {
+ atomic_inc(&sn->sn_brw_errors);
+ rpc->crpc_status = -(int)reply->brw_status;
+ goto out;
+ }
+
+ if (reqst->brw_rw == LST_BRW_WRITE) goto out;
+
+ if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+ CERROR ("Bulk data from %s is corrupted!\n",
+ libcfs_id2str(rpc->crpc_dest));
+ atomic_inc(&sn->sn_brw_errors);
+ rpc->crpc_status = -EBADMSG;
+ }
+
+out:
+ return;
+}
+
+void
+brw_server_rpc_done (srpc_server_rpc_t *rpc)
+{
+ srpc_bulk_t *blk = rpc->srpc_bulk;
+
+ if (blk == NULL) return;
+
+ if (rpc->srpc_status != 0)
+ CERROR ("Bulk transfer %s %s has failed: %d\n",
+ blk->bk_sink ? "from" : "to",
+ libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+ else
+ CDEBUG (D_NET, "Transfered %d pages bulk data %s %s\n",
+ blk->bk_niov, blk->bk_sink ? "from" : "to",
+ libcfs_id2str(rpc->srpc_peer));
+
+ sfw_free_pages(rpc);
+}
+
+int
+brw_bulk_ready (srpc_server_rpc_t *rpc, int status)
+{
+ __u64 magic = BRW_MAGIC;
+ srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst;
+ srpc_msg_t *reqstmsg;
+
+ LASSERT (rpc->srpc_bulk != NULL);
+ LASSERT (rpc->srpc_reqstbuf != NULL);
+
+ reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ reqst = &reqstmsg->msg_body.brw_reqst;
+
+ if (status != 0) {
+ CERROR ("BRW bulk %s failed for RPC from %s: %d\n",
+ reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+ libcfs_id2str(rpc->srpc_peer), status);
+ return -EIO;
+ }
+
+ if (reqst->brw_rw == LST_BRW_READ)
+ return 0;
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC)
+ __swab64s(&magic);
+
+ if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) {
+ CERROR ("Bulk data from %s is corrupted!\n",
+ libcfs_id2str(rpc->srpc_peer));
+ reply->brw_status = EBADMSG;
+ }
+
+ return 0;
+}
+
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *replymsg = &rpc->srpc_replymsg;
+ srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+ srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+ int npg;
+ int rc;
+
+ LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+ LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ __swab32s(&reqst->brw_rw);
+ __swab32s(&reqst->brw_len);
+ __swab32s(&reqst->brw_flags);
+ __swab64s(&reqst->brw_rpyid);
+ __swab64s(&reqst->brw_bulkid);
+ }
+ LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+ reply->brw_status = 0;
+ rpc->srpc_done = brw_server_rpc_done;
+
+ if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+ (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+ reqst->brw_flags != LST_BRW_CHECK_FULL &&
+ reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+
+ if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ replymsg->msg_ses_feats = LST_FEATS_MASK;
+ reply->brw_status = EPROTO;
+ return 0;
+ }
+
+ if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+ /* compat with old version */
+ if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+ npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+ } else {
+ npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ }
+
+ replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+ if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+ reply->brw_status = EINVAL;
+ return 0;
+ }
+
+ rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+ reqst->brw_len,
+ reqst->brw_rw == LST_BRW_WRITE);
+ if (rc != 0)
+ return rc;
+
+ if (reqst->brw_rw == LST_BRW_READ)
+ brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+ else
+ brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+ return 0;
+}
+
+sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void)
+{
+ brw_test_client.tso_init = brw_client_init;
+ brw_test_client.tso_fini = brw_client_fini;
+ brw_test_client.tso_prep_rpc = brw_client_prep_rpc;
+ brw_test_client.tso_done_rpc = brw_client_done_rpc;
+};
+
+srpc_service_t brw_test_service;
+void brw_init_test_service(void)
+{
+
+ brw_test_service.sv_id = SRPC_SERVICE_BRW;
+ brw_test_service.sv_name = "brw_test";
+ brw_test_service.sv_handler = brw_server_handle;
+ brw_test_service.sv_bulk_ready = brw_bulk_ready;
+ brw_test_service.sv_wi_total = brw_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644
index 000000000000..bce3d3bde6b2
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conctl.c
@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_ses_idp == NULL || /* address for output sid */
+ args->lstio_ses_key == 0 || /* no key is specified */
+ args->lstio_ses_namep == NULL || /* session name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_ses_namep,
+ args->lstio_ses_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_ses_nmlen] = 0;
+
+ rc = lstcon_session_new(name,
+ args->lstio_ses_key,
+ args->lstio_ses_feats,
+ args->lstio_ses_force,
+ args->lstio_ses_timeout,
+ args->lstio_ses_idp);
+
+ LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+ return rc;
+}
+
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+ if (args->lstio_ses_key != console_session.ses_key)
+ return -EACCES;
+
+ return lstcon_session_end();
+}
+
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+ /* no checking of key */
+
+ if (args->lstio_ses_idp == NULL || /* address for ouput sid */
+ args->lstio_ses_keyp == NULL || /* address for ouput key */
+ args->lstio_ses_featp == NULL || /* address for ouput features */
+ args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+ args->lstio_ses_namep == NULL || /* address for ouput name */
+ args->lstio_ses_nmlen <= 0 ||
+ args->lstio_ses_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_session_info(args->lstio_ses_idp,
+ args->lstio_ses_keyp,
+ args->lstio_ses_featp,
+ args->lstio_ses_ndinfo,
+ args->lstio_ses_namep,
+ args->lstio_ses_nmlen);
+}
+
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+ char *name = NULL;
+ int client = 1;
+ int rc;
+
+ if (args->lstio_dbg_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_dbg_resultp == NULL)
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+ (args->lstio_dbg_nmlen <= 0 ||
+ args->lstio_dbg_nmlen > LST_NAME_SIZE))
+ return -EINVAL;
+
+ if (args->lstio_dbg_namep != NULL) {
+ LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_dbg_namep,
+ args->lstio_dbg_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return -EFAULT;
+ }
+
+ name[args->lstio_dbg_nmlen] = 0;
+ }
+
+ rc = -EINVAL;
+
+ switch (args->lstio_dbg_type) {
+ case LST_OPC_SESSION:
+ rc = lstcon_session_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_BATCHSRV:
+ client = 0;
+ case LST_OPC_BATCHCLI:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+ name, client, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_GROUP:
+ if (name == NULL)
+ goto out;
+
+ rc = lstcon_group_debug(args->lstio_dbg_timeout,
+ name, args->lstio_dbg_resultp);
+ break;
+
+ case LST_OPC_NODES:
+ if (args->lstio_dbg_count <= 0 ||
+ args->lstio_dbg_idsp == NULL)
+ goto out;
+
+ rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+ args->lstio_dbg_count,
+ args->lstio_dbg_idsp,
+ args->lstio_dbg_resultp);
+ break;
+
+ default:
+ break;
+ }
+
+out:
+ if (name != NULL)
+ LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_add(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_del(name);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_resultp == NULL ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ switch (args->lstio_grp_opc) {
+ case LST_GROUP_CLEAN:
+ rc = lstcon_group_clean(name, args->lstio_grp_args);
+ break;
+
+ case LST_GROUP_REFRESH:
+ rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+ break;
+
+ case LST_GROUP_RMND:
+ if (args->lstio_grp_count <= 0 ||
+ args->lstio_grp_idsp == NULL) {
+ rc = -EINVAL;
+ break;
+ }
+ rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+ args->lstio_grp_idsp,
+ args->lstio_grp_resultp);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+ unsigned feats;
+ int rc;
+ char *name;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_idsp == NULL || /* array of ids */
+ args->lstio_grp_count <= 0 ||
+ args->lstio_grp_resultp == NULL ||
+ args->lstio_grp_featp == NULL ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_nodes_add(name, args->lstio_grp_count,
+ args->lstio_grp_idsp, &feats,
+ args->lstio_grp_resultp);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ if (rc == 0 &&
+ copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_idx < 0 ||
+ args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_group_list(args->lstio_grp_idx,
+ args->lstio_grp_nmlen,
+ args->lstio_grp_namep);
+}
+
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+ char *name;
+ int ndent;
+ int index;
+ int rc;
+
+ if (args->lstio_grp_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_grp_namep == NULL ||
+ args->lstio_grp_nmlen <= 0 ||
+ args->lstio_grp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_grp_entp == NULL && /* output: group entry */
+ args->lstio_grp_dentsp == NULL) /* output: node entry */
+ return -EINVAL;
+
+ if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+ if (args->lstio_grp_idxp == NULL || /* node index */
+ args->lstio_grp_ndentp == NULL) /* # of node entry */
+ return -EINVAL;
+
+ if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+ sizeof(ndent)) ||
+ copy_from_user(&index, args->lstio_grp_idxp,
+ sizeof(index)))
+ return -EFAULT;
+
+ if (ndent <= 0 || index < 0)
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_grp_namep,
+ args->lstio_grp_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_grp_nmlen] = 0;
+
+ rc = lstcon_group_info(name, args->lstio_grp_entp,
+ &index, &ndent, args->lstio_grp_dentsp);
+
+ LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+ if (rc != 0)
+ return rc;
+
+ if (args->lstio_grp_dentsp != NULL &&
+ (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+ copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+ rc = -EFAULT;
+
+ return 0;
+}
+
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_add(name);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_run(name, args->lstio_bat_timeout,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+ int rc;
+ char *name;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_resultp == NULL ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_stop(name, args->lstio_bat_force,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+ char *name;
+ int rc;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_resultp == NULL ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_bat_testidx < 0)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep,
+ args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_test_batch_query(name,
+ args->lstio_bat_testidx,
+ args->lstio_bat_client,
+ args->lstio_bat_timeout,
+ args->lstio_bat_resultp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ return rc;
+}
+
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_idx < 0 ||
+ args->lstio_bat_namep == NULL ||
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ return lstcon_batch_list(args->lstio_bat_idx,
+ args->lstio_bat_nmlen,
+ args->lstio_bat_namep);
+}
+
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+ char *name;
+ int rc;
+ int index;
+ int ndent;
+
+ if (args->lstio_bat_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_bat_namep == NULL || /* batch name */
+ args->lstio_bat_nmlen <= 0 ||
+ args->lstio_bat_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_bat_entp == NULL && /* output: batch entry */
+ args->lstio_bat_dentsp == NULL) /* output: node entry */
+ return -EINVAL;
+
+ if (args->lstio_bat_dentsp != NULL) { /* have node entry */
+ if (args->lstio_bat_idxp == NULL || /* node index */
+ args->lstio_bat_ndentp == NULL) /* # of node entry */
+ return -EINVAL;
+
+ if (copy_from_user(&index, args->lstio_bat_idxp,
+ sizeof(index)) ||
+ copy_from_user(&ndent, args->lstio_bat_ndentp,
+ sizeof(ndent)))
+ return -EFAULT;
+
+ if (ndent <= 0 || index < 0)
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name,
+ args->lstio_bat_namep, args->lstio_bat_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+ return -EFAULT;
+ }
+
+ name[args->lstio_bat_nmlen] = 0;
+
+ rc = lstcon_batch_info(name,
+ args->lstio_bat_entp, args->lstio_bat_server,
+ args->lstio_bat_testidx, &index, &ndent,
+ args->lstio_bat_dentsp);
+
+ LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+ if (rc != 0)
+ return rc;
+
+ if (args->lstio_bat_dentsp != NULL &&
+ (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+ copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent))))
+ rc = -EFAULT;
+
+ return rc;
+}
+
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+ int rc;
+ char *name;
+
+ /* TODO: not finished */
+ if (args->lstio_sta_key != console_session.ses_key)
+ return -EACCES;
+
+ if (args->lstio_sta_resultp == NULL ||
+ (args->lstio_sta_namep == NULL &&
+ args->lstio_sta_idsp == NULL) ||
+ args->lstio_sta_nmlen <= 0 ||
+ args->lstio_sta_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_sta_idsp != NULL &&
+ args->lstio_sta_count <= 0)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+ if (name == NULL)
+ return -ENOMEM;
+
+ if (copy_from_user(name, args->lstio_sta_namep,
+ args->lstio_sta_nmlen)) {
+ LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+ return -EFAULT;
+ }
+
+ if (args->lstio_sta_idsp == NULL) {
+ rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+ args->lstio_sta_resultp);
+ } else {
+ rc = lstcon_nodes_stat(args->lstio_sta_count,
+ args->lstio_sta_idsp,
+ args->lstio_sta_timeout,
+ args->lstio_sta_resultp);
+ }
+
+ LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+ return rc;
+}
+
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+ char *name;
+ char *srcgrp = NULL;
+ char *dstgrp = NULL;
+ void *param = NULL;
+ int ret = 0;
+ int rc = -ENOMEM;
+
+ if (args->lstio_tes_resultp == NULL ||
+ args->lstio_tes_retp == NULL ||
+ args->lstio_tes_bat_name == NULL || /* no specified batch */
+ args->lstio_tes_bat_nmlen <= 0 ||
+ args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+ args->lstio_tes_sgrp_name == NULL || /* no source group */
+ args->lstio_tes_sgrp_nmlen <= 0 ||
+ args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+ args->lstio_tes_dgrp_name == NULL || /* no target group */
+ args->lstio_tes_dgrp_nmlen <= 0 ||
+ args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+ return -EINVAL;
+
+ if (args->lstio_tes_loop == 0 || /* negative is infinite */
+ args->lstio_tes_concur <= 0 ||
+ args->lstio_tes_dist <= 0 ||
+ args->lstio_tes_span <= 0)
+ return -EINVAL;
+
+ /* have parameter, check if parameter length is valid */
+ if (args->lstio_tes_param != NULL &&
+ (args->lstio_tes_param_len <= 0 ||
+ args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+ return -EINVAL;
+
+ LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+ if (name == NULL)
+ return rc;
+
+ LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+ if (srcgrp == NULL)
+ goto out;
+
+ LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+ if (dstgrp == NULL)
+ goto out;
+
+ if (args->lstio_tes_param != NULL) {
+ LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+ if (param == NULL)
+ goto out;
+ }
+
+ rc = -EFAULT;
+ if (copy_from_user(name,
+ args->lstio_tes_bat_name,
+ args->lstio_tes_bat_nmlen) ||
+ copy_from_user(srcgrp,
+ args->lstio_tes_sgrp_name,
+ args->lstio_tes_sgrp_nmlen) ||
+ copy_from_user(dstgrp,
+ args->lstio_tes_dgrp_name,
+ args->lstio_tes_dgrp_nmlen) ||
+ copy_from_user(param, args->lstio_tes_param,
+ args->lstio_tes_param_len))
+ goto out;
+
+ rc = lstcon_test_add(name,
+ args->lstio_tes_type,
+ args->lstio_tes_loop,
+ args->lstio_tes_concur,
+ args->lstio_tes_dist, args->lstio_tes_span,
+ srcgrp, dstgrp, param, args->lstio_tes_param_len,
+ &ret, args->lstio_tes_resultp);
+
+ if (ret != 0)
+ rc = (copy_to_user(args->lstio_tes_retp, &ret,
+ sizeof(ret))) ? -EFAULT : 0;
+out:
+ if (name != NULL)
+ LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+ if (srcgrp != NULL)
+ LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+ if (dstgrp != NULL)
+ LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+ if (param != NULL)
+ LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+ return rc;
+}
+
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+ char *buf;
+ int opc = data->ioc_u32[0];
+ int rc;
+
+ if (cmd != IOC_LIBCFS_LNETST)
+ return -EINVAL;
+
+ if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+ return -EINVAL;
+
+ LIBCFS_ALLOC(buf, data->ioc_plen1);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* copy in parameter */
+ if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) {
+ LIBCFS_FREE(buf, data->ioc_plen1);
+ return -EFAULT;
+ }
+
+ mutex_lock(&console_session.ses_mutex);
+
+ console_session.ses_laststamp = cfs_time_current_sec();
+
+ if (console_session.ses_shutdown) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
+
+ if (console_session.ses_expired)
+ lstcon_session_end();
+
+ if (opc != LSTIO_SESSION_NEW &&
+ console_session.ses_state == LST_SESSION_NONE) {
+ CDEBUG(D_NET, "LST no active session\n");
+ rc = -ESRCH;
+ goto out;
+ }
+
+ memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+ switch (opc) {
+ case LSTIO_SESSION_NEW:
+ rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf);
+ break;
+ case LSTIO_SESSION_END:
+ rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf);
+ break;
+ case LSTIO_SESSION_INFO:
+ rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf);
+ break;
+ case LSTIO_DEBUG:
+ rc = lst_debug_ioctl((lstio_debug_args_t *)buf);
+ break;
+ case LSTIO_GROUP_ADD:
+ rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf);
+ break;
+ case LSTIO_GROUP_DEL:
+ rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf);
+ break;
+ case LSTIO_GROUP_UPDATE:
+ rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf);
+ break;
+ case LSTIO_NODES_ADD:
+ rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf);
+ break;
+ case LSTIO_GROUP_LIST:
+ rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf);
+ break;
+ case LSTIO_GROUP_INFO:
+ rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf);
+ break;
+ case LSTIO_BATCH_ADD:
+ rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf);
+ break;
+ case LSTIO_BATCH_START:
+ rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf);
+ break;
+ case LSTIO_BATCH_STOP:
+ rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf);
+ break;
+ case LSTIO_BATCH_QUERY:
+ rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf);
+ break;
+ case LSTIO_BATCH_LIST:
+ rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf);
+ break;
+ case LSTIO_BATCH_INFO:
+ rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf);
+ break;
+ case LSTIO_TEST_ADD:
+ rc = lst_test_add_ioctl((lstio_test_args_t *)buf);
+ break;
+ case LSTIO_STAT_QUERY:
+ rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf);
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+ sizeof(lstcon_trans_stat_t)))
+ rc = -EFAULT;
+out:
+ mutex_unlock(&console_session.ses_mutex);
+
+ LIBCFS_FREE(buf, data->ioc_plen1);
+
+ return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644
index 000000000000..446de0e4672f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.c
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+ lstcon_node_t *, lstcon_trans_stat_t *);
+
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+ lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+ LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+ LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (crpc->crp_trans == NULL) {
+ /* Orphan RPC is not in any transaction,
+ * I'm just a poor body and nobody loves me */
+ spin_unlock(&rpc->crpc_lock);
+
+ /* release it */
+ lstcon_rpc_put(crpc);
+ return;
+ }
+
+ /* not an orphan RPC */
+ crpc->crp_finished = 1;
+
+ if (crpc->crp_stamp == 0) {
+ /* not aborted */
+ LASSERT (crpc->crp_status == 0);
+
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = rpc->crpc_status;
+ }
+
+ /* wakeup (transaction)thread if I'm the last RPC in the transaction */
+ if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+ wake_up(&crpc->crp_trans->tas_waitq);
+
+ spin_unlock(&rpc->crpc_lock);
+}
+
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+ int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+ crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+ feats, bulk_npg, bulk_len,
+ lstcon_rpc_done, (void *)crpc);
+ if (crpc->crp_rpc == NULL)
+ return -ENOMEM;
+
+ crpc->crp_trans = NULL;
+ crpc->crp_node = nd;
+ crpc->crp_posted = 0;
+ crpc->crp_finished = 0;
+ crpc->crp_unpacked = 0;
+ crpc->crp_status = 0;
+ crpc->crp_stamp = 0;
+ crpc->crp_embedded = embedded;
+ INIT_LIST_HEAD(&crpc->crp_link);
+
+ atomic_inc(&console_session.ses_rpc_counter);
+
+ return 0;
+}
+
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+ int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+ lstcon_rpc_t *crpc = NULL;
+ int rc;
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ if (!list_empty(&console_session.ses_rpc_freelist)) {
+ crpc = list_entry(console_session.ses_rpc_freelist.next,
+ lstcon_rpc_t, crp_link);
+ list_del_init(&crpc->crp_link);
+ }
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ if (crpc == NULL) {
+ LIBCFS_ALLOC(crpc, sizeof(*crpc));
+ if (crpc == NULL)
+ return -ENOMEM;
+ }
+
+ rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+ if (rc == 0) {
+ *crpcpp = crpc;
+ return 0;
+ }
+
+ LIBCFS_FREE(crpc, sizeof(*crpc));
+
+ return rc;
+}
+
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+ srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+ int i;
+
+ LASSERT (list_empty(&crpc->crp_link));
+
+ for (i = 0; i < bulk->bk_niov; i++) {
+ if (bulk->bk_iovs[i].kiov_page == NULL)
+ continue;
+
+ __free_page(bulk->bk_iovs[i].kiov_page);
+ }
+
+ srpc_client_rpc_decref(crpc->crp_rpc);
+
+ if (crpc->crp_embedded) {
+ /* embedded RPC, don't recycle it */
+ memset(crpc, 0, sizeof(*crpc));
+ crpc->crp_embedded = 1;
+
+ } else {
+ spin_lock(&console_session.ses_rpc_lock);
+
+ list_add(&crpc->crp_link,
+ &console_session.ses_rpc_freelist);
+
+ spin_unlock(&console_session.ses_rpc_lock);
+ }
+
+ /* RPC is not alive now */
+ atomic_dec(&console_session.ses_rpc_counter);
+}
+
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+ lstcon_rpc_trans_t *trans = crpc->crp_trans;
+
+ LASSERT (trans != NULL);
+
+ atomic_inc(&trans->tas_remaining);
+ crpc->crp_posted = 1;
+
+ sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+ if (transop == LST_TRANS_SESNEW)
+ return "SESNEW";
+
+ if (transop == LST_TRANS_SESEND)
+ return "SESEND";
+
+ if (transop == LST_TRANS_SESQRY)
+ return "SESQRY";
+
+ if (transop == LST_TRANS_SESPING)
+ return "SESPING";
+
+ if (transop == LST_TRANS_TSBCLIADD)
+ return "TSBCLIADD";
+
+ if (transop == LST_TRANS_TSBSRVADD)
+ return "TSBSRVADD";
+
+ if (transop == LST_TRANS_TSBRUN)
+ return "TSBRUN";
+
+ if (transop == LST_TRANS_TSBSTOP)
+ return "TSBSTOP";
+
+ if (transop == LST_TRANS_TSBCLIQRY)
+ return "TSBCLIQRY";
+
+ if (transop == LST_TRANS_TSBSRVQRY)
+ return "TSBSRVQRY";
+
+ if (transop == LST_TRANS_STATQRY)
+ return "STATQRY";
+
+ return "Unknown";
+}
+
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+ int transop, lstcon_rpc_trans_t **transpp)
+{
+ lstcon_rpc_trans_t *trans;
+
+ if (translist != NULL) {
+ list_for_each_entry(trans, translist, tas_link) {
+ /* Can't enqueue two private transaction on
+ * the same object */
+ if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+ return -EPERM;
+ }
+ }
+
+ /* create a trans group */
+ LIBCFS_ALLOC(trans, sizeof(*trans));
+ if (trans == NULL)
+ return -ENOMEM;
+
+ trans->tas_opc = transop;
+
+ if (translist == NULL)
+ INIT_LIST_HEAD(&trans->tas_olink);
+ else
+ list_add_tail(&trans->tas_olink, translist);
+
+ list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+ INIT_LIST_HEAD(&trans->tas_rpcs_list);
+ atomic_set(&trans->tas_remaining, 0);
+ init_waitqueue_head(&trans->tas_waitq);
+
+ spin_lock(&console_session.ses_rpc_lock);
+ trans->tas_features = console_session.ses_features;
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ *transpp = trans;
+ return 0;
+}
+
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+ list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+ crpc->crp_trans = trans;
+}
+
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+ srpc_client_rpc_t *rpc;
+ lstcon_rpc_t *crpc;
+ lstcon_node_t *nd;
+
+ list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+ rpc = crpc->crp_rpc;
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (!crpc->crp_posted || /* not posted */
+ crpc->crp_stamp != 0) { /* rpc done or aborted already */
+ if (crpc->crp_stamp == 0) {
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = -EINTR;
+ }
+ spin_unlock(&rpc->crpc_lock);
+ continue;
+ }
+
+ crpc->crp_stamp = cfs_time_current();
+ crpc->crp_status = error;
+
+ spin_unlock(&rpc->crpc_lock);
+
+ sfw_abort_rpc(rpc);
+
+ if (error != ETIMEDOUT)
+ continue;
+
+ nd = crpc->crp_node;
+ if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+ continue;
+
+ nd->nd_stamp = crpc->crp_stamp;
+ nd->nd_state = LST_NODE_DOWN;
+ }
+}
+
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+ if (console_session.ses_shutdown &&
+ !list_empty(&trans->tas_olink)) /* Not an end session RPC */
+ return 1;
+
+ return (atomic_read(&trans->tas_remaining) == 0) ? 1: 0;
+}
+
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+ lstcon_rpc_t *crpc;
+ int rc;
+
+ if (list_empty(&trans->tas_rpcs_list))
+ return 0;
+
+ if (timeout < LST_TRANS_MIN_TIMEOUT)
+ timeout = LST_TRANS_MIN_TIMEOUT;
+
+ CDEBUG(D_NET, "Transaction %s started\n",
+ lstcon_rpc_trans_name(trans->tas_opc));
+
+ /* post all requests */
+ list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+ LASSERT (!crpc->crp_posted);
+
+ lstcon_rpc_post(crpc);
+ }
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ rc = wait_event_interruptible_timeout(trans->tas_waitq,
+ lstcon_rpc_trans_check(trans),
+ cfs_time_seconds(timeout));
+ rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ if (console_session.ses_shutdown)
+ rc = -ESHUTDOWN;
+
+ if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+ /* treat short timeout as canceled */
+ if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+ rc = -EINTR;
+
+ lstcon_rpc_trans_abort(trans, rc);
+ }
+
+ CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+ lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+ lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+ return rc;
+}
+
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+ lstcon_node_t *nd = crpc->crp_node;
+ srpc_client_rpc_t *rpc = crpc->crp_rpc;
+ srpc_generic_reply_t *rep;
+
+ LASSERT (nd != NULL && rpc != NULL);
+ LASSERT (crpc->crp_stamp != 0);
+
+ if (crpc->crp_status != 0) {
+ *msgpp = NULL;
+ return crpc->crp_status;
+ }
+
+ *msgpp = &rpc->crpc_replymsg;
+ if (!crpc->crp_unpacked) {
+ sfw_unpack_message(*msgpp);
+ crpc->crp_unpacked = 1;
+ }
+
+ if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+ return 0;
+
+ nd->nd_stamp = crpc->crp_stamp;
+ rep = &(*msgpp)->msg_body.reply;
+
+ if (rep->sid.ses_nid == LNET_NID_ANY)
+ nd->nd_state = LST_NODE_UNKNOWN;
+ else if (lstcon_session_match(rep->sid))
+ nd->nd_state = LST_NODE_ACTIVE;
+ else
+ nd->nd_state = LST_NODE_BUSY;
+
+ return 0;
+}
+
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *rep;
+ int error;
+
+ LASSERT (stat != NULL);
+
+ memset(stat, 0, sizeof(*stat));
+
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ lstcon_rpc_stat_total(stat, 1);
+
+ LASSERT (crpc->crp_stamp != 0);
+
+ error = lstcon_rpc_get_reply(crpc, &rep);
+ if (error != 0) {
+ lstcon_rpc_stat_failure(stat, 1);
+ if (stat->trs_rpc_errno == 0)
+ stat->trs_rpc_errno = -error;
+
+ continue;
+ }
+
+ lstcon_rpc_stat_success(stat, 1);
+
+ lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+ }
+
+ if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+ stat->trs_fwk_errno =
+ lstcon_session_feats_check(trans->tas_features);
+ }
+
+ CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+ "RPC error(%d), Framework error(%d)\n",
+ lstcon_rpc_trans_name(trans->tas_opc),
+ lstcon_rpc_stat_success(stat, 0),
+ lstcon_rpc_stat_failure(stat, 0),
+ lstcon_rpc_stat_total(stat, 0),
+ stat->trs_rpc_errno, stat->trs_fwk_errno);
+
+ return;
+}
+
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+ struct list_head *head_up,
+ lstcon_rpc_readent_func_t readent)
+{
+ struct list_head tmp;
+ struct list_head *next;
+ lstcon_rpc_ent_t *ent;
+ srpc_generic_reply_t *rep;
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *msg;
+ lstcon_node_t *nd;
+ cfs_duration_t dur;
+ struct timeval tv;
+ int error;
+
+ LASSERT (head_up != NULL);
+
+ next = head_up;
+
+ list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+ if (copy_from_user(&tmp, next,
+ sizeof(struct list_head)))
+ return -EFAULT;
+
+ if (tmp.next == head_up)
+ return 0;
+
+ next = tmp.next;
+
+ ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+ LASSERT (crpc->crp_stamp != 0);
+
+ error = lstcon_rpc_get_reply(crpc, &msg);
+
+ nd = crpc->crp_node;
+
+ dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+ (cfs_time_t)console_session.ses_id.ses_stamp);
+ cfs_duration_usec(dur, &tv);
+
+ if (copy_to_user(&ent->rpe_peer,
+ &nd->nd_id, sizeof(lnet_process_id_t)) ||
+ copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+ copy_to_user(&ent->rpe_state,
+ &nd->nd_state, sizeof(nd->nd_state)) ||
+ copy_to_user(&ent->rpe_rpc_errno, &error,
+ sizeof(error)))
+ return -EFAULT;
+
+ if (error != 0)
+ continue;
+
+ /* RPC is done */
+ rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+ if (copy_to_user(&ent->rpe_sid,
+ &rep->sid, sizeof(lst_sid_t)) ||
+ copy_to_user(&ent->rpe_fwk_errno,
+ &rep->status, sizeof(rep->status)))
+ return -EFAULT;
+
+ if (readent == NULL)
+ continue;
+
+ if ((error = readent(trans->tas_opc, msg, ent)) != 0)
+ return error;
+ }
+
+ return 0;
+}
+
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+ srpc_client_rpc_t *rpc;
+ lstcon_rpc_t *crpc;
+ lstcon_rpc_t *tmp;
+ int count = 0;
+
+ list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+ crp_link) {
+ rpc = crpc->crp_rpc;
+
+ spin_lock(&rpc->crpc_lock);
+
+ /* free it if not posted or finished already */
+ if (!crpc->crp_posted || crpc->crp_finished) {
+ spin_unlock(&rpc->crpc_lock);
+
+ list_del_init(&crpc->crp_link);
+ lstcon_rpc_put(crpc);
+
+ continue;
+ }
+
+ /* rpcs can be still not callbacked (even LNetMDUnlink is called)
+ * because huge timeout for inaccessible network, don't make
+ * user wait for them, just abandon them, they will be recycled
+ * in callback */
+
+ LASSERT (crpc->crp_status != 0);
+
+ crpc->crp_node = NULL;
+ crpc->crp_trans = NULL;
+ list_del_init(&crpc->crp_link);
+ count ++;
+
+ spin_unlock(&rpc->crpc_lock);
+
+ atomic_dec(&trans->tas_remaining);
+ }
+
+ LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+ list_del(&trans->tas_link);
+ if (!list_empty(&trans->tas_olink))
+ list_del(&trans->tas_olink);
+
+ CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+ lstcon_rpc_trans_name(trans->tas_opc), count);
+
+ LIBCFS_FREE(trans, sizeof(*trans));
+
+ return;
+}
+
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+ unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_mksn_reqst_t *msrq;
+ srpc_rmsn_reqst_t *rsrq;
+ int rc;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+ feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+ msrq->mksn_sid = console_session.ses_id;
+ msrq->mksn_force = console_session.ses_force;
+ strncpy(msrq->mksn_name, console_session.ses_name,
+ strlen(console_session.ses_name));
+ break;
+
+ case LST_TRANS_SESEND:
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+ feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+ rsrq->rmsn_sid = console_session.ses_id;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ return 0;
+}
+
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_debug_reqst_t *drq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+ drq->dbg_sid = console_session.ses_id;
+ drq->dbg_flags = 0;
+
+ return rc;
+}
+
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+ lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+ lstcon_batch_t *batch;
+ srpc_batch_reqst_t *brq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+ brq->bar_sid = console_session.ses_id;
+ brq->bar_bid = tsb->tsb_id;
+ brq->bar_testidx = tsb->tsb_index;
+ brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN :
+ (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP:
+ SRPC_BATCH_OPC_QUERY);
+
+ if (transop != LST_TRANS_TSBRUN &&
+ transop != LST_TRANS_TSBSTOP)
+ return 0;
+
+ LASSERT (tsb->tsb_index == 0);
+
+ batch = (lstcon_batch_t *)tsb;
+ brq->bar_arg = batch->bat_arg;
+
+ return 0;
+}
+
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+ srpc_stat_reqst_t *srq;
+ int rc;
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+ if (rc != 0)
+ return rc;
+
+ srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+
+ srq->str_sid = console_session.ses_id;
+ srq->str_type = 0; /* XXX remove it */
+
+ return 0;
+}
+
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+ lnet_process_id_packed_t *pid;
+ int i;
+
+ i = idx / SFW_ID_PER_PAGE;
+
+ LASSERT (i < nkiov);
+
+ pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+ return &pid[idx % SFW_ID_PER_PAGE];
+}
+
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+ int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+ lnet_process_id_packed_t *pid;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int start;
+ int end;
+ int i = 0;
+
+ LASSERT (dist >= 1);
+ LASSERT (span >= 1);
+ LASSERT (grp->grp_nnode >= 1);
+
+ if (span > grp->grp_nnode)
+ return -EINVAL;
+
+ start = ((idx / dist) * span) % grp->grp_nnode;
+ end = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+ nd = ndl->ndl_node;
+ if (i < start) {
+ i ++;
+ continue;
+ }
+
+ if (i > (end >= start ? end: grp->grp_nnode))
+ break;
+
+ pid = lstcon_next_id((i - start), nkiov, kiov);
+ pid->nid = nd->nd_id.nid;
+ pid->pid = nd->nd_id.pid;
+ i++;
+ }
+
+ if (start <= end) /* done */
+ return 0;
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+ if (i > grp->grp_nnode + end)
+ break;
+
+ nd = ndl->ndl_node;
+ pid = lstcon_next_id((i - start), nkiov, kiov);
+ pid->nid = nd->nd_id.nid;
+ pid->pid = nd->nd_id.pid;
+ i++;
+ }
+
+ return 0;
+}
+
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+ test_ping_req_t *prq = &req->tsr_u.ping;
+
+ prq->png_size = param->png_size;
+ prq->png_flags = param->png_flags;
+ /* TODO dest */
+ return 0;
+}
+
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+ test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
+
+ brq->blk_opc = param->blk_opc;
+ brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+ brq->blk_flags = param->blk_flags;
+
+ return 0;
+}
+
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+ test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1;
+
+ brq->blk_opc = param->blk_opc;
+ brq->blk_flags = param->blk_flags;
+ brq->blk_len = param->blk_size;
+ brq->blk_offset = 0; /* reserved */
+
+ return 0;
+}
+
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+ lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+ lstcon_group_t *sgrp = test->tes_src_grp;
+ lstcon_group_t *dgrp = test->tes_dst_grp;
+ srpc_test_reqst_t *trq;
+ srpc_bulk_t *bulk;
+ int i;
+ int npg = 0;
+ int nob = 0;
+ int rc = 0;
+
+ if (transop == LST_TRANS_TSBCLIADD) {
+ npg = sfw_id_pages(test->tes_span);
+ nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+ npg * PAGE_CACHE_SIZE :
+ sizeof(lnet_process_id_packed_t) * test->tes_span;
+ }
+
+ rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+ if (rc != 0)
+ return rc;
+
+ trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+ if (transop == LST_TRANS_TSBSRVADD) {
+ int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+ int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+ int nmax = (ndist + nspan - 1) / nspan;
+
+ trq->tsr_ndest = 0;
+ trq->tsr_loop = nmax * test->tes_dist * test->tes_concur;
+
+ } else {
+ bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+ for (i = 0; i < npg; i++) {
+ int len;
+
+ LASSERT(nob > 0);
+
+ len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+ PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+ nob -= len;
+
+ bulk->bk_iovs[i].kiov_offset = 0;
+ bulk->bk_iovs[i].kiov_len = len;
+ bulk->bk_iovs[i].kiov_page =
+ alloc_page(GFP_IOFS);
+
+ if (bulk->bk_iovs[i].kiov_page == NULL) {
+ lstcon_rpc_put(*crpc);
+ return -ENOMEM;
+ }
+ }
+
+ bulk->bk_sink = 0;
+
+ LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+ rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+ test->tes_cliidx++,
+ test->tes_dist,
+ test->tes_span,
+ npg, &bulk->bk_iovs[0]);
+ if (rc != 0) {
+ lstcon_rpc_put(*crpc);
+ return rc;
+ }
+
+ trq->tsr_ndest = test->tes_span;
+ trq->tsr_loop = test->tes_loop;
+ }
+
+ trq->tsr_sid = console_session.ses_id;
+ trq->tsr_bid = test->tes_hdr.tsb_id;
+ trq->tsr_concur = test->tes_concur;
+ trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+ trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+ switch (test->tes_type) {
+ case LST_TEST_PING:
+ trq->tsr_service = SRPC_SERVICE_PING;
+ rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+ &test->tes_param[0], trq);
+ break;
+
+ case LST_TEST_BULK:
+ trq->tsr_service = SRPC_SERVICE_BRW;
+ if ((feats & LST_FEAT_BULK_LEN) == 0) {
+ rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+ &test->tes_param[0], trq);
+ } else {
+ rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+ &test->tes_param[0], trq);
+ }
+
+ break;
+ default:
+ LBUG();
+ break;
+ }
+
+ return rc;
+}
+
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+ lstcon_node_t *nd, srpc_msg_t *reply)
+{
+ srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+ int status = mksn_rep->mksn_status;
+
+ if (status == 0 &&
+ (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ mksn_rep->mksn_status = EPROTO;
+ status = EPROTO;
+ }
+
+ if (status == EPROTO) {
+ CNETERR("session protocol error from %s: %u\n",
+ libcfs_nid2str(nd->nd_id.nid),
+ reply->msg_ses_feats);
+ }
+
+ if (status != 0)
+ return status;
+
+ if (!trans->tas_feats_updated) {
+ trans->tas_feats_updated = 1;
+ trans->tas_features = reply->msg_ses_feats;
+ }
+
+ if (reply->msg_ses_feats != trans->tas_features) {
+ CNETERR("Framework features %x from %s is different with "
+ "features on this transaction: %x\n",
+ reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+ trans->tas_features);
+ status = mksn_rep->mksn_status = EPROTO;
+ }
+
+ if (status == 0) {
+ /* session timeout on remote node */
+ nd->nd_timeout = mksn_rep->mksn_timeout;
+ }
+
+ return status;
+}
+
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+ lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+ srpc_rmsn_reply_t *rmsn_rep;
+ srpc_debug_reply_t *dbg_rep;
+ srpc_batch_reply_t *bat_rep;
+ srpc_test_reply_t *test_rep;
+ srpc_stat_reply_t *stat_rep;
+ int rc = 0;
+
+ switch (trans->tas_opc) {
+ case LST_TRANS_SESNEW:
+ rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+ if (rc == 0) {
+ lstcon_sesop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_sesop_stat_failure(stat, 1);
+ break;
+
+ case LST_TRANS_SESEND:
+ rmsn_rep = &msg->msg_body.rmsn_reply;
+ /* ESRCH is not an error for end session */
+ if (rmsn_rep->rmsn_status == 0 ||
+ rmsn_rep->rmsn_status == ESRCH) {
+ lstcon_sesop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_sesop_stat_failure(stat, 1);
+ rc = rmsn_rep->rmsn_status;
+ break;
+
+ case LST_TRANS_SESQRY:
+ case LST_TRANS_SESPING:
+ dbg_rep = &msg->msg_body.dbg_reply;
+
+ if (dbg_rep->dbg_status == ESRCH) {
+ lstcon_sesqry_stat_unknown(stat, 1);
+ return;
+ }
+
+ if (lstcon_session_match(dbg_rep->dbg_sid))
+ lstcon_sesqry_stat_active(stat, 1);
+ else
+ lstcon_sesqry_stat_busy(stat, 1);
+ return;
+
+ case LST_TRANS_TSBRUN:
+ case LST_TRANS_TSBSTOP:
+ bat_rep = &msg->msg_body.bat_reply;
+
+ if (bat_rep->bar_status == 0) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ if (bat_rep->bar_status == EPERM &&
+ trans->tas_opc == LST_TRANS_TSBSTOP) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_tsbop_stat_failure(stat, 1);
+ rc = bat_rep->bar_status;
+ break;
+
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ bat_rep = &msg->msg_body.bat_reply;
+
+ if (bat_rep->bar_active != 0)
+ lstcon_tsbqry_stat_run(stat, 1);
+ else
+ lstcon_tsbqry_stat_idle(stat, 1);
+
+ if (bat_rep->bar_status == 0)
+ return;
+
+ lstcon_tsbqry_stat_failure(stat, 1);
+ rc = bat_rep->bar_status;
+ break;
+
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ test_rep = &msg->msg_body.tes_reply;
+
+ if (test_rep->tsr_status == 0) {
+ lstcon_tsbop_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_tsbop_stat_failure(stat, 1);
+ rc = test_rep->tsr_status;
+ break;
+
+ case LST_TRANS_STATQRY:
+ stat_rep = &msg->msg_body.stat_reply;
+
+ if (stat_rep->str_status == 0) {
+ lstcon_statqry_stat_success(stat, 1);
+ return;
+ }
+
+ lstcon_statqry_stat_failure(stat, 1);
+ rc = stat_rep->str_status;
+ break;
+
+ default:
+ LBUG();
+ }
+
+ if (stat->trs_fwk_errno == 0)
+ stat->trs_fwk_errno = rc;
+
+ return;
+}
+
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+ struct list_head *translist, int transop,
+ void *arg, lstcon_rpc_cond_func_t condition,
+ lstcon_rpc_trans_t **transpp)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ lstcon_rpc_t *rpc;
+ unsigned feats;
+ int rc;
+
+ /* Creating session RPG for list of nodes */
+
+ rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction %d: %d\n", transop, rc);
+ return rc;
+ }
+
+ feats = trans->tas_features;
+ list_for_each_entry(ndl, ndlist, ndl_link) {
+ rc = condition == NULL ? 1 :
+ condition(transop, ndl->ndl_node, arg);
+
+ if (rc == 0)
+ continue;
+
+ if (rc < 0) {
+ CDEBUG(D_NET, "Condition error while creating RPC "
+ " for transaction %d: %d\n", transop, rc);
+ break;
+ }
+
+ nd = ndl->ndl_node;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ case LST_TRANS_SESEND:
+ rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+ break;
+ case LST_TRANS_SESQRY:
+ case LST_TRANS_SESPING:
+ rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+ break;
+ case LST_TRANS_TSBCLIADD:
+ case LST_TRANS_TSBSRVADD:
+ rc = lstcon_testrpc_prep(nd, transop, feats,
+ (lstcon_test_t *)arg, &rpc);
+ break;
+ case LST_TRANS_TSBRUN:
+ case LST_TRANS_TSBSTOP:
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ rc = lstcon_batrpc_prep(nd, transop, feats,
+ (lstcon_tsb_hdr_t *)arg, &rpc);
+ break;
+ case LST_TRANS_STATQRY:
+ rc = lstcon_statrpc_prep(nd, feats, &rpc);
+ break;
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (rc != 0) {
+ CERROR("Failed to create RPC for transaction %s: %d\n",
+ lstcon_rpc_trans_name(transop), rc);
+ break;
+ }
+
+ lstcon_rpc_trans_addreq(trans, rpc);
+ }
+
+ if (rc == 0) {
+ *transpp = trans;
+ return 0;
+ }
+
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+void
+lstcon_rpc_pinger(void *arg)
+{
+ stt_timer_t *ptimer = (stt_timer_t *)arg;
+ lstcon_rpc_trans_t *trans;
+ lstcon_rpc_t *crpc;
+ srpc_msg_t *rep;
+ srpc_debug_reqst_t *drq;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ time_t intv;
+ int count = 0;
+ int rc;
+
+ /* RPC pinger is a special case of transaction,
+ * it's called by timer at 8 seconds interval.
+ */
+ mutex_lock(&console_session.ses_mutex);
+
+ if (console_session.ses_shutdown || console_session.ses_expired) {
+ mutex_unlock(&console_session.ses_mutex);
+ return;
+ }
+
+ if (!console_session.ses_expired &&
+ cfs_time_current_sec() - console_session.ses_laststamp >
+ (time_t)console_session.ses_timeout)
+ console_session.ses_expired = 1;
+
+ trans = console_session.ses_ping;
+
+ LASSERT (trans != NULL);
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+ nd = ndl->ndl_node;
+
+ if (console_session.ses_expired) {
+ /* idle console, end session on all nodes */
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ continue;
+
+ rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+ trans->tas_features, &crpc);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ break;
+ }
+
+ lstcon_rpc_trans_addreq(trans, crpc);
+ lstcon_rpc_post(crpc);
+
+ continue;
+ }
+
+ crpc = &nd->nd_ping;
+
+ if (crpc->crp_rpc != NULL) {
+ LASSERT (crpc->crp_trans == trans);
+ LASSERT (!list_empty(&crpc->crp_link));
+
+ spin_lock(&crpc->crp_rpc->crpc_lock);
+
+ LASSERT(crpc->crp_posted);
+
+ if (!crpc->crp_finished) {
+ /* in flight */
+ spin_unlock(&crpc->crp_rpc->crpc_lock);
+ continue;
+ }
+
+ spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+ lstcon_rpc_get_reply(crpc, &rep);
+
+ list_del_init(&crpc->crp_link);
+
+ lstcon_rpc_put(crpc);
+ }
+
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ continue;
+
+ intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+ nd->nd_stamp));
+ if (intv < (time_t)nd->nd_timeout / 2)
+ continue;
+
+ rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+ trans->tas_features, 0, 0, 1, crpc);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ break;
+ }
+
+ drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+ drq->dbg_sid = console_session.ses_id;
+ drq->dbg_flags = 0;
+
+ lstcon_rpc_trans_addreq(trans, crpc);
+ lstcon_rpc_post(crpc);
+
+ count ++;
+ }
+
+ if (console_session.ses_expired) {
+ mutex_unlock(&console_session.ses_mutex);
+ return;
+ }
+
+ CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+ ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+ stt_add_timer(ptimer);
+
+ mutex_unlock(&console_session.ses_mutex);
+}
+
+int
+lstcon_rpc_pinger_start(void)
+{
+ stt_timer_t *ptimer;
+ int rc;
+
+ LASSERT (list_empty(&console_session.ses_rpc_freelist));
+ LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+
+ rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+ &console_session.ses_ping);
+ if (rc != 0) {
+ CERROR("Failed to create console pinger\n");
+ return rc;
+ }
+
+ ptimer = &console_session.ses_ping_timer;
+ ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+
+ stt_add_timer(ptimer);
+
+ return 0;
+}
+
+void
+lstcon_rpc_pinger_stop(void)
+{
+ LASSERT (console_session.ses_shutdown);
+
+ stt_del_timer(&console_session.ses_ping_timer);
+
+ lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN);
+ lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat());
+ lstcon_rpc_trans_destroy(console_session.ses_ping);
+
+ memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+ console_session.ses_ping = NULL;
+}
+
+void
+lstcon_rpc_cleanup_wait(void)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_rpc_t *crpc;
+ struct list_head *pacer;
+ struct list_head zlist;
+
+ /* Called with hold of global mutex */
+
+ LASSERT (console_session.ses_shutdown);
+
+ while (!list_empty(&console_session.ses_trans_list)) {
+ list_for_each(pacer, &console_session.ses_trans_list) {
+ trans = list_entry(pacer, lstcon_rpc_trans_t,
+ tas_link);
+
+ CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+ lstcon_rpc_trans_name(trans->tas_opc));
+
+ wake_up(&trans->tas_waitq);
+ }
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ CWARN("Session is shutting down, "
+ "waiting for termination of transactions\n");
+ cfs_pause(cfs_time_seconds(1));
+
+ mutex_lock(&console_session.ses_mutex);
+ }
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+ console_session.ses_rpc_lock,
+ "Network is not accessable or target is down, "
+ "waiting for %d console RPCs to being recycled\n",
+ atomic_read(&console_session.ses_rpc_counter));
+
+ list_add(&zlist, &console_session.ses_rpc_freelist);
+ list_del_init(&console_session.ses_rpc_freelist);
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ while (!list_empty(&zlist)) {
+ crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+ list_del(&crpc->crp_link);
+ LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+ }
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+ INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+ console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+ console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+ console_session.ses_ping = NULL;
+
+ spin_lock_init(&console_session.ses_rpc_lock);
+ atomic_set(&console_session.ses_rpc_counter, 0);
+ INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+ return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+ LASSERT (list_empty(&console_session.ses_rpc_freelist));
+ LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644
index 000000000000..9aba24a2eab9
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.h
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * /lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+#define LST_TRANS_TIMEOUT 30
+#define LST_TRANS_MIN_TIMEOUT 3
+
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+#define LST_PING_INTERVAL 8
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+ struct list_head crp_link; /* chain on rpc transaction */
+ srpc_client_rpc_t *crp_rpc; /* client rpc */
+ struct lstcon_node *crp_node; /* destination node */
+ struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */
+
+ unsigned int crp_posted:1; /* rpc is posted */
+ unsigned int crp_finished:1; /* rpc is finished */
+ unsigned int crp_unpacked:1; /* reply is unpacked */
+ /** RPC is embedded in other structure and can't free it */
+ unsigned int crp_embedded:1;
+ int crp_status; /* console rpc errors */
+ cfs_time_t crp_stamp; /* replied time stamp */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+ struct list_head tas_olink; /* link chain on owner list */
+ struct list_head tas_link; /* link chain on global list */
+ int tas_opc; /* operation code of transaction */
+ /* features mask is uptodate */
+ unsigned tas_feats_updated;
+ /* test features mask */
+ unsigned tas_features;
+ wait_queue_head_t tas_waitq; /* wait queue head */
+ atomic_t tas_remaining; /* # of un-scheduled rpcs */
+ struct list_head tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+#define LST_TRANS_PRIVATE 0x1000
+
+#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY 0x03
+#define LST_TRANS_SESPING 0x04
+
+#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY 0x15
+#define LST_TRANS_TSBSRVQRY 0x16
+
+#define LST_TRANS_STATQRY 0x21
+
+typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+ unsigned version, lstcon_rpc_t **crpc);
+int lstcon_dbgrpc_prep(struct lstcon_node *nd,
+ unsigned version, lstcon_rpc_t **crpc);
+int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+ struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+ struct lstcon_test *test, lstcon_rpc_t **crpc);
+int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+ lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+int lstcon_rpc_trans_prep(struct list_head *translist,
+ int transop, lstcon_rpc_trans_t **transpp);
+int lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+ struct list_head *translist, int transop,
+ void *arg, lstcon_rpc_cond_func_t condition,
+ lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+ lstcon_trans_stat_t *stat);
+int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+ struct list_head *head_up,
+ lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+int lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644
index 000000000000..78e8d0467267
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.c
@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p) \
+do { \
+ if ((nd)->nd_state == LST_NODE_ACTIVE) \
+ (p)->nle_nactive ++; \
+ else if ((nd)->nd_state == LST_NODE_BUSY) \
+ (p)->nle_nbusy ++; \
+ else if ((nd)->nd_state == LST_NODE_DOWN) \
+ (p)->nle_ndown ++; \
+ else \
+ (p)->nle_nunknown ++; \
+ (p)->nle_nnode ++; \
+} while (0)
+
+lstcon_session_t console_session;
+
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+ LASSERT (nd->nd_ref >= 1);
+
+ nd->nd_ref++;
+}
+
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+ lstcon_ndlink_t *ndl;
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+ LASSERT (id.nid != LNET_NID_ANY);
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+ if (ndl->ndl_node->nd_id.nid != id.nid ||
+ ndl->ndl_node->nd_id.pid != id.pid)
+ continue;
+
+ lstcon_node_get(ndl->ndl_node);
+ *ndpp = ndl->ndl_node;
+ return 0;
+ }
+
+ if (!create)
+ return -ENOENT;
+
+ LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+ if (*ndpp == NULL)
+ return -ENOMEM;
+
+ ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+ ndl->ndl_node = *ndpp;
+
+ ndl->ndl_node->nd_ref = 1;
+ ndl->ndl_node->nd_id = id;
+ ndl->ndl_node->nd_stamp = cfs_time_current();
+ ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+ ndl->ndl_node->nd_timeout = 0;
+ memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+ /* queued in global hash & list, no refcount is taken by
+ * global hash & list, if caller release his refcount,
+ * node will be released */
+ list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+ list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+ return 0;
+}
+
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+ lstcon_ndlink_t *ndl;
+
+ LASSERT (nd->nd_ref > 0);
+
+ if (--nd->nd_ref > 0)
+ return;
+
+ ndl = (lstcon_ndlink_t *)(nd + 1);
+
+ LASSERT (!list_empty(&ndl->ndl_link));
+ LASSERT (!list_empty(&ndl->ndl_hlink));
+
+ /* remove from session */
+ list_del(&ndl->ndl_link);
+ list_del(&ndl->ndl_hlink);
+
+ LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+static int
+lstcon_ndlink_find(struct list_head *hash,
+ lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int rc;
+
+ if (id.nid == LNET_NID_ANY)
+ return -EINVAL;
+
+ /* search in hash */
+ list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+ if (ndl->ndl_node->nd_id.nid != id.nid ||
+ ndl->ndl_node->nd_id.pid != id.pid)
+ continue;
+
+ *ndlpp = ndl;
+ return 0;
+ }
+
+ if (create == 0)
+ return -ENOENT;
+
+ /* find or create in session hash */
+ rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+ if (rc != 0)
+ return rc;
+
+ LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+ if (ndl == NULL) {
+ lstcon_node_put(nd);
+ return -ENOMEM;
+ }
+
+ *ndlpp = ndl;
+
+ ndl->ndl_node = nd;
+ INIT_LIST_HEAD(&ndl->ndl_link);
+ list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+ return 0;
+}
+
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+ LASSERT (list_empty(&ndl->ndl_link));
+ LASSERT (!list_empty(&ndl->ndl_hlink));
+
+ list_del(&ndl->ndl_hlink); /* delete from hash */
+ lstcon_node_put(ndl->ndl_node);
+
+ LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+ lstcon_group_t *grp;
+ int i;
+
+ LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+ grp_ndl_hash[LST_NODE_HASHSIZE]));
+ if (grp == NULL)
+ return -ENOMEM;
+
+ memset(grp, 0, offsetof(lstcon_group_t,
+ grp_ndl_hash[LST_NODE_HASHSIZE]));
+
+ grp->grp_ref = 1;
+ if (name != NULL)
+ strcpy(grp->grp_name, name);
+
+ INIT_LIST_HEAD(&grp->grp_link);
+ INIT_LIST_HEAD(&grp->grp_ndl_list);
+ INIT_LIST_HEAD(&grp->grp_trans_list);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++)
+ INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+ *grpp = grp;
+
+ return 0;
+}
+
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+ grp->grp_ref ++;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_ndlink_t *tmp;
+
+ list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) {
+ if ((ndl->ndl_node->nd_state & keep) == 0)
+ lstcon_group_ndlink_release(grp, ndl);
+ }
+}
+
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+ int i;
+
+ if (--grp->grp_ref > 0)
+ return;
+
+ if (!list_empty(&grp->grp_link))
+ list_del(&grp->grp_link);
+
+ lstcon_group_drain(grp, 0);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT (list_empty(&grp->grp_ndl_hash[i]));
+ }
+
+ LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+ grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+ lstcon_group_t *grp;
+
+ list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+ if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0)
+ continue;
+
+ lstcon_group_addref(grp); /* +1 ref for caller */
+ *grpp = grp;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+ lstcon_group_decref(grp);
+}
+
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+ lstcon_ndlink_t **ndlpp, int create)
+{
+ int rc;
+
+ rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+ if (rc != 0)
+ return rc;
+
+ if (!list_empty(&(*ndlpp)->ndl_link))
+ return 0;
+
+ list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list);
+ grp->grp_nnode ++;
+
+ return 0;
+}
+
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+ list_del_init(&ndl->ndl_link);
+ lstcon_ndlink_release(ndl);
+ grp->grp_nnode --;
+}
+
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+ lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+ unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) %
+ LST_NODE_HASHSIZE;
+
+ list_del(&ndl->ndl_hlink);
+ list_del(&ndl->ndl_link);
+ old->grp_nnode --;
+
+ list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]);
+ list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+ new->grp_nnode ++;
+
+ return;
+}
+
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+ lstcon_ndlink_t *ndl;
+
+ while (!list_empty(&old->grp_ndl_list)) {
+ ndl = list_entry(old->grp_ndl_list.next,
+ lstcon_ndlink_t, ndl_link);
+ lstcon_group_ndlink_move(old, new, ndl);
+ }
+}
+
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ if (nd->nd_state == LST_NODE_ACTIVE)
+ return 0;
+ break;
+
+ case LST_TRANS_SESEND:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return 0;
+
+ if (grp != NULL && nd->nd_ref > 1)
+ return 0;
+ break;
+
+ case LST_TRANS_SESQRY:
+ break;
+
+ default:
+ LBUG();
+ }
+
+ return 1;
+}
+
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_debug_reply_t *rep;
+
+ switch (transop) {
+ case LST_TRANS_SESNEW:
+ case LST_TRANS_SESEND:
+ return 0;
+
+ case LST_TRANS_SESQRY:
+ rep = &msg->msg_body.dbg_reply;
+
+ if (copy_to_user(&ent_up->rpe_priv[0],
+ &rep->dbg_timeout, sizeof(int)) ||
+ copy_to_user(&ent_up->rpe_payload[0],
+ &rep->dbg_name, LST_NAME_SIZE))
+ return -EFAULT;
+
+ return 0;
+
+ default:
+ LBUG();
+ }
+
+ return 0;
+}
+
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+ int count, lnet_process_id_t *ids_up,
+ unsigned *featp, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0 ; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* skip if it's in this group already */
+ rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+ if (rc == 0)
+ continue;
+
+ /* add to tmp group */
+ rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Can't create ndlink, out of memory\n");
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+ &tmp->grp_trans_list, LST_TRANS_SESNEW,
+ tmp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ /* post all RPCs */
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_sesrpc_readent);
+ *featp = trans->tas_features;
+
+ /* destroy all RPGs */
+ lstcon_rpc_trans_destroy(trans);
+
+ lstcon_group_move(tmp, grp);
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+ int count, lnet_process_id_t *ids_up,
+ struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int rc;
+ int i;
+
+ /* End session and remove node from the group */
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ goto error;
+ }
+
+ /* move node to tmp group */
+ if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+ lstcon_group_ndlink_move(grp, tmp, ndl);
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+ &tmp->grp_trans_list, LST_TRANS_SESEND,
+ tmp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ goto error;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* release nodes anyway, because we can't rollback status */
+ lstcon_group_put(tmp);
+
+ return rc;
+error:
+ lstcon_group_move(tmp, grp);
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0;
+ if (rc != 0) {
+ /* find a group with same name */
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ rc = lstcon_group_alloc(name, &grp);
+ if (rc != 0) {
+ CERROR("Can't allocate descriptor for group %s\n", name);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+ return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+ unsigned *featp, struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ LASSERT (count > 0);
+ LASSERT (ids_up != NULL);
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by other threads or test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+
+ return -EBUSY;
+ }
+
+ rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by others threads or test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &grp->grp_trans_list, LST_TRANS_SESEND,
+ grp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ lstcon_rpc_trans_destroy(trans);
+
+ lstcon_group_put(grp);
+ /* -ref for session, it's destroyed,
+ * status can't be rolled back, destroy group anway */
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+ lstcon_group_t *grp = NULL;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+ LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+ lstcon_group_drain(grp, args);
+
+ lstcon_group_put(grp);
+ /* release empty group */
+ if (list_empty(&grp->grp_ndl_list))
+ lstcon_group_put(grp);
+
+ return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+ lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+ lstcon_group_t *grp = NULL;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+ lstcon_group_put(grp);
+ /* release empty group */
+ if (list_empty(&grp->grp_ndl_list))
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group: %s\n", name);
+ return rc;
+ }
+
+ if (grp->grp_ref > 2) {
+ /* referred by test */
+ CDEBUG(D_NET, "Group %s is busy\n", name);
+ lstcon_group_put(grp);
+ return -EBUSY;
+ }
+
+ /* re-invite all inactive nodes int the group */
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &grp->grp_trans_list, LST_TRANS_SESNEW,
+ grp, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ /* local error, return */
+ CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* -ref for me */
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+ lstcon_group_t *grp;
+
+ LASSERT (index >= 0);
+ LASSERT (name_up != NULL);
+
+ list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+ if (index-- == 0) {
+ return copy_to_user(name_up, grp->grp_name, len) ?
+ -EFAULT : 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+ int *count_p, lstcon_node_ent_t *dents_up)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_node_t *nd;
+ int count = 0;
+ int index = 0;
+
+ LASSERT (index_p != NULL && count_p != NULL);
+ LASSERT (dents_up != NULL);
+ LASSERT (*index_p >= 0);
+ LASSERT (*count_p > 0);
+
+ list_for_each_entry(ndl, head, ndl_link) {
+ if (index++ < *index_p)
+ continue;
+
+ if (count >= *count_p)
+ break;
+
+ nd = ndl->ndl_node;
+ if (copy_to_user(&dents_up[count].nde_id,
+ &nd->nd_id, sizeof(nd->nd_id)) ||
+ copy_to_user(&dents_up[count].nde_state,
+ &nd->nd_state, sizeof(nd->nd_state)))
+ return -EFAULT;
+
+ count ++;
+ }
+
+ if (index <= *index_p)
+ return -ENOENT;
+
+ *count_p = count;
+ *index_p = index;
+
+ return 0;
+}
+
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+ int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+ lstcon_ndlist_ent_t *gentp;
+ lstcon_group_t *grp;
+ lstcon_ndlink_t *ndl;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", name);
+ return rc;
+ }
+
+ if (dents_up != 0) {
+ /* verbose query */
+ rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+ index_p, count_p, dents_up);
+ lstcon_group_put(grp);
+
+ return rc;
+ }
+
+ /* non-verbose query */
+ LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+ if (gentp == NULL) {
+ CERROR("Can't allocate ndlist_ent\n");
+ lstcon_group_put(grp);
+
+ return -ENOMEM;
+ }
+
+ memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+ list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+ rc = copy_to_user(gents_p, gentp,
+ sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0;
+
+ LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+ lstcon_group_put(grp);
+
+ return 0;
+}
+
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+ lstcon_batch_t *bat;
+
+ list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+ if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+ *batpp = bat;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+ lstcon_batch_t *bat;
+ int i;
+ int rc;
+
+ rc = (lstcon_batch_find(name, &bat) == 0)? -EEXIST: 0;
+ if (rc != 0) {
+ CDEBUG(D_NET, "Batch %s already exists\n", name);
+ return rc;
+ }
+
+ LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+ if (bat == NULL) {
+ CERROR("Can't allocate descriptor for batch %s\n", name);
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(bat->bat_cli_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ if (bat->bat_cli_hash == NULL) {
+ CERROR("Can't allocate hash for batch %s\n", name);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+ return -ENOMEM;
+ }
+
+ LIBCFS_ALLOC(bat->bat_srv_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ if (bat->bat_srv_hash == NULL) {
+ CERROR("Can't allocate hash for batch %s\n", name);
+ LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+ return -ENOMEM;
+ }
+
+ strcpy(bat->bat_name, name);
+ bat->bat_hdr.tsb_index = 0;
+ bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+ bat->bat_ntest = 0;
+ bat->bat_state = LST_BATCH_IDLE;
+
+ INIT_LIST_HEAD(&bat->bat_cli_list);
+ INIT_LIST_HEAD(&bat->bat_srv_list);
+ INIT_LIST_HEAD(&bat->bat_test_list);
+ INIT_LIST_HEAD(&bat->bat_trans_list);
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+ INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+ }
+
+ list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+ return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+ lstcon_batch_t *bat;
+
+ LASSERT (name_up != NULL);
+ LASSERT (index >= 0);
+
+ list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+ if (index-- == 0) {
+ return copy_to_user(name_up,bat->bat_name, len) ?
+ -EFAULT: 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+ int testidx, int *index_p, int *ndent_p,
+ lstcon_node_ent_t *dents_up)
+{
+ lstcon_test_batch_ent_t *entp;
+ struct list_head *clilst;
+ struct list_head *srvlst;
+ lstcon_test_t *test = NULL;
+ lstcon_batch_t *bat;
+ lstcon_ndlink_t *ndl;
+ int rc;
+
+ rc = lstcon_batch_find(name, &bat);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ if (testidx > 0) {
+ /* query test, test index start from 1 */
+ list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+ if (testidx-- == 1)
+ break;
+ }
+
+ if (testidx > 0) {
+ CDEBUG(D_NET, "Can't find specified test in batch\n");
+ return -ENOENT;
+ }
+ }
+
+ clilst = (test == NULL) ? &bat->bat_cli_list :
+ &test->tes_src_grp->grp_ndl_list;
+ srvlst = (test == NULL) ? &bat->bat_srv_list :
+ &test->tes_dst_grp->grp_ndl_list;
+
+ if (dents_up != NULL) {
+ rc = lstcon_nodes_getent((server ? srvlst: clilst),
+ index_p, ndent_p, dents_up);
+ return rc;
+ }
+
+ /* non-verbose query */
+ LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+ if (entp == NULL)
+ return -ENOMEM;
+
+ memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+ if (test == NULL) {
+ entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+ entp->u.tbe_batch.bae_state = bat->bat_state;
+
+ } else {
+
+ entp->u.tbe_test.tse_type = test->tes_type;
+ entp->u.tbe_test.tse_loop = test->tes_loop;
+ entp->u.tbe_test.tse_concur = test->tes_concur;
+ }
+
+ list_for_each_entry(ndl, clilst, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+ list_for_each_entry(ndl, srvlst, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+ rc = copy_to_user(ent_up, entp,
+ sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+ LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+ return rc;
+}
+
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ switch (transop) {
+ case LST_TRANS_TSBRUN:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return -ENETDOWN;
+ break;
+
+ case LST_TRANS_TSBSTOP:
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return 0;
+ break;
+
+ case LST_TRANS_TSBCLIQRY:
+ case LST_TRANS_TSBSRVQRY:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+ struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+ &bat->bat_trans_list, transop,
+ bat, lstcon_batrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ if (lstcon_batch_find(name, &bat) != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ bat->bat_arg = timeout;
+
+ rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+ /* mark batch as running if it's started in any node */
+ if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+ bat->bat_state = LST_BATCH_RUNNING;
+
+ return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ if (lstcon_batch_find(name, &bat) != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return -ENOENT;
+ }
+
+ bat->bat_arg = force;
+
+ rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+ /* mark batch as stopped if all RPCs finished */
+ if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+ bat->bat_state = LST_BATCH_IDLE;
+
+ return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_test_t *test;
+ int i;
+
+ list_del(&bat->bat_link);
+
+ while (!list_empty(&bat->bat_test_list)) {
+ test = list_entry(bat->bat_test_list.next,
+ lstcon_test_t, tes_link);
+ LASSERT (list_empty(&test->tes_trans_list));
+
+ list_del(&test->tes_link);
+
+ lstcon_group_put(test->tes_src_grp);
+ lstcon_group_put(test->tes_dst_grp);
+
+ LIBCFS_FREE(test, offsetof(lstcon_test_t,
+ tes_param[test->tes_paramlen]));
+ }
+
+ LASSERT (list_empty(&bat->bat_trans_list));
+
+ while (!list_empty(&bat->bat_cli_list)) {
+ ndl = list_entry(bat->bat_cli_list.next,
+ lstcon_ndlink_t, ndl_link);
+ list_del_init(&ndl->ndl_link);
+
+ lstcon_ndlink_release(ndl);
+ }
+
+ while (!list_empty(&bat->bat_srv_list)) {
+ ndl = list_entry(bat->bat_srv_list.next,
+ lstcon_ndlink_t, ndl_link);
+ list_del_init(&ndl->ndl_link);
+
+ lstcon_ndlink_release(ndl);
+ }
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT (list_empty(&bat->bat_cli_hash[i]));
+ LASSERT (list_empty(&bat->bat_srv_hash[i]));
+ }
+
+ LIBCFS_FREE(bat->bat_cli_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat->bat_srv_hash,
+ sizeof(struct list_head) * LST_NODE_HASHSIZE);
+ LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+ lstcon_test_t *test;
+ lstcon_batch_t *batch;
+ lstcon_ndlink_t *ndl;
+ struct list_head *hash;
+ struct list_head *head;
+
+ test = (lstcon_test_t *)arg;
+ LASSERT (test != NULL);
+
+ batch = test->tes_batch;
+ LASSERT (batch != NULL);
+
+ if (test->tes_oneside &&
+ transop == LST_TRANS_TSBSRVADD)
+ return 0;
+
+ if (nd->nd_state != LST_NODE_ACTIVE)
+ return -ENETDOWN;
+
+ if (transop == LST_TRANS_TSBCLIADD) {
+ hash = batch->bat_cli_hash;
+ head = &batch->bat_cli_list;
+
+ } else {
+ LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+ hash = batch->bat_srv_hash;
+ head = &batch->bat_srv_list;
+ }
+
+ LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+ if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+ return -ENOMEM;
+
+ if (list_empty(&ndl->ndl_link))
+ list_add_tail(&ndl->ndl_link, head);
+
+ return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ int transop;
+ int rc;
+
+ LASSERT (test->tes_src_grp != NULL);
+ LASSERT (test->tes_dst_grp != NULL);
+
+ transop = LST_TRANS_TSBSRVADD;
+ grp = test->tes_dst_grp;
+again:
+ rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+ &test->tes_trans_list, transop,
+ test, lstcon_testrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+ lstcon_trans_stat()->trs_fwk_errno != 0) {
+ lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* return if any error */
+ CDEBUG(D_NET, "Failed to add test %s, "
+ "RPC error %d, framework error %d\n",
+ transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+ lstcon_trans_stat()->trs_rpc_errno,
+ lstcon_trans_stat()->trs_fwk_errno);
+
+ return rc;
+ }
+
+ lstcon_rpc_trans_destroy(trans);
+
+ if (transop == LST_TRANS_TSBCLIADD)
+ return rc;
+
+ transop = LST_TRANS_TSBCLIADD;
+ grp = test->tes_src_grp;
+ test->tes_cliidx = 0;
+
+ /* requests to test clients */
+ goto again;
+}
+
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+ int dist, int span, char *src_name, char * dst_name,
+ void *param, int paramlen, int *retp,
+ struct list_head *result_up)
+{
+ lstcon_group_t *src_grp = NULL;
+ lstcon_group_t *dst_grp = NULL;
+ lstcon_test_t *test = NULL;
+ lstcon_batch_t *batch;
+ int rc;
+
+ rc = lstcon_batch_find(name, &batch);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch %s\n", name);
+ return rc;
+ }
+
+ if (batch->bat_state != LST_BATCH_IDLE) {
+ CDEBUG(D_NET, "Can't change running batch %s\n", name);
+ return rc;
+ }
+
+ rc = lstcon_group_find(src_name, &src_grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", src_name);
+ goto out;
+ }
+
+ rc = lstcon_group_find(dst_name, &dst_grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+ goto out;
+ }
+
+ if (dst_grp->grp_userland)
+ *retp = 1;
+
+ LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+ if (!test) {
+ CERROR("Can't allocate test descriptor\n");
+ rc = -ENOMEM;
+
+ goto out;
+ }
+
+ memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+ test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id;
+ test->tes_batch = batch;
+ test->tes_type = type;
+ test->tes_oneside = 0; /* TODO */
+ test->tes_loop = loop;
+ test->tes_concur = concur;
+ test->tes_stop_onerr = 1; /* TODO */
+ test->tes_span = span;
+ test->tes_dist = dist;
+ test->tes_cliidx = 0; /* just used for creating RPC */
+ test->tes_src_grp = src_grp;
+ test->tes_dst_grp = dst_grp;
+ INIT_LIST_HEAD(&test->tes_trans_list);
+
+ if (param != NULL) {
+ test->tes_paramlen = paramlen;
+ memcpy(&test->tes_param[0], param, paramlen);
+ }
+
+ rc = lstcon_test_nodes_add(test, result_up);
+
+ if (rc != 0)
+ goto out;
+
+ if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+ lstcon_trans_stat()->trs_fwk_errno != 0)
+ CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name);
+
+ /* add to test list anyway, so user can check what's going on */
+ list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+ batch->bat_ntest ++;
+ test->tes_hdr.tsb_index = batch->bat_ntest;
+
+ /* hold groups so nobody can change them */
+ return rc;
+out:
+ if (test != NULL)
+ LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+ if (dst_grp != NULL)
+ lstcon_group_put(dst_grp);
+
+ if (src_grp != NULL)
+ lstcon_group_put(src_grp);
+
+ return rc;
+}
+
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+ lstcon_test_t *test;
+
+ list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+ if (idx == test->tes_hdr.tsb_index) {
+ *testpp = test;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+ LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+ transop == LST_TRANS_TSBSRVQRY);
+
+ /* positive errno, framework error code */
+ if (copy_to_user(&ent_up->rpe_priv[0],
+ &rep->bar_active, sizeof(rep->bar_active)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ struct list_head *translist;
+ struct list_head *ndlist;
+ lstcon_tsb_hdr_t *hdr;
+ lstcon_batch_t *batch;
+ lstcon_test_t *test = NULL;
+ int transop;
+ int rc;
+
+ rc = lstcon_batch_find(name, &batch);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find batch: %s\n", name);
+ return rc;
+ }
+
+ if (testidx == 0) {
+ translist = &batch->bat_trans_list;
+ ndlist = &batch->bat_cli_list;
+ hdr = &batch->bat_hdr;
+
+ } else {
+ /* query specified test only */
+ rc = lstcon_test_find(batch, testidx, &test);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+ return rc;
+ }
+
+ translist = &test->tes_trans_list;
+ ndlist = &test->tes_src_grp->grp_ndl_list;
+ hdr = &test->tes_hdr;
+ }
+
+ transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+ lstcon_batrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, timeout);
+
+ if (testidx == 0 && /* query a batch, not a test */
+ lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+ lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+ /* all RPCs finished, and no active test */
+ batch->bat_state = LST_BATCH_IDLE;
+ }
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_tsbrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+ lstcon_rpc_ent_t *ent_up)
+{
+ srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+ sfw_counters_t *sfwk_stat;
+ srpc_counters_t *srpc_stat;
+ lnet_counters_t *lnet_stat;
+
+ if (rep->str_status != 0)
+ return 0;
+
+ sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+ srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+ lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+ if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+ copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+ copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+ int timeout, struct list_head *result_up)
+{
+ struct list_head head;
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ INIT_LIST_HEAD(&head);
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+ LST_TRANS_STATQRY, NULL, NULL, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_statrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(grp_name, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+ return rc;
+ }
+
+ rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *tmp;
+ lnet_process_id_t id;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &tmp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0 ; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* add to tmp group */
+ rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+ if (rc != 0) {
+ CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+ "Failed to find or create %s: %d\n",
+ libcfs_id2str(id), rc);
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(tmp);
+ return rc;
+ }
+
+ rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+ lstcon_group_put(tmp);
+
+ return rc;
+}
+
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+ struct list_head *translist,
+ int timeout, struct list_head *result_up)
+{
+ lstcon_rpc_trans_t *trans;
+ int rc;
+
+ rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+ NULL, lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+ rc = lstcon_rpc_trans_interpreter(trans, result_up,
+ lstcon_sesrpc_readent);
+ lstcon_rpc_trans_destroy(trans);
+
+ return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+ return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+ NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+ int client, struct list_head *result_up)
+{
+ lstcon_batch_t *bat;
+ int rc;
+
+ rc = lstcon_batch_find(name, &bat);
+ if (rc != 0)
+ return -ENOENT;
+
+ rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+ &bat->bat_srv_list,
+ NULL, timeout, result_up);
+
+ return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+ struct list_head *result_up)
+{
+ lstcon_group_t *grp;
+ int rc;
+
+ rc = lstcon_group_find(name, &grp);
+ if (rc != 0)
+ return -ENOENT;
+
+ rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+ timeout, result_up);
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+ int count, lnet_process_id_t *ids_up,
+ struct list_head *result_up)
+{
+ lnet_process_id_t id;
+ lstcon_ndlink_t *ndl;
+ lstcon_group_t *grp;
+ int i;
+ int rc;
+
+ rc = lstcon_group_alloc(NULL, &grp);
+ if (rc != 0) {
+ CDEBUG(D_NET, "Out of memory\n");
+ return rc;
+ }
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /* node is added to tmp group */
+ rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Can't create node link\n");
+ break;
+ }
+ }
+
+ if (rc != 0) {
+ lstcon_group_put(grp);
+ return rc;
+ }
+
+ rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+ timeout, result_up);
+
+ lstcon_group_put(grp);
+
+ return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+ return (console_session.ses_id.ses_nid == sid.ses_nid &&
+ console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+ lnet_process_id_t id;
+
+ LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+ LNetGetId(1, &id);
+ sid->ses_nid = id.nid;
+ sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+ int timeout, int force, lst_sid_t *sid_up)
+{
+ int rc = 0;
+ int i;
+
+ if (console_session.ses_state != LST_SESSION_NONE) {
+ /* session exists */
+ if (!force) {
+ CNETERR("Session %s already exists\n",
+ console_session.ses_name);
+ return -EEXIST;
+ }
+
+ rc = lstcon_session_end();
+
+ /* lstcon_session_end() only return local error */
+ if (rc != 0)
+ return rc;
+ }
+
+ if ((feats & ~LST_FEATS_MASK) != 0) {
+ CNETERR("Unknown session features %x\n",
+ (feats & ~LST_FEATS_MASK));
+ return -EINVAL;
+ }
+
+ for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+ LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+ lstcon_new_session_id(&console_session.ses_id);
+
+ console_session.ses_key = key;
+ console_session.ses_state = LST_SESSION_ACTIVE;
+ console_session.ses_force = !!force;
+ console_session.ses_features = feats;
+ console_session.ses_feats_updated = 0;
+ console_session.ses_timeout = (timeout <= 0) ?
+ LST_CONSOLE_TIMEOUT : timeout;
+ strcpy(console_session.ses_name, name);
+
+ rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+ if (rc != 0)
+ return rc;
+
+ rc = lstcon_rpc_pinger_start();
+ if (rc != 0) {
+ lstcon_batch_t *bat = NULL;
+
+ lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+ lstcon_batch_destroy(bat);
+
+ return rc;
+ }
+
+ if (copy_to_user(sid_up, &console_session.ses_id,
+ sizeof(lst_sid_t)) == 0)
+ return rc;
+
+ lstcon_session_end();
+
+ return -EFAULT;
+}
+
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+ lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+ lstcon_ndlist_ent_t *entp;
+ lstcon_ndlink_t *ndl;
+ int rc = 0;
+
+ if (console_session.ses_state != LST_SESSION_ACTIVE)
+ return -ESRCH;
+
+ LIBCFS_ALLOC(entp, sizeof(*entp));
+ if (entp == NULL)
+ return -ENOMEM;
+
+ memset(entp, 0, sizeof(*entp));
+
+ list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+ LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+ if (copy_to_user(sid_up, &console_session.ses_id,
+ sizeof(lst_sid_t)) ||
+ copy_to_user(key_up, &console_session.ses_key,
+ sizeof(*key_up)) ||
+ copy_to_user(featp, &console_session.ses_features,
+ sizeof(*featp)) ||
+ copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+ copy_to_user(name_up, console_session.ses_name, len))
+ rc = -EFAULT;
+
+ LIBCFS_FREE(entp, sizeof(*entp));
+
+ return rc;
+}
+
+int
+lstcon_session_end()
+{
+ lstcon_rpc_trans_t *trans;
+ lstcon_group_t *grp;
+ lstcon_batch_t *bat;
+ int rc = 0;
+
+ LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+ rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+ NULL, LST_TRANS_SESEND, NULL,
+ lstcon_sesrpc_condition, &trans);
+ if (rc != 0) {
+ CERROR("Can't create transaction: %d\n", rc);
+ return rc;
+ }
+
+ console_session.ses_shutdown = 1;
+
+ lstcon_rpc_pinger_stop();
+
+ lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+ lstcon_rpc_trans_destroy(trans);
+ /* User can do nothing even rpc failed, so go on */
+
+ /* waiting for orphan rpcs to die */
+ lstcon_rpc_cleanup_wait();
+
+ console_session.ses_id = LST_INVALID_SID;
+ console_session.ses_state = LST_SESSION_NONE;
+ console_session.ses_key = 0;
+ console_session.ses_force = 0;
+ console_session.ses_feats_updated = 0;
+
+ /* destroy all batches */
+ while (!list_empty(&console_session.ses_bat_list)) {
+ bat = list_entry(console_session.ses_bat_list.next,
+ lstcon_batch_t, bat_link);
+
+ lstcon_batch_destroy(bat);
+ }
+
+ /* destroy all groups */
+ while (!list_empty(&console_session.ses_grp_list)) {
+ grp = list_entry(console_session.ses_grp_list.next,
+ lstcon_group_t, grp_link);
+ LASSERT (grp->grp_ref == 1);
+
+ lstcon_group_put(grp);
+ }
+
+ /* all nodes should be released */
+ LASSERT (list_empty(&console_session.ses_ndl_list));
+
+ console_session.ses_shutdown = 0;
+ console_session.ses_expired = 0;
+
+ return rc;
+}
+
+int
+lstcon_session_feats_check(unsigned feats)
+{
+ int rc = 0;
+
+ if ((feats & ~LST_FEATS_MASK) != 0) {
+ CERROR("Can't support these features: %x\n",
+ (feats & ~LST_FEATS_MASK));
+ return -EPROTO;
+ }
+
+ spin_lock(&console_session.ses_rpc_lock);
+
+ if (!console_session.ses_feats_updated) {
+ console_session.ses_feats_updated = 1;
+ console_session.ses_features = feats;
+ }
+
+ if (console_session.ses_features != feats)
+ rc = -EPROTO;
+
+ spin_unlock(&console_session.ses_rpc_lock);
+
+ if (rc != 0) {
+ CERROR("remote features %x do not match with "
+ "session features %x of console\n",
+ feats, console_session.ses_features);
+ }
+
+ return rc;
+}
+
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+ srpc_msg_t *rep = &rpc->srpc_replymsg;
+ srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+ srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+ lstcon_group_t *grp = NULL;
+ lstcon_ndlink_t *ndl;
+ int rc = 0;
+
+ sfw_unpack_message(req);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ jrep->join_sid = console_session.ses_id;
+
+ if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+ jrep->join_status = ESRCH;
+ goto out;
+ }
+
+ if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+ jrep->join_status = EPROTO;
+ goto out;
+ }
+
+ if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+ !lstcon_session_match(jreq->join_sid)) {
+ jrep->join_status = EBUSY;
+ goto out;
+ }
+
+ if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+ rc = lstcon_group_alloc(jreq->join_group, &grp);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ goto out;
+ }
+
+ list_add_tail(&grp->grp_link,
+ &console_session.ses_grp_list);
+ lstcon_group_addref(grp);
+ }
+
+ if (grp->grp_ref > 2) {
+ /* Group in using */
+ jrep->join_status = EBUSY;
+ goto out;
+ }
+
+ rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+ if (rc == 0) {
+ jrep->join_status = EEXIST;
+ goto out;
+ }
+
+ rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+ if (rc != 0) {
+ CERROR("Out of memory\n");
+ goto out;
+ }
+
+ ndl->ndl_node->nd_state = LST_NODE_ACTIVE;
+ ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+ if (grp->grp_userland == 0)
+ grp->grp_userland = 1;
+
+ strcpy(jrep->join_session, console_session.ses_name);
+ jrep->join_timeout = console_session.ses_timeout;
+ jrep->join_status = 0;
+
+out:
+ rep->msg_ses_feats = console_session.ses_features;
+ if (grp != NULL)
+ lstcon_group_put(grp);
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+void lstcon_init_acceptor_service(void)
+{
+ /* initialize selftest console acceptor service table */
+ lstcon_acceptor_service.sv_name = "join session";
+ lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle;
+ lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN;
+ lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/* initialize console */
+int
+lstcon_console_init(void)
+{
+ int i;
+ int rc;
+
+ memset(&console_session, 0, sizeof(lstcon_session_t));
+
+ console_session.ses_id = LST_INVALID_SID;
+ console_session.ses_state = LST_SESSION_NONE;
+ console_session.ses_timeout = 0;
+ console_session.ses_force = 0;
+ console_session.ses_expired = 0;
+ console_session.ses_feats_updated = 0;
+ console_session.ses_features = LST_FEATS_MASK;
+ console_session.ses_laststamp = cfs_time_current_sec();
+
+ mutex_init(&console_session.ses_mutex);
+
+ INIT_LIST_HEAD(&console_session.ses_ndl_list);
+ INIT_LIST_HEAD(&console_session.ses_grp_list);
+ INIT_LIST_HEAD(&console_session.ses_bat_list);
+ INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+ LIBCFS_ALLOC(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+ if (console_session.ses_ndl_hash == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+ INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+
+ /* initialize acceptor service table */
+ lstcon_init_acceptor_service();
+
+ rc = srpc_add_service(&lstcon_acceptor_service);
+ LASSERT (rc != -EBUSY);
+ if (rc != 0) {
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+ return rc;
+ }
+
+ rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+ lstcon_acceptor_service.sv_wi_total);
+ if (rc != 0) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+ if (rc == 0) {
+ lstcon_rpc_module_init();
+ return 0;
+ }
+
+out:
+ srpc_shutdown_service(&lstcon_acceptor_service);
+ srpc_remove_service(&lstcon_acceptor_service);
+
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+ srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+ return rc;
+}
+
+int
+lstcon_console_fini(void)
+{
+ int i;
+
+ libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+ mutex_lock(&console_session.ses_mutex);
+
+ srpc_shutdown_service(&lstcon_acceptor_service);
+ srpc_remove_service(&lstcon_acceptor_service);
+
+ if (console_session.ses_state != LST_SESSION_NONE)
+ lstcon_session_end();
+
+ lstcon_rpc_module_fini();
+
+ mutex_unlock(&console_session.ses_mutex);
+
+ LASSERT (list_empty(&console_session.ses_ndl_list));
+ LASSERT (list_empty(&console_session.ses_grp_list));
+ LASSERT (list_empty(&console_session.ses_bat_list));
+ LASSERT (list_empty(&console_session.ses_trans_list));
+
+ for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+ LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+ }
+
+ LIBCFS_FREE(console_session.ses_ndl_hash,
+ sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+ srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+ return 0;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644
index 000000000000..e61b26687dbb
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.h
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+ lnet_process_id_t nd_id; /* id of the node */
+ int nd_ref; /* reference count */
+ int nd_state; /* state of the node */
+ int nd_timeout; /* session timeout */
+ cfs_time_t nd_stamp; /* timestamp of last replied RPC */
+ struct lstcon_rpc nd_ping; /* ping rpc */
+} lstcon_node_t; /*** node descriptor */
+
+typedef struct {
+ struct list_head ndl_link; /* chain on list */
+ struct list_head ndl_hlink; /* chain on hash */
+ lstcon_node_t *ndl_node; /* pointer to node */
+} lstcon_ndlink_t; /*** node link descriptor */
+
+typedef struct {
+ struct list_head grp_link; /* chain on global group list */
+ int grp_ref; /* reference count */
+ int grp_userland; /* has userland nodes */
+ int grp_nnode; /* # of nodes */
+ char grp_name[LST_NAME_SIZE]; /* group name */
+
+ struct list_head grp_trans_list; /* transaction list */
+ struct list_head grp_ndl_list; /* nodes list */
+ struct list_head grp_ndl_hash[0];/* hash table for nodes */
+} lstcon_group_t; /*** (alias of nodes) group descriptor */
+
+#define LST_BATCH_IDLE 0xB0 /* idle batch */
+#define LST_BATCH_RUNNING 0xB1 /* running batch */
+
+typedef struct lstcon_tsb_hdr {
+ lst_bid_t tsb_id; /* batch ID */
+ int tsb_index; /* test index */
+} lstcon_tsb_hdr_t;
+
+typedef struct {
+ lstcon_tsb_hdr_t bat_hdr; /* test_batch header */
+ struct list_head bat_link; /* chain on session's batches list */
+ int bat_ntest; /* # of test */
+ int bat_state; /* state of the batch */
+ int bat_arg; /* parameter for run|stop, timeout for run, force for stop */
+ char bat_name[LST_NAME_SIZE]; /* name of batch */
+
+ struct list_head bat_test_list; /* list head of tests (lstcon_test_t) */
+ struct list_head bat_trans_list; /* list head of transaction */
+ struct list_head bat_cli_list; /* list head of client nodes (lstcon_node_t) */
+ struct list_head *bat_cli_hash; /* hash table of client nodes */
+ struct list_head bat_srv_list; /* list head of server nodes */
+ struct list_head *bat_srv_hash; /* hash table of server nodes */
+} lstcon_batch_t; /*** (tests ) batch descritptor */
+
+typedef struct lstcon_test {
+ lstcon_tsb_hdr_t tes_hdr; /* test batch header */
+ struct list_head tes_link; /* chain on batch's tests list */
+ lstcon_batch_t *tes_batch; /* pointer to batch */
+
+ int tes_type; /* type of the test, i.e: bulk, ping */
+ int tes_stop_onerr; /* stop on error */
+ int tes_oneside; /* one-sided test */
+ int tes_concur; /* concurrency */
+ int tes_loop; /* loop count */
+ int tes_dist; /* nodes distribution of target group */
+ int tes_span; /* nodes span of target group */
+ int tes_cliidx; /* client index, used for RPC creating */
+
+ struct list_head tes_trans_list; /* transaction list */
+ lstcon_group_t *tes_src_grp; /* group run the test */
+ lstcon_group_t *tes_dst_grp; /* target group */
+
+ int tes_paramlen; /* test parameter length */
+ char tes_param[0]; /* test parameter */
+} lstcon_test_t; /*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */
+#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */
+
+#define LST_SESSION_NONE 0x0 /* no session */
+#define LST_SESSION_ACTIVE 0x1 /* working session */
+
+#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */
+
+typedef struct {
+ struct mutex ses_mutex; /* only 1 thread in session */
+ lst_sid_t ses_id; /* global session id */
+ int ses_key; /* local session key */
+ int ses_state; /* state of session */
+ int ses_timeout; /* timeout in seconds */
+ time_t ses_laststamp; /* last operation stamp (seconds) */
+ /** tests features of the session */
+ unsigned ses_features;
+ /** features are synced with remote test nodes */
+ unsigned ses_feats_updated:1;
+ /** force creating */
+ unsigned ses_force:1;
+ /** session is shutting down */
+ unsigned ses_shutdown:1;
+ /** console is timedout */
+ unsigned ses_expired:1;
+ __u64 ses_id_cookie; /* batch id cookie */
+ char ses_name[LST_NAME_SIZE]; /* session name */
+ lstcon_rpc_trans_t *ses_ping; /* session pinger */
+ stt_timer_t ses_ping_timer; /* timer for pinger */
+ lstcon_trans_stat_t ses_trans_stat; /* transaction stats */
+
+ struct list_head ses_trans_list; /* global list of transaction */
+ struct list_head ses_grp_list; /* global list of groups */
+ struct list_head ses_bat_list; /* global list of batches */
+ struct list_head ses_ndl_list; /* global list of nodes */
+ struct list_head *ses_ndl_hash; /* hash table of nodes */
+
+ spinlock_t ses_rpc_lock; /* serialize */
+ atomic_t ses_rpc_counter;/* # of initialized RPCs */
+ struct list_head ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t; /*** session descriptor */
+
+extern lstcon_session_t console_session;
+
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+ return &console_session.ses_trans_stat;
+}
+
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+ unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+ return &hash[idx];
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+ int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+ lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+ int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+ struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+ struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+ unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+ struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+ int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+ struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+ struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+ int client, int timeout,
+ struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+ int server, int testidx, int *index_p,
+ int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+ struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+ int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+ int dist, int span, char *src_name, char * dst_name,
+ void *param, int paramlen, int *retp,
+ struct list_head *result_up);
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644
index 000000000000..483c78564dae
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/framework.c
@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+static int session_timeout = 100;
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+ "test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+ "rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id) \
+do { \
+ __swab64s(&(id).nid); \
+ __swab32s(&(id).pid); \
+} while (0)
+
+#define sfw_unpack_sid(sid) \
+do { \
+ __swab64s(&(sid).ses_nid); \
+ __swab64s(&(sid).ses_stamp); \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc) \
+do { \
+ __swab32s(&(fc).running_ms); \
+ __swab32s(&(fc).active_batches); \
+ __swab32s(&(fc).zombie_sessions); \
+ __swab32s(&(fc).brw_errors); \
+ __swab32s(&(fc).ping_errors); \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc) \
+do { \
+ __swab32s(&(rc).errors); \
+ __swab32s(&(rc).rpcs_sent); \
+ __swab32s(&(rc).rpcs_rcvd); \
+ __swab32s(&(rc).rpcs_dropped); \
+ __swab32s(&(rc).rpcs_expired); \
+ __swab64s(&(rc).bulk_get); \
+ __swab64s(&(rc).bulk_put); \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc) \
+do { \
+ __swab32s(&(lc).errors); \
+ __swab32s(&(lc).msgs_max); \
+ __swab32s(&(lc).msgs_alloc); \
+ __swab32s(&(lc).send_count); \
+ __swab32s(&(lc).recv_count); \
+ __swab32s(&(lc).drop_count); \
+ __swab32s(&(lc).route_count); \
+ __swab64s(&(lc).send_length); \
+ __swab64s(&(lc).recv_length); \
+ __swab64s(&(lc).drop_length); \
+ __swab64s(&(lc).route_length); \
+} while (0)
+
+#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0)
+
+struct smoketest_framework {
+ struct list_head fw_zombie_rpcs; /* RPCs to be recycled */
+ struct list_head fw_zombie_sessions; /* stopping sessions */
+ struct list_head fw_tests; /* registered test cases */
+ atomic_t fw_nzombies; /* # zombie sessions */
+ spinlock_t fw_lock; /* serialise */
+ sfw_session_t *fw_session; /* _the_ session */
+ int fw_shuttingdown; /* shutdown in progress */
+ srpc_server_rpc_t *fw_active_srpc; /* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+ sfw_test_case_t *tsc;
+
+ LASSERT (id <= SRPC_SERVICE_MAX_ID);
+ LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+ if (tsc->tsc_srv_service->sv_id == id)
+ return tsc;
+ }
+
+ return NULL;
+}
+
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+ sfw_test_case_t *tsc;
+
+ if (sfw_find_test_case(service->sv_id) != NULL) {
+ CERROR ("Failed to register test %s (%d)\n",
+ service->sv_name, service->sv_id);
+ return -EEXIST;
+ }
+
+ LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t));
+ if (tsc == NULL)
+ return -ENOMEM;
+
+ memset(tsc, 0, sizeof(sfw_test_case_t));
+ tsc->tsc_cli_ops = cliops;
+ tsc->tsc_srv_service = service;
+
+ list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+ return 0;
+}
+
+void
+sfw_add_session_timer (void)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ stt_timer_t *timer = &sn->sn_timer;
+
+ LASSERT (!sfw_data.fw_shuttingdown);
+
+ if (sn == NULL || sn->sn_timeout == 0)
+ return;
+
+ LASSERT (!sn->sn_timer_active);
+
+ sn->sn_timer_active = 1;
+ timer->stt_expires = cfs_time_add(sn->sn_timeout,
+ cfs_time_current_sec());
+ stt_add_timer(timer);
+ return;
+}
+
+int
+sfw_del_session_timer (void)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn == NULL || !sn->sn_timer_active)
+ return 0;
+
+ LASSERT (sn->sn_timeout != 0);
+
+ if (stt_del_timer(&sn->sn_timer)) { /* timer defused */
+ sn->sn_timer_active = 0;
+ return 0;
+ }
+
+ return EBUSY; /* racing with sfw_session_expired() */
+}
+
+/* called with sfw_data.fw_lock held */
+static void
+sfw_deactivate_session (void)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ int nactive = 0;
+ sfw_batch_t *tsb;
+ sfw_test_case_t *tsc;
+
+ if (sn == NULL) return;
+
+ LASSERT (!sn->sn_timer_active);
+
+ sfw_data.fw_session = NULL;
+ atomic_inc(&sfw_data.fw_nzombies);
+ list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+ srpc_abort_service(tsc->tsc_srv_service);
+ }
+
+ spin_lock(&sfw_data.fw_lock);
+
+ list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+ if (sfw_batch_active(tsb)) {
+ nactive++;
+ sfw_stop_batch(tsb, 1);
+ }
+ }
+
+ if (nactive != 0)
+ return; /* wait for active batches to stop */
+
+ list_del_init(&sn->sn_list);
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_destroy_session(sn);
+
+ spin_lock(&sfw_data.fw_lock);
+}
+
+
+void
+sfw_session_expired (void *data)
+{
+ sfw_session_t *sn = data;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ LASSERT (sn->sn_timer_active);
+ LASSERT (sn == sfw_data.fw_session);
+
+ CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+ libcfs_nid2str(sn->sn_id.ses_nid),
+ sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+ sn->sn_timer_active = 0;
+ sfw_deactivate_session();
+
+ spin_unlock(&sfw_data.fw_lock);
+}
+
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+ unsigned features, const char *name)
+{
+ stt_timer_t *timer = &sn->sn_timer;
+
+ memset(sn, 0, sizeof(sfw_session_t));
+ INIT_LIST_HEAD(&sn->sn_list);
+ INIT_LIST_HEAD(&sn->sn_batches);
+ atomic_set(&sn->sn_refcount, 1); /* +1 for caller */
+ atomic_set(&sn->sn_brw_errors, 0);
+ atomic_set(&sn->sn_ping_errors, 0);
+ strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+ sn->sn_timer_active = 0;
+ sn->sn_id = sid;
+ sn->sn_features = features;
+ sn->sn_timeout = session_timeout;
+ sn->sn_started = cfs_time_current();
+
+ timer->stt_data = sn;
+ timer->stt_func = sfw_session_expired;
+ INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/* completion handler for incoming framework RPCs */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ int status = rpc->srpc_status;
+
+ CDEBUG (D_NET,
+ "Incoming framework RPC done: "
+ "service %s, peer %s, status %s:%d\n",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state),
+ status);
+
+ if (rpc->srpc_bulk != NULL)
+ sfw_free_pages(rpc);
+ return;
+}
+
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+ LASSERT (rpc->crpc_bulk.bk_niov == 0);
+ LASSERT (list_empty(&rpc->crpc_list));
+ LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+ CDEBUG (D_NET,
+ "Outgoing framework RPC done: "
+ "service %d, peer %s, status %s:%d:%d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(rpc->crpc_wi.swi_state),
+ rpc->crpc_aborted, rpc->crpc_status);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ /* my callers must finish all RPCs before shutting me down */
+ LASSERT(!sfw_data.fw_shuttingdown);
+ list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+ spin_unlock(&sfw_data.fw_lock);
+}
+
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_batch_t *bat;
+
+ LASSERT (sn != NULL);
+
+ list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+ if (bat->bat_id.bat_id == bid.bat_id)
+ return bat;
+ }
+
+ return NULL;
+}
+
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_batch_t *bat;
+
+ LASSERT (sn != NULL);
+
+ bat = sfw_find_batch(bid);
+ if (bat != NULL)
+ return bat;
+
+ LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+ if (bat == NULL)
+ return NULL;
+
+ bat->bat_error = 0;
+ bat->bat_session = sn;
+ bat->bat_id = bid;
+ atomic_set(&bat->bat_nactive, 0);
+ INIT_LIST_HEAD(&bat->bat_tests);
+
+ list_add_tail(&bat->bat_list, &sn->sn_batches);
+ return bat;
+}
+
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ sfw_counters_t *cnt = &reply->str_fw;
+ sfw_batch_t *bat;
+ struct timeval tv;
+
+ reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->str_sid.ses_nid == LNET_NID_ANY) {
+ reply->str_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+ reply->str_status = ESRCH;
+ return 0;
+ }
+
+ lnet_counters_get(&reply->str_lnet);
+ srpc_get_counters(&reply->str_rpc);
+
+ /* send over the msecs since the session was started
+ - with 32 bits to send, this is ~49 days */
+ cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+ sn->sn_started), &tv);
+
+ cnt->running_ms = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+ cnt->brw_errors = atomic_read(&sn->sn_brw_errors);
+ cnt->ping_errors = atomic_read(&sn->sn_ping_errors);
+ cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+ cnt->active_batches = 0;
+ list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+ if (atomic_read(&bat->bat_nactive) > 0)
+ cnt->active_batches++;
+ }
+
+ reply->str_status = 0;
+ return 0;
+}
+
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ srpc_msg_t *msg = container_of(request, srpc_msg_t,
+ msg_body.mksn_reqst);
+ int cplen = 0;
+
+ if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+ reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+ reply->mksn_status = EINVAL;
+ return 0;
+ }
+
+ if (sn != NULL) {
+ reply->mksn_status = 0;
+ reply->mksn_sid = sn->sn_id;
+ reply->mksn_timeout = sn->sn_timeout;
+
+ if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+ atomic_inc(&sn->sn_refcount);
+ return 0;
+ }
+
+ if (!request->mksn_force) {
+ reply->mksn_status = EBUSY;
+ cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+ sizeof(reply->mksn_name));
+ if (cplen >= sizeof(reply->mksn_name))
+ return -E2BIG;
+ return 0;
+ }
+ }
+
+ /* reject the request if it requires unknown features
+ * NB: old version will always accept all features because it's not
+ * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also
+ * harmless because it will return zero feature to console, and it's
+ * console's responsibility to make sure all nodes in a session have
+ * same feature mask. */
+ if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ reply->mksn_status = EPROTO;
+ return 0;
+ }
+
+ /* brand new or create by force */
+ LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+ if (sn == NULL) {
+ CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+ return -ENOMEM;
+ }
+
+ sfw_init_session(sn, request->mksn_sid,
+ msg->msg_ses_feats, &request->mksn_name[0]);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ sfw_deactivate_session();
+ LASSERT(sfw_data.fw_session == NULL);
+ sfw_data.fw_session = sn;
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ reply->mksn_status = 0;
+ reply->mksn_sid = sn->sn_id;
+ reply->mksn_timeout = sn->sn_timeout;
+ return 0;
+}
+
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+ reply->rmsn_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+ reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY;
+ return 0;
+ }
+
+ if (!atomic_dec_and_test(&sn->sn_refcount)) {
+ reply->rmsn_status = 0;
+ return 0;
+ }
+
+ spin_lock(&sfw_data.fw_lock);
+ sfw_deactivate_session();
+ spin_unlock(&sfw_data.fw_lock);
+
+ reply->rmsn_status = 0;
+ reply->rmsn_sid = LST_INVALID_SID;
+ LASSERT(sfw_data.fw_session == NULL);
+ return 0;
+}
+
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn == NULL) {
+ reply->dbg_status = ESRCH;
+ reply->dbg_sid = LST_INVALID_SID;
+ return 0;
+ }
+
+ reply->dbg_status = 0;
+ reply->dbg_sid = sn->sn_id;
+ reply->dbg_timeout = sn->sn_timeout;
+ if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+ >= sizeof(reply->dbg_name))
+ return -E2BIG;
+
+ return 0;
+}
+
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+ sfw_test_unit_t *tsu = rpc->crpc_priv;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+ /* Called with hold of tsi->tsi_lock */
+ LASSERT (list_empty(&rpc->crpc_list));
+ list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+ struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+ struct srpc_service *svc = tsc->tsc_srv_service;
+ int nbuf;
+
+ nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+ return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+ struct sfw_test_case *tsc;
+ struct srpc_service *svc;
+ int nbuf;
+ int rc;
+
+ LASSERT(tsi != NULL);
+ tsc = sfw_find_test_case(tsi->tsi_service);
+ nbuf = sfw_test_buffers(tsi);
+ LASSERT(tsc != NULL);
+ svc = tsc->tsc_srv_service;
+
+ if (tsi->tsi_is_client) {
+ tsi->tsi_ops = tsc->tsc_cli_ops;
+ return 0;
+ }
+
+ rc = srpc_service_add_buffers(svc, nbuf);
+ if (rc != 0) {
+ CWARN("Failed to reserve enough buffers: "
+ "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc);
+ /* NB: this error handler is not strictly correct, because
+ * it may release more buffers than already allocated,
+ * but it doesn't matter because request portal should
+ * be lazy portal and will grow buffers if necessary. */
+ srpc_service_remove_buffers(svc, nbuf);
+ return -ENOMEM;
+ }
+
+ CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+ nbuf * (srpc_serv_is_framework(svc) ?
+ 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+ return 0;
+}
+
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+ struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+ LASSERT(tsc != NULL);
+
+ if (tsi->tsi_is_client)
+ return;
+
+ /* shrink buffers, because request portal is lazy portal
+ * which can grow buffers at runtime so we may leave
+ * some buffers behind, but never mind... */
+ srpc_service_remove_buffers(tsc->tsc_srv_service,
+ sfw_test_buffers(tsi));
+ return;
+}
+
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+ srpc_client_rpc_t *rpc;
+ sfw_test_unit_t *tsu;
+
+ if (!tsi->tsi_is_client) goto clean;
+
+ tsi->tsi_ops->tso_fini(tsi);
+
+ LASSERT (!tsi->tsi_stopping);
+ LASSERT (list_empty(&tsi->tsi_active_rpcs));
+ LASSERT (!sfw_test_active(tsi));
+
+ while (!list_empty(&tsi->tsi_units)) {
+ tsu = list_entry(tsi->tsi_units.next,
+ sfw_test_unit_t, tsu_list);
+ list_del(&tsu->tsu_list);
+ LIBCFS_FREE(tsu, sizeof(*tsu));
+ }
+
+ while (!list_empty(&tsi->tsi_free_rpcs)) {
+ rpc = list_entry(tsi->tsi_free_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ }
+
+clean:
+ sfw_unload_test(tsi);
+ LIBCFS_FREE(tsi, sizeof(*tsi));
+ return;
+}
+
+void
+sfw_destroy_batch (sfw_batch_t *tsb)
+{
+ sfw_test_instance_t *tsi;
+
+ LASSERT (!sfw_batch_active(tsb));
+ LASSERT (list_empty(&tsb->bat_list));
+
+ while (!list_empty(&tsb->bat_tests)) {
+ tsi = list_entry(tsb->bat_tests.next,
+ sfw_test_instance_t, tsi_list);
+ list_del_init(&tsi->tsi_list);
+ sfw_destroy_test_instance(tsi);
+ }
+
+ LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+ return;
+}
+
+void
+sfw_destroy_session (sfw_session_t *sn)
+{
+ sfw_batch_t *batch;
+
+ LASSERT (list_empty(&sn->sn_list));
+ LASSERT (sn != sfw_data.fw_session);
+
+ while (!list_empty(&sn->sn_batches)) {
+ batch = list_entry(sn->sn_batches.next,
+ sfw_batch_t, bat_list);
+ list_del_init(&batch->bat_list);
+ sfw_destroy_batch(batch);
+ }
+
+ LIBCFS_FREE(sn, sizeof(*sn));
+ atomic_dec(&sfw_data.fw_nzombies);
+ return;
+}
+
+void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+ LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+ LASSERT (req->tsr_is_client);
+
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ if (req->tsr_service == SRPC_SERVICE_BRW) {
+ if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+ test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+ __swab32s(&bulk->blk_opc);
+ __swab32s(&bulk->blk_npg);
+ __swab32s(&bulk->blk_flags);
+
+ } else {
+ test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+ __swab16s(&bulk->blk_opc);
+ __swab16s(&bulk->blk_flags);
+ __swab32s(&bulk->blk_offset);
+ __swab32s(&bulk->blk_len);
+ }
+
+ return;
+ }
+
+ if (req->tsr_service == SRPC_SERVICE_PING) {
+ test_ping_req_t *ping = &req->tsr_u.ping;
+
+ __swab32s(&ping->png_size);
+ __swab32s(&ping->png_flags);
+ return;
+ }
+
+ LBUG ();
+ return;
+}
+
+int
+sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+ srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+ srpc_bulk_t *bk = rpc->srpc_bulk;
+ int ndest = req->tsr_ndest;
+ sfw_test_unit_t *tsu;
+ sfw_test_instance_t *tsi;
+ int i;
+ int rc;
+
+ LIBCFS_ALLOC(tsi, sizeof(*tsi));
+ if (tsi == NULL) {
+ CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+ tsb->bat_id.bat_id);
+ return -ENOMEM;
+ }
+
+ memset(tsi, 0, sizeof(*tsi));
+ spin_lock_init(&tsi->tsi_lock);
+ atomic_set(&tsi->tsi_nactive, 0);
+ INIT_LIST_HEAD(&tsi->tsi_units);
+ INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+ INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+ tsi->tsi_stopping = 0;
+ tsi->tsi_batch = tsb;
+ tsi->tsi_loop = req->tsr_loop;
+ tsi->tsi_concur = req->tsr_concur;
+ tsi->tsi_service = req->tsr_service;
+ tsi->tsi_is_client = !!(req->tsr_is_client);
+ tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+ rc = sfw_load_test(tsi);
+ if (rc != 0) {
+ LIBCFS_FREE(tsi, sizeof(*tsi));
+ return rc;
+ }
+
+ LASSERT (!sfw_batch_active(tsb));
+
+ if (!tsi->tsi_is_client) {
+ /* it's test server, just add it to tsb */
+ list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+ return 0;
+ }
+
+ LASSERT (bk != NULL);
+ LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+ LASSERT((unsigned int)bk->bk_len >=
+ sizeof(lnet_process_id_packed_t) * ndest);
+
+ sfw_unpack_addtest_req(msg);
+ memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+ for (i = 0; i < ndest; i++) {
+ lnet_process_id_packed_t *dests;
+ lnet_process_id_packed_t id;
+ int j;
+
+ dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+ LASSERT (dests != NULL); /* my pages are within KVM always */
+ id = dests[i % SFW_ID_PER_PAGE];
+ if (msg->msg_magic != SRPC_MSG_MAGIC)
+ sfw_unpack_id(id);
+
+ for (j = 0; j < tsi->tsi_concur; j++) {
+ LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+ if (tsu == NULL) {
+ rc = -ENOMEM;
+ CERROR ("Can't allocate tsu for %d\n",
+ tsi->tsi_service);
+ goto error;
+ }
+
+ tsu->tsu_dest.nid = id.nid;
+ tsu->tsu_dest.pid = id.pid;
+ tsu->tsu_instance = tsi;
+ tsu->tsu_private = NULL;
+ list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+ }
+ }
+
+ rc = tsi->tsi_ops->tso_init(tsi);
+ if (rc == 0) {
+ list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+ return 0;
+ }
+
+error:
+ LASSERT (rc != 0);
+ sfw_destroy_test_instance(tsi);
+ return rc;
+}
+
+static void
+sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_batch_t *tsb = tsi->tsi_batch;
+ sfw_session_t *sn = tsb->bat_session;
+
+ LASSERT (sfw_test_active(tsi));
+
+ if (!atomic_dec_and_test(&tsi->tsi_nactive))
+ return;
+
+ /* the test instance is done */
+ spin_lock(&tsi->tsi_lock);
+
+ tsi->tsi_stopping = 0;
+
+ spin_unlock(&tsi->tsi_lock);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */
+ sn == sfw_data.fw_session) { /* sn also active */
+ spin_unlock(&sfw_data.fw_lock);
+ return;
+ }
+
+ LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+ list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+ if (sfw_batch_active(tsb)) {
+ spin_unlock(&sfw_data.fw_lock);
+ return;
+ }
+ }
+
+ list_del_init(&sn->sn_list);
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_destroy_session(sn);
+ return;
+}
+
+void
+sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+ sfw_test_unit_t *tsu = rpc->crpc_priv;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ int done = 0;
+
+ tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+ spin_lock(&tsi->tsi_lock);
+
+ LASSERT (sfw_test_active(tsi));
+ LASSERT (!list_empty(&rpc->crpc_list));
+
+ list_del_init(&rpc->crpc_list);
+
+ /* batch is stopping or loop is done or get error */
+ if (tsi->tsi_stopping ||
+ tsu->tsu_loop == 0 ||
+ (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
+ done = 1;
+
+ /* dec ref for poster */
+ srpc_client_rpc_decref(rpc);
+
+ spin_unlock(&tsi->tsi_lock);
+
+ if (!done) {
+ swi_schedule_workitem(&tsu->tsu_worker);
+ return;
+ }
+
+ sfw_test_unit_done(tsu);
+ return;
+}
+
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+ unsigned features, int nblk, int blklen,
+ srpc_client_rpc_t **rpcpp)
+{
+ srpc_client_rpc_t *rpc = NULL;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+ spin_lock(&tsi->tsi_lock);
+
+ LASSERT (sfw_test_active(tsi));
+
+ if (!list_empty(&tsi->tsi_free_rpcs)) {
+ /* pick request from buffer */
+ rpc = list_entry(tsi->tsi_free_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+ list_del_init(&rpc->crpc_list);
+ }
+
+ spin_unlock(&tsi->tsi_lock);
+
+ if (rpc == NULL) {
+ rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+ blklen, sfw_test_rpc_done,
+ sfw_test_rpc_fini, tsu);
+ } else {
+ srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+ blklen, sfw_test_rpc_done,
+ sfw_test_rpc_fini, tsu);
+ }
+
+ if (rpc == NULL) {
+ CERROR("Can't create rpc for test %d\n", tsi->tsi_service);
+ return -ENOMEM;
+ }
+
+ rpc->crpc_reqstmsg.msg_ses_feats = features;
+ *rpcpp = rpc;
+
+ return 0;
+}
+
+int
+sfw_run_test (swi_workitem_t *wi)
+{
+ sfw_test_unit_t *tsu = wi->swi_workitem.wi_data;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ srpc_client_rpc_t *rpc = NULL;
+
+ LASSERT (wi == &tsu->tsu_worker);
+
+ if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+ LASSERT (rpc == NULL);
+ goto test_done;
+ }
+
+ LASSERT (rpc != NULL);
+
+ spin_lock(&tsi->tsi_lock);
+
+ if (tsi->tsi_stopping) {
+ list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+ spin_unlock(&tsi->tsi_lock);
+ goto test_done;
+ }
+
+ if (tsu->tsu_loop > 0)
+ tsu->tsu_loop--;
+
+ list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+ spin_unlock(&tsi->tsi_lock);
+
+ rpc->crpc_timeout = rpc_timeout;
+
+ spin_lock(&rpc->crpc_lock);
+ srpc_post_rpc(rpc);
+ spin_unlock(&rpc->crpc_lock);
+ return 0;
+
+test_done:
+ /*
+ * No one can schedule me now since:
+ * - previous RPC, if any, has done and
+ * - no new RPC is initiated.
+ * - my batch is still active; no one can run it again now.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ swi_exit_workitem(wi);
+ sfw_test_unit_done(tsu);
+ return 1;
+}
+
+int
+sfw_run_batch (sfw_batch_t *tsb)
+{
+ swi_workitem_t *wi;
+ sfw_test_unit_t *tsu;
+ sfw_test_instance_t *tsi;
+
+ if (sfw_batch_active(tsb)) {
+ CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+ tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+ return 0;
+ }
+
+ list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+ if (!tsi->tsi_is_client) /* skip server instances */
+ continue;
+
+ LASSERT (!tsi->tsi_stopping);
+ LASSERT (!sfw_test_active(tsi));
+
+ atomic_inc(&tsb->bat_nactive);
+
+ list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+ atomic_inc(&tsi->tsi_nactive);
+ tsu->tsu_loop = tsi->tsi_loop;
+ wi = &tsu->tsu_worker;
+ swi_init_workitem(wi, tsu, sfw_run_test,
+ lst_sched_test[\
+ lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+ swi_schedule_workitem(wi);
+ }
+ }
+
+ return 0;
+}
+
+int
+sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+ sfw_test_instance_t *tsi;
+ srpc_client_rpc_t *rpc;
+
+ if (!sfw_batch_active(tsb)) {
+ CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+ return 0;
+ }
+
+ list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+ spin_lock(&tsi->tsi_lock);
+
+ if (!tsi->tsi_is_client ||
+ !sfw_test_active(tsi) || tsi->tsi_stopping) {
+ spin_unlock(&tsi->tsi_lock);
+ continue;
+ }
+
+ tsi->tsi_stopping = 1;
+
+ if (!force) {
+ spin_unlock(&tsi->tsi_lock);
+ continue;
+ }
+
+ /* abort launched rpcs in the test */
+ list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+ spin_lock(&rpc->crpc_lock);
+
+ srpc_abort_rpc(rpc, -EINTR);
+
+ spin_unlock(&rpc->crpc_lock);
+ }
+
+ spin_unlock(&tsi->tsi_lock);
+ }
+
+ return 0;
+}
+
+int
+sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+ sfw_test_instance_t *tsi;
+
+ if (testidx < 0)
+ return -EINVAL;
+
+ if (testidx == 0) {
+ reply->bar_active = atomic_read(&tsb->bat_nactive);
+ return 0;
+ }
+
+ list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+ if (testidx-- > 1)
+ continue;
+
+ reply->bar_active = atomic_read(&tsi->tsi_nactive);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+void
+sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+ srpc_free_bulk(rpc->srpc_bulk);
+ rpc->srpc_bulk = NULL;
+}
+
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+ int sink)
+{
+ LASSERT(rpc->srpc_bulk == NULL);
+ LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+ rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+ if (rpc->srpc_bulk == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+int
+sfw_add_test (srpc_server_rpc_t *rpc)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+ srpc_test_reqst_t *request;
+ int rc;
+ sfw_batch_t *bat;
+
+ request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+ reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (request->tsr_loop == 0 ||
+ request->tsr_concur == 0 ||
+ request->tsr_sid.ses_nid == LNET_NID_ANY ||
+ request->tsr_ndest > SFW_MAX_NDESTS ||
+ (request->tsr_is_client && request->tsr_ndest == 0) ||
+ request->tsr_concur > SFW_MAX_CONCUR ||
+ request->tsr_service > SRPC_SERVICE_MAX_ID ||
+ request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+ reply->tsr_status = EINVAL;
+ return 0;
+ }
+
+ if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+ sfw_find_test_case(request->tsr_service) == NULL) {
+ reply->tsr_status = ENOENT;
+ return 0;
+ }
+
+ bat = sfw_bid2batch(request->tsr_bid);
+ if (bat == NULL) {
+ CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+ rpc->srpc_scd->scd_svc->sv_name,
+ libcfs_id2str(rpc->srpc_peer));
+ return -ENOMEM;
+ }
+
+ if (sfw_batch_active(bat)) {
+ reply->tsr_status = EBUSY;
+ return 0;
+ }
+
+ if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+ /* rpc will be resumed later in sfw_bulk_ready */
+ int npg = sfw_id_pages(request->tsr_ndest);
+ int len;
+
+ if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+ len = npg * PAGE_CACHE_SIZE;
+
+ } else {
+ len = sizeof(lnet_process_id_packed_t) *
+ request->tsr_ndest;
+ }
+
+ return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+ }
+
+ rc = sfw_add_test_instance(bat, rpc);
+ CDEBUG (rc == 0 ? D_NET : D_WARNING,
+ "%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+ rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+ request->tsr_is_client ? "client" : "server",
+ request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+ reply->tsr_status = (rc < 0) ? -rc : rc;
+ return 0;
+}
+
+int
+sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+ sfw_session_t *sn = sfw_data.fw_session;
+ int rc = 0;
+ sfw_batch_t *bat;
+
+ reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+ if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+ reply->bar_status = ESRCH;
+ return 0;
+ }
+
+ bat = sfw_find_batch(request->bar_bid);
+ if (bat == NULL) {
+ reply->bar_status = ENOENT;
+ return 0;
+ }
+
+ switch (request->bar_opc) {
+ case SRPC_BATCH_OPC_RUN:
+ rc = sfw_run_batch(bat);
+ break;
+
+ case SRPC_BATCH_OPC_STOP:
+ rc = sfw_stop_batch(bat, request->bar_arg);
+ break;
+
+ case SRPC_BATCH_OPC_QUERY:
+ rc = sfw_query_batch(bat, request->bar_testidx, reply);
+ break;
+
+ default:
+ return -EINVAL; /* drop it */
+ }
+
+ reply->bar_status = (rc < 0) ? -rc : rc;
+ return 0;
+}
+
+int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *reply = &rpc->srpc_replymsg;
+ srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg;
+ unsigned features = LST_FEATS_MASK;
+ int rc = 0;
+
+ LASSERT(sfw_data.fw_active_srpc == NULL);
+ LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (sfw_data.fw_shuttingdown) {
+ spin_unlock(&sfw_data.fw_lock);
+ return -ESHUTDOWN;
+ }
+
+ /* Remove timer to avoid racing with it or expiring active session */
+ if (sfw_del_session_timer() != 0) {
+ CERROR("Dropping RPC (%s) from %s: racing with expiry timer.",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+ spin_unlock(&sfw_data.fw_lock);
+ return -EAGAIN;
+ }
+
+ sfw_data.fw_active_srpc = rpc;
+ spin_unlock(&sfw_data.fw_lock);
+
+ sfw_unpack_message(request);
+ LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+ /* rpc module should have checked this */
+ LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+ if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+ sv->sv_id != SRPC_SERVICE_DEBUG) {
+ sfw_session_t *sn = sfw_data.fw_session;
+
+ if (sn != NULL &&
+ sn->sn_features != request->msg_ses_feats) {
+ CNETERR("Features of framework RPC don't match "
+ "features of current session: %x/%x\n",
+ request->msg_ses_feats, sn->sn_features);
+ reply->msg_body.reply.status = EPROTO;
+ reply->msg_body.reply.sid = sn->sn_id;
+ goto out;
+ }
+
+ } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ /* NB: at this point, old version will ignore features and
+ * create new session anyway, so console should be able
+ * to handle this */
+ reply->msg_body.reply.status = EPROTO;
+ goto out;
+ }
+
+ switch(sv->sv_id) {
+ default:
+ LBUG ();
+ case SRPC_SERVICE_TEST:
+ rc = sfw_add_test(rpc);
+ break;
+
+ case SRPC_SERVICE_BATCH:
+ rc = sfw_control_batch(&request->msg_body.bat_reqst,
+ &reply->msg_body.bat_reply);
+ break;
+
+ case SRPC_SERVICE_QUERY_STAT:
+ rc = sfw_get_stats(&request->msg_body.stat_reqst,
+ &reply->msg_body.stat_reply);
+ break;
+
+ case SRPC_SERVICE_DEBUG:
+ rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+ &reply->msg_body.dbg_reply);
+ break;
+
+ case SRPC_SERVICE_MAKE_SESSION:
+ rc = sfw_make_session(&request->msg_body.mksn_reqst,
+ &reply->msg_body.mksn_reply);
+ break;
+
+ case SRPC_SERVICE_REMOVE_SESSION:
+ rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+ &reply->msg_body.rmsn_reply);
+ break;
+ }
+
+ if (sfw_data.fw_session != NULL)
+ features = sfw_data.fw_session->sn_features;
+ out:
+ reply->msg_ses_feats = features;
+ rpc->srpc_done = sfw_server_rpc_done;
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!sfw_data.fw_shuttingdown)
+ sfw_add_session_timer();
+
+ sfw_data.fw_active_srpc = NULL;
+ spin_unlock(&sfw_data.fw_lock);
+ return rc;
+}
+
+int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ int rc;
+
+ LASSERT(rpc->srpc_bulk != NULL);
+ LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+ LASSERT(sfw_data.fw_active_srpc == NULL);
+ LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (status != 0) {
+ CERROR("Bulk transfer failed for RPC: "
+ "service %s, peer %s, status %d\n",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+ spin_unlock(&sfw_data.fw_lock);
+ return -EIO;
+ }
+
+ if (sfw_data.fw_shuttingdown) {
+ spin_unlock(&sfw_data.fw_lock);
+ return -ESHUTDOWN;
+ }
+
+ if (sfw_del_session_timer() != 0) {
+ CERROR("Dropping RPC (%s) from %s: racing with expiry timer",
+ sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+ spin_unlock(&sfw_data.fw_lock);
+ return -EAGAIN;
+ }
+
+ sfw_data.fw_active_srpc = rpc;
+ spin_unlock(&sfw_data.fw_lock);
+
+ rc = sfw_add_test(rpc);
+
+ spin_lock(&sfw_data.fw_lock);
+
+ if (!sfw_data.fw_shuttingdown)
+ sfw_add_session_timer();
+
+ sfw_data.fw_active_srpc = NULL;
+ spin_unlock(&sfw_data.fw_lock);
+ return rc;
+}
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+ unsigned features, int nbulkiov, int bulklen,
+ void (*done)(srpc_client_rpc_t *), void *priv)
+{
+ srpc_client_rpc_t *rpc = NULL;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ LASSERT (!sfw_data.fw_shuttingdown);
+ LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+ rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+
+ srpc_init_client_rpc(rpc, peer, service, 0, 0,
+ done, sfw_client_rpc_fini, priv);
+ }
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ if (rpc == NULL) {
+ rpc = srpc_create_client_rpc(peer, service,
+ nbulkiov, bulklen, done,
+ nbulkiov != 0 ? NULL :
+ sfw_client_rpc_fini,
+ priv);
+ }
+
+ if (rpc != NULL) /* "session" is concept in framework */
+ rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+ return rpc;
+}
+
+void
+sfw_unpack_message (srpc_msg_t *msg)
+{
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ /* srpc module should guarantee I wouldn't get crap */
+ LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ if (msg->msg_type == SRPC_MSG_STAT_REQST) {
+ srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+ __swab32s(&req->str_type);
+ __swab64s(&req->str_rpyid);
+ sfw_unpack_sid(req->str_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+ srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+ __swab32s(&rep->str_status);
+ sfw_unpack_sid(rep->str_sid);
+ sfw_unpack_fw_counters(rep->str_fw);
+ sfw_unpack_rpc_counters(rep->str_rpc);
+ sfw_unpack_lnet_counters(rep->str_lnet);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+ srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+ __swab64s(&req->mksn_rpyid);
+ __swab32s(&req->mksn_force);
+ sfw_unpack_sid(req->mksn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+ srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+ __swab32s(&rep->mksn_status);
+ __swab32s(&rep->mksn_timeout);
+ sfw_unpack_sid(rep->mksn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+ srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+ __swab64s(&req->rmsn_rpyid);
+ sfw_unpack_sid(req->rmsn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+ srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+ __swab32s(&rep->rmsn_status);
+ sfw_unpack_sid(rep->rmsn_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+ srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+ __swab64s(&req->dbg_rpyid);
+ __swab32s(&req->dbg_flags);
+ sfw_unpack_sid(req->dbg_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+ srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+ __swab32s(&rep->dbg_nbatch);
+ __swab32s(&rep->dbg_timeout);
+ sfw_unpack_sid(rep->dbg_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+ srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+ __swab32s(&req->bar_opc);
+ __swab64s(&req->bar_rpyid);
+ __swab32s(&req->bar_testidx);
+ __swab32s(&req->bar_arg);
+ sfw_unpack_sid(req->bar_sid);
+ __swab64s(&req->bar_bid.bat_id);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+ srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+ __swab32s(&rep->bar_status);
+ sfw_unpack_sid(rep->bar_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_TEST_REQST) {
+ srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+ __swab64s(&req->tsr_rpyid);
+ __swab64s(&req->tsr_bulkid);
+ __swab32s(&req->tsr_loop);
+ __swab32s(&req->tsr_ndest);
+ __swab32s(&req->tsr_concur);
+ __swab32s(&req->tsr_service);
+ sfw_unpack_sid(req->tsr_sid);
+ __swab64s(&req->tsr_bid.bat_id);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+ srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+ __swab32s(&rep->tsr_status);
+ sfw_unpack_sid(rep->tsr_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+ srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+ __swab64s(&req->join_rpyid);
+ sfw_unpack_sid(req->join_sid);
+ return;
+ }
+
+ if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+ srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+ __swab32s(&rep->join_status);
+ __swab32s(&rep->join_timeout);
+ sfw_unpack_sid(rep->join_sid);
+ return;
+ }
+
+ LBUG ();
+ return;
+}
+
+void
+sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+ LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+ LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+ spin_lock(&rpc->crpc_lock);
+ srpc_abort_rpc(rpc, -EINTR);
+ spin_unlock(&rpc->crpc_lock);
+ return;
+}
+
+void
+sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+ spin_lock(&rpc->crpc_lock);
+
+ LASSERT (!rpc->crpc_closed);
+ LASSERT (!rpc->crpc_aborted);
+ LASSERT (list_empty(&rpc->crpc_list));
+ LASSERT (!sfw_data.fw_shuttingdown);
+
+ rpc->crpc_timeout = rpc_timeout;
+ srpc_post_rpc(rpc);
+
+ spin_unlock(&rpc->crpc_lock);
+ return;
+}
+
+static srpc_service_t sfw_services[] =
+{
+ {
+ /* sv_id */ SRPC_SERVICE_DEBUG,
+ /* sv_name */ "debug",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_QUERY_STAT,
+ /* sv_name */ "query stats",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_MAKE_SESSION,
+ /* sv_name */ "make session",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_REMOVE_SESSION,
+ /* sv_name */ "remove session",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_BATCH,
+ /* sv_name */ "batch service",
+ 0
+ },
+ {
+ /* sv_id */ SRPC_SERVICE_TEST,
+ /* sv_name */ "test service",
+ 0
+ },
+ {
+ /* sv_id */ 0,
+ /* sv_name */ NULL,
+ 0
+ }
+};
+
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+
+int
+sfw_startup (void)
+{
+ int i;
+ int rc;
+ int error;
+ srpc_service_t *sv;
+ sfw_test_case_t *tsc;
+
+
+ if (session_timeout < 0) {
+ CERROR ("Session timeout must be non-negative: %d\n",
+ session_timeout);
+ return -EINVAL;
+ }
+
+ if (rpc_timeout < 0) {
+ CERROR ("RPC timeout must be non-negative: %d\n",
+ rpc_timeout);
+ return -EINVAL;
+ }
+
+ if (session_timeout == 0)
+ CWARN ("Zero session_timeout specified "
+ "- test sessions never expire.\n");
+
+ if (rpc_timeout == 0)
+ CWARN ("Zero rpc_timeout specified "
+ "- test RPC never expire.\n");
+
+ memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+ sfw_data.fw_session = NULL;
+ sfw_data.fw_active_srpc = NULL;
+ spin_lock_init(&sfw_data.fw_lock);
+ atomic_set(&sfw_data.fw_nzombies, 0);
+ INIT_LIST_HEAD(&sfw_data.fw_tests);
+ INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+ INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+ brw_init_test_client();
+ brw_init_test_service();
+ rc = sfw_register_test(&brw_test_service, &brw_test_client);
+ LASSERT (rc == 0);
+
+ ping_init_test_client();
+ ping_init_test_service();
+ rc = sfw_register_test(&ping_test_service, &ping_test_client);
+ LASSERT (rc == 0);
+
+ error = 0;
+ list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+ sv = tsc->tsc_srv_service;
+
+ rc = srpc_add_service(sv);
+ LASSERT (rc != -EBUSY);
+ if (rc != 0) {
+ CWARN ("Failed to add %s service: %d\n",
+ sv->sv_name, rc);
+ error = rc;
+ }
+ }
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL) break;
+
+ sv->sv_bulk_ready = NULL;
+ sv->sv_handler = sfw_handle_server_rpc;
+ sv->sv_wi_total = SFW_FRWK_WI_MAX;
+ if (sv->sv_id == SRPC_SERVICE_TEST)
+ sv->sv_bulk_ready = sfw_bulk_ready;
+
+ rc = srpc_add_service(sv);
+ LASSERT (rc != -EBUSY);
+ if (rc != 0) {
+ CWARN ("Failed to add %s service: %d\n",
+ sv->sv_name, rc);
+ error = rc;
+ }
+
+ /* about to sfw_shutdown, no need to add buffer */
+ if (error) continue;
+
+ rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+ if (rc != 0) {
+ CWARN("Failed to reserve enough buffers: "
+ "service %s, %d needed: %d\n",
+ sv->sv_name, sv->sv_wi_total, rc);
+ error = -ENOMEM;
+ }
+ }
+
+ if (error != 0)
+ sfw_shutdown();
+ return error;
+}
+
+void
+sfw_shutdown (void)
+{
+ srpc_service_t *sv;
+ sfw_test_case_t *tsc;
+ int i;
+
+ spin_lock(&sfw_data.fw_lock);
+
+ sfw_data.fw_shuttingdown = 1;
+ lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+ "waiting for active RPC to finish.\n");
+
+ if (sfw_del_session_timer() != 0)
+ lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+ "waiting for session timer to explode.\n");
+
+ sfw_deactivate_session();
+ lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+ sfw_data.fw_lock,
+ "waiting for %d zombie sessions to die.\n",
+ atomic_read(&sfw_data.fw_nzombies));
+
+ spin_unlock(&sfw_data.fw_lock);
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL)
+ break;
+
+ srpc_shutdown_service(sv);
+ srpc_remove_service(sv);
+ }
+
+ list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+ sv = tsc->tsc_srv_service;
+ srpc_shutdown_service(sv);
+ srpc_remove_service(sv);
+ }
+
+ while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+ srpc_client_rpc_t *rpc;
+
+ rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+ srpc_client_rpc_t, crpc_list);
+ list_del(&rpc->crpc_list);
+
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ }
+
+ for (i = 0; ; i++) {
+ sv = &sfw_services[i];
+ if (sv->sv_name == NULL)
+ break;
+
+ srpc_wait_service_shutdown(sv);
+ }
+
+ while (!list_empty(&sfw_data.fw_tests)) {
+ tsc = list_entry(sfw_data.fw_tests.next,
+ sfw_test_case_t, tsc_list);
+
+ srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+ list_del(&tsc->tsc_list);
+ LIBCFS_FREE(tsc, sizeof(*tsc));
+ }
+
+ return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644
index 000000000000..5257e5630a0e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/module.c
@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {
+ LST_INIT_NONE = 0,
+ LST_INIT_WI_SERIAL,
+ LST_INIT_WI_TEST,
+ LST_INIT_RPC,
+ LST_INIT_FW,
+ LST_INIT_CONSOLE
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;
+
+struct cfs_wi_sched *lst_sched_serial;
+struct cfs_wi_sched **lst_sched_test;
+
+void
+lnet_selftest_fini(void)
+{
+ int i;
+
+ switch (lst_init_step) {
+ case LST_INIT_CONSOLE:
+ lstcon_console_fini();
+ case LST_INIT_FW:
+ sfw_shutdown();
+ case LST_INIT_RPC:
+ srpc_shutdown();
+ case LST_INIT_WI_TEST:
+ for (i = 0;
+ i < cfs_cpt_number(lnet_cpt_table()); i++) {
+ if (lst_sched_test[i] == NULL)
+ continue;
+ cfs_wi_sched_destroy(lst_sched_test[i]);
+ }
+ LIBCFS_FREE(lst_sched_test,
+ sizeof(lst_sched_test[0]) *
+ cfs_cpt_number(lnet_cpt_table()));
+ lst_sched_test = NULL;
+
+ case LST_INIT_WI_SERIAL:
+ cfs_wi_sched_destroy(lst_sched_serial);
+ lst_sched_serial = NULL;
+ case LST_INIT_NONE:
+ break;
+ default:
+ LBUG();
+ }
+ return;
+}
+
+void
+lnet_selftest_structure_assertion(void)
+{
+ CLASSERT(sizeof(srpc_msg_t) == 160);
+ CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+ CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+ CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+ CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+ CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
+
+int
+lnet_selftest_init(void)
+{
+ int nscheds;
+ int rc;
+ int i;
+
+ rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+ 1, &lst_sched_serial);
+ if (rc != 0) {
+ CERROR("Failed to create serial WI scheduler for LST\n");
+ return rc;
+ }
+ lst_init_step = LST_INIT_WI_SERIAL;
+
+ nscheds = cfs_cpt_number(lnet_cpt_table());
+ LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+ if (lst_sched_test == NULL)
+ goto error;
+
+ lst_init_step = LST_INIT_WI_TEST;
+ for (i = 0; i < nscheds; i++) {
+ int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+ /* reserve at least one CPU for LND */
+ nthrs = max(nthrs - 1, 1);
+ rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+ nthrs, &lst_sched_test[i]);
+ if (rc != 0) {
+ CERROR("Failed to create CPT affinity WI scheduler "
+ "%d for LST\n", i);
+ goto error;
+ }
+ }
+
+ rc = srpc_startup();
+ if (rc != 0) {
+ CERROR("LST can't startup rpc\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_RPC;
+
+ rc = sfw_startup();
+ if (rc != 0) {
+ CERROR("LST can't startup framework\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_FW;
+
+ rc = lstcon_console_init();
+ if (rc != 0) {
+ CERROR("LST can't startup console\n");
+ goto error;
+ }
+ lst_init_step = LST_INIT_CONSOLE;
+ return 0;
+error:
+ lnet_selftest_fini();
+ return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);
diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644
index 000000000000..f0f919482b56
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/ping_test.c
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC 0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+typedef struct {
+ spinlock_t pnd_lock; /* serialize */
+ int pnd_counter; /* sequence counter */
+} lst_ping_data_t;
+
+static lst_ping_data_t lst_ping_data;
+
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+ LASSERT(tsi->tsi_is_client);
+ LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ spin_lock_init(&lst_ping_data.pnd_lock);
+ lst_ping_data.pnd_counter = 0;
+
+ return 0;
+}
+
+static void
+ping_client_fini (sfw_test_instance_t *tsi)
+{
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ int errors;
+
+ LASSERT (sn != NULL);
+ LASSERT (tsi->tsi_is_client);
+
+ errors = atomic_read(&sn->sn_ping_errors);
+ if (errors)
+ CWARN ("%d pings have failed.\n", errors);
+ else
+ CDEBUG (D_NET, "Ping test finished OK.\n");
+}
+
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+ lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+ srpc_ping_reqst_t *req;
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ struct timeval tv;
+ int rc;
+
+ LASSERT(sn != NULL);
+ LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+ rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+ if (rc != 0)
+ return rc;
+
+ req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+
+ req->pnr_magic = LST_PING_TEST_MAGIC;
+
+ spin_lock(&lst_ping_data.pnd_lock);
+ req->pnr_seq = lst_ping_data.pnd_counter++;
+ spin_unlock(&lst_ping_data.pnd_lock);
+
+ cfs_fs_timeval(&tv);
+ req->pnr_time_sec = tv.tv_sec;
+ req->pnr_time_usec = tv.tv_usec;
+
+ return rc;
+}
+
+static void
+ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+ sfw_test_instance_t *tsi = tsu->tsu_instance;
+ sfw_session_t *sn = tsi->tsi_batch->bat_session;
+ srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+ srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+ struct timeval tv;
+
+ LASSERT (sn != NULL);
+
+ if (rpc->crpc_status != 0) {
+ if (!tsi->tsi_stopping) /* rpc could have been aborted */
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR ("Unable to ping %s (%d): %d\n",
+ libcfs_id2str(rpc->crpc_dest),
+ reqst->pnr_seq, rpc->crpc_status);
+ return;
+ }
+
+ if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+ __swab32s(&reply->pnr_seq);
+ __swab32s(&reply->pnr_magic);
+ __swab32s(&reply->pnr_status);
+ }
+
+ if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+ rpc->crpc_status = -EBADMSG;
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR ("Bad magic %u from %s, %u expected.\n",
+ reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+ LST_PING_TEST_MAGIC);
+ return;
+ }
+
+ if (reply->pnr_seq != reqst->pnr_seq) {
+ rpc->crpc_status = -EBADMSG;
+ atomic_inc(&sn->sn_ping_errors);
+ CERROR ("Bad seq %u from %s, %u expected.\n",
+ reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+ reqst->pnr_seq);
+ return;
+ }
+
+ cfs_fs_timeval(&tv);
+ CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+ (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+ + (tv.tv_usec - reqst->pnr_time_usec)));
+ return;
+}
+
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+ struct srpc_service *sv = rpc->srpc_scd->scd_svc;
+ srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+ srpc_msg_t *replymsg = &rpc->srpc_replymsg;
+ srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst;
+ srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply;
+
+ LASSERT (sv->sv_id == SRPC_SERVICE_PING);
+
+ if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+ LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+ __swab32s(&req->pnr_seq);
+ __swab32s(&req->pnr_magic);
+ __swab64s(&req->pnr_time_sec);
+ __swab64s(&req->pnr_time_usec);
+ }
+ LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+ if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+ CERROR ("Unexpect magic %08x from %s\n",
+ req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+ return -EINVAL;
+ }
+
+ rep->pnr_seq = req->pnr_seq;
+ rep->pnr_magic = LST_PING_TEST_MAGIC;
+
+ if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+ replymsg->msg_ses_feats = LST_FEATS_MASK;
+ rep->pnr_status = EPROTO;
+ return 0;
+ }
+
+ replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+ CDEBUG(D_NET, "Get ping %d from %s\n",
+ req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+ return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void)
+{
+ ping_test_client.tso_init = ping_client_init;
+ ping_test_client.tso_fini = ping_client_fini;
+ ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+ ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;
+void ping_init_test_service(void)
+{
+ ping_test_service.sv_id = SRPC_SERVICE_PING;
+ ping_test_service.sv_name = "ping_test";
+ ping_test_service.sv_handler = ping_server_handle;
+ ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644
index 000000000000..91d83f4b746e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.c
@@ -0,0 +1,1665 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+ SRPC_STATE_NONE,
+ SRPC_STATE_NI_INIT,
+ SRPC_STATE_EQ_INIT,
+ SRPC_STATE_RUNNING,
+ SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+struct smoketest_rpc {
+ spinlock_t rpc_glock; /* global lock */
+ srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1];
+ lnet_handle_eq_t rpc_lnet_eq; /* _the_ LNet event queue */
+ srpc_state_t rpc_state;
+ srpc_counters_t rpc_counters;
+ __u64 rpc_matchbits; /* matchbits counter */
+} srpc_data;
+
+static inline int
+srpc_serv_portal(int svc_id)
+{
+ return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+ SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+void srpc_get_counters (srpc_counters_t *cnt)
+{
+ spin_lock(&srpc_data.rpc_glock);
+ *cnt = srpc_data.rpc_counters;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters (const srpc_counters_t *cnt)
+{
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters = *cnt;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+ nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+ LASSERT(nob > 0);
+ LASSERT(i >= 0 && i < bk->bk_niov);
+
+ bk->bk_iovs[i].kiov_offset = 0;
+ bk->bk_iovs[i].kiov_page = pg;
+ bk->bk_iovs[i].kiov_len = nob;
+ return nob;
+}
+
+void
+srpc_free_bulk (srpc_bulk_t *bk)
+{
+ int i;
+ struct page *pg;
+
+ LASSERT (bk != NULL);
+
+ for (i = 0; i < bk->bk_niov; i++) {
+ pg = bk->bk_iovs[i].kiov_page;
+ if (pg == NULL) break;
+
+ __free_page(pg);
+ }
+
+ LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+ return;
+}
+
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+ srpc_bulk_t *bk;
+ struct page **pages;
+ int i;
+
+ LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+ LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+ offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+ if (bk == NULL) {
+ CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+ return NULL;
+ }
+
+ memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+ bk->bk_sink = sink;
+ bk->bk_len = bulk_len;
+ bk->bk_niov = bulk_npg;
+ UNUSED(pages);
+
+ for (i = 0; i < bulk_npg; i++) {
+ struct page *pg;
+ int nob;
+
+ pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_IOFS);
+ if (pg == NULL) {
+ CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+ srpc_free_bulk(bk);
+ return NULL;
+ }
+
+ nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+ bulk_len -= nob;
+ }
+
+ return bk;
+}
+
+static inline __u64
+srpc_next_id (void)
+{
+ __u64 id;
+
+ spin_lock(&srpc_data.rpc_glock);
+ id = srpc_data.rpc_matchbits++;
+ spin_unlock(&srpc_data.rpc_glock);
+ return id;
+}
+
+void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+ struct srpc_service_cd *scd,
+ struct srpc_buffer *buffer)
+{
+ memset(rpc, 0, sizeof(*rpc));
+ swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+ srpc_serv_is_framework(scd->scd_svc) ?
+ lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+ rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+ rpc->srpc_scd = scd;
+ rpc->srpc_reqstbuf = buffer;
+ rpc->srpc_peer = buffer->buf_peer;
+ rpc->srpc_self = buffer->buf_self;
+ LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ struct srpc_buffer *buf;
+ struct list_head *q;
+ int i;
+
+ if (svc->sv_cpt_data == NULL)
+ return;
+
+ cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+ while (1) {
+ if (!list_empty(&scd->scd_buf_posted))
+ q = &scd->scd_buf_posted;
+ else if (!list_empty(&scd->scd_buf_blocked))
+ q = &scd->scd_buf_blocked;
+ else
+ break;
+
+ while (!list_empty(q)) {
+ buf = list_entry(q->next,
+ struct srpc_buffer,
+ buf_list);
+ list_del(&buf->buf_list);
+ LIBCFS_FREE(buf, sizeof(*buf));
+ }
+ }
+
+ LASSERT(list_empty(&scd->scd_rpc_active));
+
+ while (!list_empty(&scd->scd_rpc_free)) {
+ rpc = list_entry(scd->scd_rpc_free.next,
+ struct srpc_server_rpc,
+ srpc_list);
+ list_del(&rpc->srpc_list);
+ LIBCFS_FREE(rpc, sizeof(*rpc));
+ }
+ }
+
+ cfs_percpt_free(svc->sv_cpt_data);
+ svc->sv_cpt_data = NULL;
+}
+
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+ int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+ return srpc_serv_is_framework(svc) ?
+ max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int nrpcs;
+ int i;
+ int j;
+
+ svc->sv_shuttingdown = 0;
+
+ svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(struct srpc_service_cd));
+ if (svc->sv_cpt_data == NULL)
+ return -ENOMEM;
+
+ svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+ 1 : cfs_cpt_number(lnet_cpt_table());
+ nrpcs = srpc_service_nrpcs(svc);
+
+ cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+ scd->scd_cpt = i;
+ scd->scd_svc = svc;
+ spin_lock_init(&scd->scd_lock);
+ INIT_LIST_HEAD(&scd->scd_rpc_free);
+ INIT_LIST_HEAD(&scd->scd_rpc_active);
+ INIT_LIST_HEAD(&scd->scd_buf_posted);
+ INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+ scd->scd_ev.ev_data = scd;
+ scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+ /* NB: don't use lst_sched_serial for adding buffer,
+ * see details in srpc_service_add_buffers() */
+ swi_init_workitem(&scd->scd_buf_wi, scd,
+ srpc_add_buffer, lst_sched_test[i]);
+
+ if (i != 0 && srpc_serv_is_framework(svc)) {
+ /* NB: framework service only needs srpc_service_cd for
+ * one partition, but we allocate for all to make
+ * it easier to implement, it will waste a little
+ * memory but nobody should care about this */
+ continue;
+ }
+
+ for (j = 0; j < nrpcs; j++) {
+ LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+ i, sizeof(*rpc));
+ if (rpc == NULL) {
+ srpc_service_fini(svc);
+ return -ENOMEM;
+ }
+ list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+ }
+ }
+
+ return 0;
+}
+
+int
+srpc_add_service(struct srpc_service *sv)
+{
+ int id = sv->sv_id;
+
+ LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+ if (srpc_service_init(sv) != 0)
+ return -ENOMEM;
+
+ spin_lock(&srpc_data.rpc_glock);
+
+ LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+ if (srpc_data.rpc_services[id] != NULL) {
+ spin_unlock(&srpc_data.rpc_glock);
+ goto failed;
+ }
+
+ srpc_data.rpc_services[id] = sv;
+ spin_unlock(&srpc_data.rpc_glock);
+
+ CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+ return 0;
+
+ failed:
+ srpc_service_fini(sv);
+ return -EBUSY;
+}
+
+int
+srpc_remove_service (srpc_service_t *sv)
+{
+ int id = sv->sv_id;
+
+ spin_lock(&srpc_data.rpc_glock);
+
+ if (srpc_data.rpc_services[id] != sv) {
+ spin_unlock(&srpc_data.rpc_glock);
+ return -ENOENT;
+ }
+
+ srpc_data.rpc_services[id] = NULL;
+ spin_unlock(&srpc_data.rpc_glock);
+ return 0;
+}
+
+int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+ int len, int options, lnet_process_id_t peer,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ int rc;
+ lnet_md_t md;
+ lnet_handle_me_t meh;
+
+ rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+ local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+ if (rc != 0) {
+ CERROR ("LNetMEAttach failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ md.threshold = 1;
+ md.user_ptr = ev;
+ md.start = buf;
+ md.length = len;
+ md.options = options;
+ md.eq_handle = srpc_data.rpc_lnet_eq;
+
+ rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+ if (rc != 0) {
+ CERROR ("LNetMDAttach failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+
+ rc = LNetMEUnlink(meh);
+ LASSERT (rc == 0);
+ return -ENOMEM;
+ }
+
+ CDEBUG (D_NET,
+ "Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+ libcfs_id2str(peer), portal, matchbits);
+ return 0;
+}
+
+int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+ int options, lnet_process_id_t peer, lnet_nid_t self,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ int rc;
+ lnet_md_t md;
+
+ md.user_ptr = ev;
+ md.start = buf;
+ md.length = len;
+ md.eq_handle = srpc_data.rpc_lnet_eq;
+ md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+ md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+ rc = LNetMDBind(md, LNET_UNLINK, mdh);
+ if (rc != 0) {
+ CERROR ("LNetMDBind failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+ return -ENOMEM;
+ }
+
+ /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+ * they're only meaningful for MDs attached to an ME (i.e. passive
+ * buffers... */
+ if ((options & LNET_MD_OP_PUT) != 0) {
+ rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+ portal, matchbits, 0, 0);
+ } else {
+ LASSERT ((options & LNET_MD_OP_GET) != 0);
+
+ rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+ }
+
+ if (rc != 0) {
+ CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n",
+ ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+ libcfs_id2str(peer), portal, matchbits, rc);
+
+ /* The forthcoming unlink event will complete this operation
+ * with failure, so fall through and return success here.
+ */
+ rc = LNetMDUnlink(*mdh);
+ LASSERT (rc == 0);
+ } else {
+ CDEBUG (D_NET,
+ "Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+ libcfs_id2str(peer), portal, matchbits);
+ }
+ return 0;
+}
+
+int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+ int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ return srpc_post_active_rdma(srpc_serv_portal(service), service,
+ buf, len, LNET_MD_OP_PUT, peer,
+ LNET_NID_ANY, mdh, ev);
+}
+
+int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+ lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+ lnet_process_id_t any = {0};
+
+ any.nid = LNET_NID_ANY;
+ any.pid = LNET_PID_ANY;
+
+ return srpc_post_passive_rdma(srpc_serv_portal(service),
+ local, service, buf, len,
+ LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+ struct srpc_service *sv = scd->scd_svc;
+ struct srpc_msg *msg = &buf->buf_msg;
+ int rc;
+
+ LNetInvalidateHandle(&buf->buf_mdh);
+ list_add(&buf->buf_list, &scd->scd_buf_posted);
+ scd->scd_buf_nposted++;
+ spin_unlock(&scd->scd_lock);
+
+ rc = srpc_post_passive_rqtbuf(sv->sv_id,
+ !srpc_serv_is_framework(sv),
+ msg, sizeof(*msg), &buf->buf_mdh,
+ &scd->scd_ev);
+
+ /* At this point, a RPC (new or delayed) may have arrived in
+ * msg and its event handler has been called. So we must add
+ * buf to scd_buf_posted _before_ dropping scd_lock */
+
+ spin_lock(&scd->scd_lock);
+
+ if (rc == 0) {
+ if (!sv->sv_shuttingdown)
+ return 0;
+
+ spin_unlock(&scd->scd_lock);
+ /* srpc_shutdown_service might have tried to unlink me
+ * when my buf_mdh was still invalid */
+ LNetMDUnlink(buf->buf_mdh);
+ spin_lock(&scd->scd_lock);
+ return 0;
+ }
+
+ scd->scd_buf_nposted--;
+ if (sv->sv_shuttingdown)
+ return rc; /* don't allow to change scd_buf_posted */
+
+ list_del(&buf->buf_list);
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+ struct srpc_service_cd *scd = wi->swi_workitem.wi_data;
+ struct srpc_buffer *buf;
+ int rc = 0;
+
+ /* it's called by workitem scheduler threads, these threads
+ * should have been set CPT affinity, so buffers will be posted
+ * on CPT local list of Portal */
+ spin_lock(&scd->scd_lock);
+
+ while (scd->scd_buf_adjust > 0 &&
+ !scd->scd_svc->sv_shuttingdown) {
+ scd->scd_buf_adjust--; /* consume it */
+ scd->scd_buf_posting++;
+
+ spin_unlock(&scd->scd_lock);
+
+ LIBCFS_ALLOC(buf, sizeof(*buf));
+ if (buf == NULL) {
+ CERROR("Failed to add new buf to service: %s\n",
+ scd->scd_svc->sv_name);
+ spin_lock(&scd->scd_lock);
+ rc = -ENOMEM;
+ break;
+ }
+
+ spin_lock(&scd->scd_lock);
+ if (scd->scd_svc->sv_shuttingdown) {
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+
+ spin_lock(&scd->scd_lock);
+ rc = -ESHUTDOWN;
+ break;
+ }
+
+ rc = srpc_service_post_buffer(scd, buf);
+ if (rc != 0)
+ break; /* buf has been freed inside */
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ scd->scd_buf_total++;
+ scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+ }
+
+ if (rc != 0) {
+ scd->scd_buf_err_stamp = cfs_time_current_sec();
+ scd->scd_buf_err = rc;
+
+ LASSERT(scd->scd_buf_posting > 0);
+ scd->scd_buf_posting--;
+ }
+
+ spin_unlock(&scd->scd_lock);
+ return 0;
+}
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int rc = 0;
+ int i;
+
+ LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ scd->scd_buf_err = 0;
+ scd->scd_buf_err_stamp = 0;
+ scd->scd_buf_posting = 0;
+ scd->scd_buf_adjust = nbuffer;
+ /* start to post buffers */
+ swi_schedule_workitem(&scd->scd_buf_wi);
+ spin_unlock(&scd->scd_lock);
+
+ /* framework service only post buffer for one partition */
+ if (srpc_serv_is_framework(sv))
+ break;
+ }
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ /*
+ * NB: srpc_service_add_buffers() can be called inside
+ * thread context of lst_sched_serial, and we don't normally
+ * allow to sleep inside thread context of WI scheduler
+ * because it will block current scheduler thread from doing
+ * anything else, even worse, it could deadlock if it's
+ * waiting on result from another WI of the same scheduler.
+ * However, it's safe at here because scd_buf_wi is scheduled
+ * by thread in a different WI scheduler (lst_sched_test),
+ * so we don't have any risk of deadlock, though this could
+ * block all WIs pending on lst_sched_serial for a moment
+ * which is not good but not fatal.
+ */
+ lst_wait_until(scd->scd_buf_err != 0 ||
+ (scd->scd_buf_adjust == 0 &&
+ scd->scd_buf_posting == 0),
+ scd->scd_lock, "waiting for adding buffer\n");
+
+ if (scd->scd_buf_err != 0 && rc == 0)
+ rc = scd->scd_buf_err;
+
+ spin_unlock(&scd->scd_lock);
+ }
+
+ return rc;
+}
+
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+ struct srpc_service_cd *scd;
+ int num;
+ int i;
+
+ LASSERT(!sv->sv_shuttingdown);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ num = scd->scd_buf_total + scd->scd_buf_posting;
+ scd->scd_buf_adjust -= min(nbuffer, num);
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+ if (!swi_deschedule_workitem(&scd->scd_buf_wi))
+ return 0;
+
+ if (scd->scd_buf_nposted > 0) {
+ CDEBUG(D_NET, "waiting for %d posted buffers to unlink",
+ scd->scd_buf_nposted);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ if (list_empty(&scd->scd_rpc_active)) {
+ spin_unlock(&scd->scd_lock);
+ continue;
+ }
+
+ rpc = list_entry(scd->scd_rpc_active.next,
+ struct srpc_server_rpc, srpc_list);
+ CNETERR("Active RPC %p on shutdown: sv %s, peer %s, "
+ "wi %s scheduled %d running %d, "
+ "ev fired %d type %d status %d lnet %d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state),
+ rpc->srpc_wi.swi_workitem.wi_scheduled,
+ rpc->srpc_wi.swi_workitem.wi_running,
+ rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+ rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+ spin_unlock(&scd->scd_lock);
+ return 0;
+ }
+
+ /* no lock needed from now on */
+ srpc_service_fini(sv);
+ return 1;
+}
+
+/* called with sv->sv_lock held */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+ if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+ if (srpc_service_post_buffer(scd, buf) != 0) {
+ CWARN("Failed to post %s buffer\n",
+ scd->scd_svc->sv_name);
+ }
+ return;
+ }
+
+ /* service is shutting down, or we want to recycle some buffers */
+ scd->scd_buf_total--;
+
+ if (scd->scd_buf_adjust < 0) {
+ scd->scd_buf_adjust++;
+ if (scd->scd_buf_adjust < 0 &&
+ scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+ CDEBUG(D_INFO,
+ "Try to recyle %d buffers but nothing left\n",
+ scd->scd_buf_adjust);
+ scd->scd_buf_adjust = 0;
+ }
+ }
+
+ spin_unlock(&scd->scd_lock);
+ LIBCFS_FREE(buf, sizeof(*buf));
+ spin_lock(&scd->scd_lock);
+}
+
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ int i;
+
+ CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+ sv->sv_id, sv->sv_name);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ /* schedule in-flight RPCs to notice the abort, NB:
+ * racing with incoming RPCs; complete fix should make test
+ * RPCs carry session ID in its headers */
+ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+ rpc->srpc_aborted = 1;
+ swi_schedule_workitem(&rpc->srpc_wi);
+ }
+
+ spin_unlock(&scd->scd_lock);
+ }
+}
+
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+ struct srpc_service_cd *scd;
+ struct srpc_server_rpc *rpc;
+ srpc_buffer_t *buf;
+ int i;
+
+ CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+ sv->sv_id, sv->sv_name);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+ spin_lock(&scd->scd_lock);
+
+ sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+ spin_unlock(&scd->scd_lock);
+
+ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+ spin_lock(&scd->scd_lock);
+
+ /* schedule in-flight RPCs to notice the shutdown */
+ list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+ swi_schedule_workitem(&rpc->srpc_wi);
+
+ spin_unlock(&scd->scd_lock);
+
+ /* OK to traverse scd_buf_posted without lock, since no one
+ * touches scd_buf_posted now */
+ list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+ LNetMDUnlink(buf->buf_mdh);
+ }
+}
+
+int
+srpc_send_request (srpc_client_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->crpc_reqstev;
+ int rc;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REQUEST_SENT;
+
+ rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+ &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+ &rpc->crpc_reqstmdh, ev);
+ if (rc != 0) {
+ LASSERT (rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+int
+srpc_prepare_reply (srpc_client_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->crpc_replyev;
+ __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+ int rc;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REPLY_RCVD;
+
+ *id = srpc_next_id();
+
+ rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+ &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+ LNET_MD_OP_PUT, rpc->crpc_dest,
+ &rpc->crpc_replymdh, ev);
+ if (rc != 0) {
+ LASSERT (rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+int
+srpc_prepare_bulk (srpc_client_rpc_t *rpc)
+{
+ srpc_bulk_t *bk = &rpc->crpc_bulk;
+ srpc_event_t *ev = &rpc->crpc_bulkev;
+ __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+ int rc;
+ int opt;
+
+ LASSERT (bk->bk_niov <= LNET_MAX_IOV);
+
+ if (bk->bk_niov == 0) return 0; /* nothing to do */
+
+ opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET;
+ opt |= LNET_MD_KIOV;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_BULK_REQ_RCVD;
+
+ *id = srpc_next_id();
+
+ rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+ &bk->bk_iovs[0], bk->bk_niov, opt,
+ rpc->crpc_dest, &bk->bk_mdh, ev);
+ if (rc != 0) {
+ LASSERT (rc == -ENOMEM);
+ ev->ev_fired = 1; /* no more event expected */
+ }
+ return rc;
+}
+
+int
+srpc_do_bulk (srpc_server_rpc_t *rpc)
+{
+ srpc_event_t *ev = &rpc->srpc_ev;
+ srpc_bulk_t *bk = rpc->srpc_bulk;
+ __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+ int rc;
+ int opt;
+
+ LASSERT (bk != NULL);
+
+ opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+ opt |= LNET_MD_KIOV;
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+ rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+ &bk->bk_iovs[0], bk->bk_niov, opt,
+ rpc->srpc_peer, rpc->srpc_self,
+ &bk->bk_mdh, ev);
+ if (rc != 0)
+ ev->ev_fired = 1; /* no more event expected */
+ return rc;
+}
+
+/* only called from srpc_handle_rpc */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ srpc_buffer_t *buffer;
+
+ LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+ rpc->srpc_status = status;
+
+ CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+ "Server RPC %p done: service %s, peer %s, status %s:%d\n",
+ rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+ swi_state2str(rpc->srpc_wi.swi_state), status);
+
+ if (status != 0) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_dropped++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+
+ if (rpc->srpc_done != NULL)
+ (*rpc->srpc_done) (rpc);
+ LASSERT(rpc->srpc_bulk == NULL);
+
+ spin_lock(&scd->scd_lock);
+
+ if (rpc->srpc_reqstbuf != NULL) {
+ /* NB might drop sv_lock in srpc_service_recycle_buffer, but
+ * sv won't go away for scd_rpc_active must not be empty */
+ srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+ rpc->srpc_reqstbuf = NULL;
+ }
+
+ list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+ /*
+ * No one can schedule me now since:
+ * - I'm not on scd_rpc_active.
+ * - all LNet events have been fired.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ LASSERT(rpc->srpc_ev.ev_fired);
+ swi_exit_workitem(&rpc->srpc_wi);
+
+ if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+ buffer = list_entry(scd->scd_buf_blocked.next,
+ srpc_buffer_t, buf_list);
+ list_del(&buffer->buf_list);
+
+ srpc_init_server_rpc(rpc, scd, buffer);
+ list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+ swi_schedule_workitem(&rpc->srpc_wi);
+ } else {
+ list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+ }
+
+ spin_unlock(&scd->scd_lock);
+ return;
+}
+
+/* handles an incoming RPC */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+ struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data;
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ srpc_event_t *ev = &rpc->srpc_ev;
+ int rc = 0;
+
+ LASSERT(wi == &rpc->srpc_wi);
+
+ spin_lock(&scd->scd_lock);
+
+ if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+ spin_unlock(&scd->scd_lock);
+
+ if (rpc->srpc_bulk != NULL)
+ LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+ LNetMDUnlink(rpc->srpc_replymdh);
+
+ if (ev->ev_fired) { /* no more event, OK to finish */
+ srpc_server_rpc_done(rpc, -ESHUTDOWN);
+ return 1;
+ }
+ return 0;
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ switch (wi->swi_state) {
+ default:
+ LBUG ();
+ case SWI_STATE_NEWBORN: {
+ srpc_msg_t *msg;
+ srpc_generic_reply_t *reply;
+
+ msg = &rpc->srpc_reqstbuf->buf_msg;
+ reply = &rpc->srpc_replymsg.msg_body.reply;
+
+ if (msg->msg_magic == 0) {
+ /* moaned already in srpc_lnet_ev_handler */
+ srpc_server_rpc_done(rpc, EBADMSG);
+ return 1;
+ }
+
+ srpc_unpack_msg_hdr(msg);
+ if (msg->msg_version != SRPC_MSG_VERSION) {
+ CWARN("Version mismatch: %u, %u expected, from %s\n",
+ msg->msg_version, SRPC_MSG_VERSION,
+ libcfs_id2str(rpc->srpc_peer));
+ reply->status = EPROTO;
+ /* drop through and send reply */
+ } else {
+ reply->status = 0;
+ rc = (*sv->sv_handler)(rpc);
+ LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+ if (rc != 0) {
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+ }
+ }
+
+ wi->swi_state = SWI_STATE_BULK_STARTED;
+
+ if (rpc->srpc_bulk != NULL) {
+ rc = srpc_do_bulk(rpc);
+ if (rc == 0)
+ return 0; /* wait for bulk */
+
+ LASSERT (ev->ev_fired);
+ ev->ev_status = rc;
+ }
+ }
+ case SWI_STATE_BULK_STARTED:
+ LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+ if (rpc->srpc_bulk != NULL) {
+ rc = ev->ev_status;
+
+ if (sv->sv_bulk_ready != NULL)
+ rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+ if (rc != 0) {
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+ }
+ }
+
+ wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+ rc = srpc_send_reply(rpc);
+ if (rc == 0)
+ return 0; /* wait for reply */
+ srpc_server_rpc_done(rpc, rc);
+ return 1;
+
+ case SWI_STATE_REPLY_SUBMITTED:
+ if (!ev->ev_fired) {
+ CERROR("RPC %p: bulk %p, service %d\n",
+ rpc, rpc->srpc_bulk, sv->sv_id);
+ CERROR("Event: status %d, type %d, lnet %d\n",
+ ev->ev_status, ev->ev_type, ev->ev_lnet);
+ LASSERT (ev->ev_fired);
+ }
+
+ wi->swi_state = SWI_STATE_DONE;
+ srpc_server_rpc_done(rpc, ev->ev_status);
+ return 1;
+ }
+
+ return 0;
+}
+
+void
+srpc_client_rpc_expired (void *data)
+{
+ srpc_client_rpc_t *rpc = data;
+
+ CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ rpc->crpc_timeout);
+
+ spin_lock(&rpc->crpc_lock);
+
+ rpc->crpc_timeout = 0;
+ srpc_abort_rpc(rpc, -ETIMEDOUT);
+
+ spin_unlock(&rpc->crpc_lock);
+
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_expired++;
+ spin_unlock(&srpc_data.rpc_glock);
+}
+
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+ stt_timer_t *timer = &rpc->crpc_timer;
+
+ if (rpc->crpc_timeout == 0) return;
+
+ INIT_LIST_HEAD(&timer->stt_list);
+ timer->stt_data = rpc;
+ timer->stt_func = srpc_client_rpc_expired;
+ timer->stt_expires = cfs_time_add(rpc->crpc_timeout,
+ cfs_time_current_sec());
+ stt_add_timer(timer);
+ return;
+}
+
+/*
+ * Called with rpc->crpc_lock held.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+ /* timer not planted or already exploded */
+ if (rpc->crpc_timeout == 0)
+ return;
+
+ /* timer sucessfully defused */
+ if (stt_del_timer(&rpc->crpc_timer))
+ return;
+
+ /* timer detonated, wait for it to explode */
+ while (rpc->crpc_timeout != 0) {
+ spin_unlock(&rpc->crpc_lock);
+
+ schedule();
+
+ spin_lock(&rpc->crpc_lock);
+ }
+}
+
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+ swi_workitem_t *wi = &rpc->crpc_wi;
+
+ LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+ spin_lock(&rpc->crpc_lock);
+
+ rpc->crpc_closed = 1;
+ if (rpc->crpc_status == 0)
+ rpc->crpc_status = status;
+
+ srpc_del_client_rpc_timer(rpc);
+
+ CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+ "Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+ /*
+ * No one can schedule me now since:
+ * - RPC timer has been defused.
+ * - all LNet events have been fired.
+ * - crpc_closed has been set, preventing srpc_abort_rpc from
+ * scheduling me.
+ * Cancel pending schedules and prevent future schedule attempts:
+ */
+ LASSERT (!srpc_event_pending(rpc));
+ swi_exit_workitem(wi);
+
+ spin_unlock(&rpc->crpc_lock);
+
+ (*rpc->crpc_done)(rpc);
+ return;
+}
+
+/* sends an outgoing RPC */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+ int rc = 0;
+ srpc_client_rpc_t *rpc;
+ srpc_msg_t *reply;
+ int do_bulk;
+
+ LASSERT(wi != NULL);
+
+ rpc = wi->swi_workitem.wi_data;
+
+ LASSERT (rpc != NULL);
+ LASSERT (wi == &rpc->crpc_wi);
+
+ reply = &rpc->crpc_replymsg;
+ do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+ spin_lock(&rpc->crpc_lock);
+
+ if (rpc->crpc_aborted) {
+ spin_unlock(&rpc->crpc_lock);
+ goto abort;
+ }
+
+ spin_unlock(&rpc->crpc_lock);
+
+ switch (wi->swi_state) {
+ default:
+ LBUG ();
+ case SWI_STATE_NEWBORN:
+ LASSERT (!srpc_event_pending(rpc));
+
+ rc = srpc_prepare_reply(rpc);
+ if (rc != 0) {
+ srpc_client_rpc_done(rpc, rc);
+ return 1;
+ }
+
+ rc = srpc_prepare_bulk(rpc);
+ if (rc != 0) break;
+
+ wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+ rc = srpc_send_request(rpc);
+ break;
+
+ case SWI_STATE_REQUEST_SUBMITTED:
+ /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+ * order; however, they're processed in a strict order:
+ * rqt, rpy, and bulk. */
+ if (!rpc->crpc_reqstev.ev_fired) break;
+
+ rc = rpc->crpc_reqstev.ev_status;
+ if (rc != 0) break;
+
+ wi->swi_state = SWI_STATE_REQUEST_SENT;
+ /* perhaps more events, fall thru */
+ case SWI_STATE_REQUEST_SENT: {
+ srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+ if (!rpc->crpc_replyev.ev_fired) break;
+
+ rc = rpc->crpc_replyev.ev_status;
+ if (rc != 0) break;
+
+ srpc_unpack_msg_hdr(reply);
+ if (reply->msg_type != type ||
+ (reply->msg_magic != SRPC_MSG_MAGIC &&
+ reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+ CWARN ("Bad message from %s: type %u (%d expected),"
+ " magic %u (%d expected).\n",
+ libcfs_id2str(rpc->crpc_dest),
+ reply->msg_type, type,
+ reply->msg_magic, SRPC_MSG_MAGIC);
+ rc = -EBADMSG;
+ break;
+ }
+
+ if (do_bulk && reply->msg_body.reply.status != 0) {
+ CWARN ("Remote error %d at %s, unlink bulk buffer in "
+ "case peer didn't initiate bulk transfer\n",
+ reply->msg_body.reply.status,
+ libcfs_id2str(rpc->crpc_dest));
+ LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+ }
+
+ wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+ }
+ case SWI_STATE_REPLY_RECEIVED:
+ if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+ rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+ /* Bulk buffer was unlinked due to remote error. Clear error
+ * since reply buffer still contains valid data.
+ * NB rpc->crpc_done shouldn't look into bulk data in case of
+ * remote error. */
+ if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+ rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+ rc = 0;
+
+ wi->swi_state = SWI_STATE_DONE;
+ srpc_client_rpc_done(rpc, rc);
+ return 1;
+ }
+
+ if (rc != 0) {
+ spin_lock(&rpc->crpc_lock);
+ srpc_abort_rpc(rpc, rc);
+ spin_unlock(&rpc->crpc_lock);
+ }
+
+abort:
+ if (rpc->crpc_aborted) {
+ LNetMDUnlink(rpc->crpc_reqstmdh);
+ LNetMDUnlink(rpc->crpc_replymdh);
+ LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+ if (!srpc_event_pending(rpc)) {
+ srpc_client_rpc_done(rpc, -EINTR);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+ int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+ srpc_client_rpc_t *rpc;
+
+ LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+ crpc_bulk.bk_iovs[nbulkiov]));
+ if (rpc == NULL)
+ return NULL;
+
+ srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+ bulklen, rpc_done, rpc_fini, priv);
+ return rpc;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+ LASSERT (why != 0);
+
+ if (rpc->crpc_aborted || /* already aborted */
+ rpc->crpc_closed) /* callback imminent */
+ return;
+
+ CDEBUG (D_NET,
+ "Aborting RPC: service %d, peer %s, state %s, why %d\n",
+ rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+ swi_state2str(rpc->crpc_wi.swi_state), why);
+
+ rpc->crpc_aborted = 1;
+ rpc->crpc_status = why;
+ swi_schedule_workitem(&rpc->crpc_wi);
+ return;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+ LASSERT (!rpc->crpc_aborted);
+ LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+ CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+ libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+ rpc->crpc_timeout);
+
+ srpc_add_client_rpc_timer(rpc);
+ swi_schedule_workitem(&rpc->crpc_wi);
+ return;
+}
+
+
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+ srpc_event_t *ev = &rpc->srpc_ev;
+ struct srpc_msg *msg = &rpc->srpc_replymsg;
+ struct srpc_buffer *buffer = rpc->srpc_reqstbuf;
+ struct srpc_service_cd *scd = rpc->srpc_scd;
+ struct srpc_service *sv = scd->scd_svc;
+ __u64 rpyid;
+ int rc;
+
+ LASSERT(buffer != NULL);
+ rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+
+ spin_lock(&scd->scd_lock);
+
+ if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+ /* Repost buffer before replying since test client
+ * might send me another RPC once it gets the reply */
+ if (srpc_service_post_buffer(scd, buffer) != 0)
+ CWARN("Failed to repost %s buffer\n", sv->sv_name);
+ rpc->srpc_reqstbuf = NULL;
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ ev->ev_fired = 0;
+ ev->ev_data = rpc;
+ ev->ev_type = SRPC_REPLY_SENT;
+
+ msg->msg_magic = SRPC_MSG_MAGIC;
+ msg->msg_version = SRPC_MSG_VERSION;
+ msg->msg_type = srpc_service2reply(sv->sv_id);
+
+ rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+ sizeof(*msg), LNET_MD_OP_PUT,
+ rpc->srpc_peer, rpc->srpc_self,
+ &rpc->srpc_replymdh, ev);
+ if (rc != 0)
+ ev->ev_fired = 1; /* no more event expected */
+ return rc;
+}
+
+/* when in kernel always called with LNET_LOCK() held, and in thread context */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+ struct srpc_service_cd *scd;
+ srpc_event_t *rpcev = ev->md.user_ptr;
+ srpc_client_rpc_t *crpc;
+ srpc_server_rpc_t *srpc;
+ srpc_buffer_t *buffer;
+ srpc_service_t *sv;
+ srpc_msg_t *msg;
+ srpc_msg_type_t type;
+
+ LASSERT (!in_interrupt());
+
+ if (ev->status != 0) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.errors++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+
+ rpcev->ev_lnet = ev->type;
+
+ switch (rpcev->ev_type) {
+ default:
+ CERROR("Unknown event: status %d, type %d, lnet %d\n",
+ rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+ LBUG ();
+ case SRPC_REQUEST_SENT:
+ if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_sent++;
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+ case SRPC_REPLY_RCVD:
+ case SRPC_BULK_REQ_RCVD:
+ crpc = rpcev->ev_data;
+
+ if (rpcev != &crpc->crpc_reqstev &&
+ rpcev != &crpc->crpc_replyev &&
+ rpcev != &crpc->crpc_bulkev) {
+ CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+ rpcev, crpc, &crpc->crpc_reqstev,
+ &crpc->crpc_replyev, &crpc->crpc_bulkev);
+ CERROR("Bad event: status %d, type %d, lnet %d\n",
+ rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+ LBUG ();
+ }
+
+ spin_lock(&crpc->crpc_lock);
+
+ LASSERT(rpcev->ev_fired == 0);
+ rpcev->ev_fired = 1;
+ rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+ -EINTR : ev->status;
+ swi_schedule_workitem(&crpc->crpc_wi);
+
+ spin_unlock(&crpc->crpc_lock);
+ break;
+
+ case SRPC_REQUEST_RCVD:
+ scd = rpcev->ev_data;
+ sv = scd->scd_svc;
+
+ LASSERT(rpcev == &scd->scd_ev);
+
+ spin_lock(&scd->scd_lock);
+
+ LASSERT (ev->unlinked);
+ LASSERT (ev->type == LNET_EVENT_PUT ||
+ ev->type == LNET_EVENT_UNLINK);
+ LASSERT (ev->type != LNET_EVENT_UNLINK ||
+ sv->sv_shuttingdown);
+
+ buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+ buffer->buf_peer = ev->initiator;
+ buffer->buf_self = ev->target.nid;
+
+ LASSERT(scd->scd_buf_nposted > 0);
+ scd->scd_buf_nposted--;
+
+ if (sv->sv_shuttingdown) {
+ /* Leave buffer on scd->scd_buf_nposted since
+ * srpc_finish_service needs to traverse it. */
+ spin_unlock(&scd->scd_lock);
+ break;
+ }
+
+ if (scd->scd_buf_err_stamp != 0 &&
+ scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+ /* re-enable adding buffer */
+ scd->scd_buf_err_stamp = 0;
+ scd->scd_buf_err = 0;
+ }
+
+ if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+ scd->scd_buf_adjust == 0 &&
+ scd->scd_buf_nposted < scd->scd_buf_low) {
+ scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+ SFW_TEST_WI_MIN);
+ swi_schedule_workitem(&scd->scd_buf_wi);
+ }
+
+ list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+ msg = &buffer->buf_msg;
+ type = srpc_service2request(sv->sv_id);
+
+ if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+ (msg->msg_type != type &&
+ msg->msg_type != __swab32(type)) ||
+ (msg->msg_magic != SRPC_MSG_MAGIC &&
+ msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+ CERROR ("Dropping RPC (%s) from %s: "
+ "status %d mlength %d type %u magic %u.\n",
+ sv->sv_name, libcfs_id2str(ev->initiator),
+ ev->status, ev->mlength,
+ msg->msg_type, msg->msg_magic);
+
+ /* NB can't call srpc_service_recycle_buffer here since
+ * it may call LNetM[DE]Attach. The invalid magic tells
+ * srpc_handle_rpc to drop this RPC */
+ msg->msg_magic = 0;
+ }
+
+ if (!list_empty(&scd->scd_rpc_free)) {
+ srpc = list_entry(scd->scd_rpc_free.next,
+ struct srpc_server_rpc,
+ srpc_list);
+ list_del(&srpc->srpc_list);
+
+ srpc_init_server_rpc(srpc, scd, buffer);
+ list_add_tail(&srpc->srpc_list,
+ &scd->scd_rpc_active);
+ swi_schedule_workitem(&srpc->srpc_wi);
+ } else {
+ list_add_tail(&buffer->buf_list,
+ &scd->scd_buf_blocked);
+ }
+
+ spin_unlock(&scd->scd_lock);
+
+ spin_lock(&srpc_data.rpc_glock);
+ srpc_data.rpc_counters.rpcs_rcvd++;
+ spin_unlock(&srpc_data.rpc_glock);
+ break;
+
+ case SRPC_BULK_GET_RPLD:
+ LASSERT (ev->type == LNET_EVENT_SEND ||
+ ev->type == LNET_EVENT_REPLY ||
+ ev->type == LNET_EVENT_UNLINK);
+
+ if (!ev->unlinked)
+ break; /* wait for final event */
+
+ case SRPC_BULK_PUT_SENT:
+ if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+ spin_lock(&srpc_data.rpc_glock);
+
+ if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+ srpc_data.rpc_counters.bulk_get += ev->mlength;
+ else
+ srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+ spin_unlock(&srpc_data.rpc_glock);
+ }
+ case SRPC_REPLY_SENT:
+ srpc = rpcev->ev_data;
+ scd = srpc->srpc_scd;
+
+ LASSERT(rpcev == &srpc->srpc_ev);
+
+ spin_lock(&scd->scd_lock);
+
+ rpcev->ev_fired = 1;
+ rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+ -EINTR : ev->status;
+ swi_schedule_workitem(&srpc->srpc_wi);
+
+ spin_unlock(&scd->scd_lock);
+ break;
+ }
+}
+
+
+int
+srpc_startup (void)
+{
+ int rc;
+
+ memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+ spin_lock_init(&srpc_data.rpc_glock);
+
+ /* 1 second pause to avoid timestamp reuse */
+ cfs_pause(cfs_time_seconds(1));
+ srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
+
+ srpc_data.rpc_state = SRPC_STATE_NONE;
+
+ rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+ if (rc < 0) {
+ CERROR ("LNetNIInit() has failed: %d\n", rc);
+ return rc;
+ }
+
+ srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+ LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+ rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+ if (rc != 0) {
+ CERROR("LNetEQAlloc() has failed: %d\n", rc);
+ goto bail;
+ }
+
+ rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+ LASSERT(rc == 0);
+ rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+ LASSERT(rc == 0);
+
+ srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+ rc = stt_startup();
+
+bail:
+ if (rc != 0)
+ srpc_shutdown();
+ else
+ srpc_data.rpc_state = SRPC_STATE_RUNNING;
+
+ return rc;
+}
+
+void
+srpc_shutdown (void)
+{
+ int i;
+ int rc;
+ int state;
+
+ state = srpc_data.rpc_state;
+ srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+ switch (state) {
+ default:
+ LBUG ();
+ case SRPC_STATE_RUNNING:
+ spin_lock(&srpc_data.rpc_glock);
+
+ for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+ srpc_service_t *sv = srpc_data.rpc_services[i];
+
+ LASSERTF (sv == NULL,
+ "service not empty: id %d, name %s\n",
+ i, sv->sv_name);
+ }
+
+ spin_unlock(&srpc_data.rpc_glock);
+
+ stt_shutdown();
+
+ case SRPC_STATE_EQ_INIT:
+ rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+ rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+ LASSERT (rc == 0);
+ rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+ LASSERT (rc == 0); /* the EQ should have no user by now */
+
+ case SRPC_STATE_NI_INIT:
+ LNetNIFini();
+ }
+
+ return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644
index 000000000000..b905d49a351f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.h
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ */
+typedef enum {
+ SRPC_MSG_MKSN_REQST = 0,
+ SRPC_MSG_MKSN_REPLY = 1,
+ SRPC_MSG_RMSN_REQST = 2,
+ SRPC_MSG_RMSN_REPLY = 3,
+ SRPC_MSG_BATCH_REQST = 4,
+ SRPC_MSG_BATCH_REPLY = 5,
+ SRPC_MSG_STAT_REQST = 6,
+ SRPC_MSG_STAT_REPLY = 7,
+ SRPC_MSG_TEST_REQST = 8,
+ SRPC_MSG_TEST_REPLY = 9,
+ SRPC_MSG_DEBUG_REQST = 10,
+ SRPC_MSG_DEBUG_REPLY = 11,
+ SRPC_MSG_BRW_REQST = 12,
+ SRPC_MSG_BRW_REPLY = 13,
+ SRPC_MSG_PING_REQST = 14,
+ SRPC_MSG_PING_REPLY = 15,
+ SRPC_MSG_JOIN_REQST = 16,
+ SRPC_MSG_JOIN_REPLY = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+typedef struct {
+ __u64 rpyid; /* reply buffer matchbits */
+ __u64 bulkid; /* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+ __u32 status;
+ lst_sid_t sid;
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+ __u64 mksn_rpyid; /* reply buffer matchbits */
+ lst_sid_t mksn_sid; /* session id */
+ __u32 mksn_force; /* use brute force */
+ char mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */
+
+typedef struct {
+ __u32 mksn_status; /* session status */
+ lst_sid_t mksn_sid; /* session id */
+ __u32 mksn_timeout; /* session timeout */
+ char mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+ __u64 rmsn_rpyid; /* reply buffer matchbits */
+ lst_sid_t rmsn_sid; /* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+ __u32 rmsn_status;
+ lst_sid_t rmsn_sid; /* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+ __u64 join_rpyid; /* reply buffer matchbits */
+ lst_sid_t join_sid; /* session id to join */
+ char join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+ __u32 join_status; /* returned status */
+ lst_sid_t join_sid; /* session id */
+ __u32 join_timeout; /* # seconds' inactivity to expire */
+ char join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+ __u64 dbg_rpyid; /* reply buffer matchbits */
+ lst_sid_t dbg_sid; /* session id */
+ __u32 dbg_flags; /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+ __u32 dbg_status; /* returned code */
+ lst_sid_t dbg_sid; /* session id */
+ __u32 dbg_timeout; /* session timeout */
+ __u32 dbg_nbatch; /* # of batches in the node */
+ char dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN 1
+#define SRPC_BATCH_OPC_STOP 2
+#define SRPC_BATCH_OPC_QUERY 3
+
+typedef struct {
+ __u64 bar_rpyid; /* reply buffer matchbits */
+ lst_sid_t bar_sid; /* session id */
+ lst_bid_t bar_bid; /* batch id */
+ __u32 bar_opc; /* create/start/stop batch */
+ __u32 bar_testidx; /* index of test */
+ __u32 bar_arg; /* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+ __u32 bar_status; /* status of request */
+ lst_sid_t bar_sid; /* session id */
+ __u32 bar_active; /* # of active tests in batch/test */
+ __u32 bar_time; /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+ __u64 str_rpyid; /* reply buffer matchbits */
+ lst_sid_t str_sid; /* session id */
+ __u32 str_type; /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+ __u32 str_status;
+ lst_sid_t str_sid;
+ sfw_counters_t str_fw;
+ srpc_counters_t str_rpc;
+ lnet_counters_t str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+ __u32 blk_opc; /* bulk operation code */
+ __u32 blk_npg; /* # of pages */
+ __u32 blk_flags; /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+ /** bulk operation code */
+ __u16 blk_opc;
+ /** data check flags */
+ __u16 blk_flags;
+ /** data length */
+ __u32 blk_len;
+ /** reserved: offset */
+ __u32 blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+ __u32 png_size; /* size of ping message */
+ __u32 png_flags; /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+ __u64 tsr_rpyid; /* reply buffer matchbits */
+ __u64 tsr_bulkid; /* bulk buffer matchbits */
+ lst_sid_t tsr_sid; /* session id */
+ lst_bid_t tsr_bid; /* batch id */
+ __u32 tsr_service; /* test type: bulk|ping|... */
+ /* test client loop count or # server buffers needed */
+ __u32 tsr_loop;
+ __u32 tsr_concur; /* concurrency of test */
+ __u8 tsr_is_client; /* is test client or not */
+ __u8 tsr_stop_onerr; /* stop on error */
+ __u32 tsr_ndest; /* # of dest nodes */
+
+ union {
+ test_ping_req_t ping;
+ test_bulk_req_t bulk_v0;
+ test_bulk_req_v1_t bulk_v1;
+ } tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+ __u32 tsr_status; /* returned code */
+ lst_sid_t tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+ __u64 pnr_rpyid;
+ __u32 pnr_magic;
+ __u32 pnr_seq;
+ __u64 pnr_time_sec;
+ __u64 pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+ __u32 pnr_status;
+ __u32 pnr_magic;
+ __u32 pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+ __u64 brw_rpyid; /* reply buffer matchbits */
+ __u64 brw_bulkid; /* bulk buffer matchbits */
+ __u32 brw_rw; /* read or write */
+ __u32 brw_len; /* bulk data len */
+ __u32 brw_flags; /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+ __u32 brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC 0xeeb0f00d
+#define SRPC_MSG_VERSION 1
+
+typedef struct srpc_msg {
+ /** magic number */
+ __u32 msg_magic;
+ /** message version number */
+ __u32 msg_version;
+ /** type of message body: srpc_msg_type_t */
+ __u32 msg_type;
+ __u32 msg_reserved0;
+ __u32 msg_reserved1;
+ /** test session features */
+ __u32 msg_ses_feats;
+ union {
+ srpc_generic_reqst_t reqst;
+ srpc_generic_reply_t reply;
+
+ srpc_mksn_reqst_t mksn_reqst;
+ srpc_mksn_reply_t mksn_reply;
+ srpc_rmsn_reqst_t rmsn_reqst;
+ srpc_rmsn_reply_t rmsn_reply;
+ srpc_debug_reqst_t dbg_reqst;
+ srpc_debug_reply_t dbg_reply;
+ srpc_batch_reqst_t bat_reqst;
+ srpc_batch_reply_t bat_reply;
+ srpc_stat_reqst_t stat_reqst;
+ srpc_stat_reply_t stat_reply;
+ srpc_test_reqst_t tes_reqst;
+ srpc_test_reply_t tes_reply;
+ srpc_join_reqst_t join_reqst;
+ srpc_join_reply_t join_reply;
+
+ srpc_ping_reqst_t ping_reqst;
+ srpc_ping_reply_t ping_reply;
+ srpc_brw_reqst_t brw_reqst;
+ srpc_brw_reply_t brw_reply;
+ } msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+ if (msg->msg_magic == SRPC_MSG_MAGIC)
+ return; /* no flipping needed */
+
+ /* We do not swap the magic number here as it is needed to
+ determine whether the body needs to be swapped. */
+ /* __swab32s(&msg->msg_magic); */
+ __swab32s(&msg->msg_type);
+ __swab32s(&msg->msg_version);
+ __swab32s(&msg->msg_ses_feats);
+ __swab32s(&msg->msg_reserved0);
+ __swab32s(&msg->msg_reserved1);
+}
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644
index 000000000000..8053b0563ff3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/selftest.h
@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN 0
+#define SWI_STATE_REPLY_SUBMITTED 1
+#define SWI_STATE_REPLY_SENT 2
+#define SWI_STATE_REQUEST_SUBMITTED 3
+#define SWI_STATE_REQUEST_SENT 4
+#define SWI_STATE_REPLY_RECEIVED 5
+#define SWI_STATE_BULK_STARTED 6
+#define SWI_STATE_DONE 10
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG 0
+#define SRPC_SERVICE_MAKE_SESSION 1
+#define SRPC_SERVICE_REMOVE_SESSION 2
+#define SRPC_SERVICE_BATCH 3
+#define SRPC_SERVICE_TEST 4
+#define SRPC_SERVICE_QUERY_STAT 5
+#define SRPC_SERVICE_JOIN 6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW 11
+#define SRPC_SERVICE_PING 12
+#define SRPC_SERVICE_MAX_ID 12
+
+#define SRPC_REQUEST_PORTAL 50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL 51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL 52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+ switch (service) {
+ default:
+ LBUG ();
+ case SRPC_SERVICE_DEBUG:
+ return SRPC_MSG_DEBUG_REQST;
+
+ case SRPC_SERVICE_MAKE_SESSION:
+ return SRPC_MSG_MKSN_REQST;
+
+ case SRPC_SERVICE_REMOVE_SESSION:
+ return SRPC_MSG_RMSN_REQST;
+
+ case SRPC_SERVICE_BATCH:
+ return SRPC_MSG_BATCH_REQST;
+
+ case SRPC_SERVICE_TEST:
+ return SRPC_MSG_TEST_REQST;
+
+ case SRPC_SERVICE_QUERY_STAT:
+ return SRPC_MSG_STAT_REQST;
+
+ case SRPC_SERVICE_BRW:
+ return SRPC_MSG_BRW_REQST;
+
+ case SRPC_SERVICE_PING:
+ return SRPC_MSG_PING_REQST;
+
+ case SRPC_SERVICE_JOIN:
+ return SRPC_MSG_JOIN_REQST;
+ }
+}
+
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+ return srpc_service2request(service) + 1;
+}
+
+typedef enum {
+ SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */
+ SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */
+ SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */
+ SRPC_REPLY_RCVD = 4, /* incoming reply received */
+ SRPC_REPLY_SENT = 5, /* outgoing reply sent */
+ SRPC_REQUEST_RCVD = 6, /* incoming request received */
+ SRPC_REQUEST_SENT = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event */
+typedef struct {
+ srpc_event_type_t ev_type; /* what's up */
+ lnet_event_kind_t ev_lnet; /* LNet event type */
+ int ev_fired; /* LNet event fired? */
+ int ev_status; /* LNet event status */
+ void *ev_data; /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+ int bk_len; /* len of bulk data */
+ lnet_handle_md_t bk_mdh;
+ int bk_sink; /* sink/source */
+ int bk_niov; /* # iov in bk_iovs */
+ lnet_kiov_t bk_iovs[0];
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+ struct list_head buf_list; /* chain on srpc_service::*_msgq */
+ srpc_msg_t buf_msg;
+ lnet_handle_md_t buf_mdh;
+ lnet_nid_t buf_self;
+ lnet_process_id_t buf_peer;
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *);
+
+typedef struct swi_workitem {
+ struct cfs_wi_sched *swi_sched;
+ cfs_workitem_t swi_workitem;
+ swi_action_t swi_action;
+ int swi_state;
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+ /* chain on srpc_service::*_rpcq */
+ struct list_head srpc_list;
+ struct srpc_service_cd *srpc_scd;
+ swi_workitem_t srpc_wi;
+ srpc_event_t srpc_ev; /* bulk/reply event */
+ lnet_nid_t srpc_self;
+ lnet_process_id_t srpc_peer;
+ srpc_msg_t srpc_replymsg;
+ lnet_handle_md_t srpc_replymdh;
+ srpc_buffer_t *srpc_reqstbuf;
+ srpc_bulk_t *srpc_bulk;
+
+ unsigned int srpc_aborted; /* being given up */
+ int srpc_status;
+ void (*srpc_done)(struct srpc_server_rpc *);
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC */
+typedef struct srpc_client_rpc {
+ struct list_head crpc_list; /* chain on user's lists */
+ spinlock_t crpc_lock; /* serialize */
+ int crpc_service;
+ atomic_t crpc_refcount;
+ int crpc_timeout; /* # seconds to wait for reply */
+ stt_timer_t crpc_timer;
+ swi_workitem_t crpc_wi;
+ lnet_process_id_t crpc_dest;
+
+ void (*crpc_done)(struct srpc_client_rpc *);
+ void (*crpc_fini)(struct srpc_client_rpc *);
+ int crpc_status; /* completion status */
+ void *crpc_priv; /* caller data */
+
+ /* state flags */
+ unsigned int crpc_aborted:1; /* being given up */
+ unsigned int crpc_closed:1; /* completed */
+
+ /* RPC events */
+ srpc_event_t crpc_bulkev; /* bulk event */
+ srpc_event_t crpc_reqstev; /* request event */
+ srpc_event_t crpc_replyev; /* reply event */
+
+ /* bulk, request(reqst), and reply exchanged on wire */
+ srpc_msg_t crpc_reqstmsg;
+ srpc_msg_t crpc_replymsg;
+ lnet_handle_md_t crpc_reqstmdh;
+ lnet_handle_md_t crpc_replymdh;
+ srpc_bulk_t crpc_bulk;
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc) \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])
+
+#define srpc_client_rpc_addref(rpc) \
+do { \
+ CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \
+ (rpc), libcfs_id2str((rpc)->crpc_dest), \
+ atomic_read(&(rpc)->crpc_refcount)); \
+ LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \
+ atomic_inc(&(rpc)->crpc_refcount); \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc) \
+do { \
+ CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \
+ (rpc), libcfs_id2str((rpc)->crpc_dest), \
+ atomic_read(&(rpc)->crpc_refcount)); \
+ LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \
+ if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \
+ srpc_destroy_client_rpc(rpc); \
+} while (0)
+
+#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \
+ (rpc)->crpc_reqstev.ev_fired == 0 || \
+ (rpc)->crpc_replyev.ev_fired == 0)
+
+/* CPU partition data of srpc service */
+struct srpc_service_cd {
+ /** serialize */
+ spinlock_t scd_lock;
+ /** backref to service */
+ struct srpc_service *scd_svc;
+ /** event buffer */
+ srpc_event_t scd_ev;
+ /** free RPC descriptors */
+ struct list_head scd_rpc_free;
+ /** in-flight RPCs */
+ struct list_head scd_rpc_active;
+ /** workitem for posting buffer */
+ swi_workitem_t scd_buf_wi;
+ /** CPT id */
+ int scd_cpt;
+ /** error code for scd_buf_wi */
+ int scd_buf_err;
+ /** timestamp for scd_buf_err */
+ unsigned long scd_buf_err_stamp;
+ /** total # request buffers */
+ int scd_buf_total;
+ /** # posted request buffers */
+ int scd_buf_nposted;
+ /** in progress of buffer posting */
+ int scd_buf_posting;
+ /** allocate more buffers if scd_buf_nposted < scd_buf_low */
+ int scd_buf_low;
+ /** increase/decrease some buffers */
+ int scd_buf_adjust;
+ /** posted message buffers */
+ struct list_head scd_buf_posted;
+ /** blocked for RPC descriptor */
+ struct list_head scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service */
+#define SFW_TEST_WI_MIN 256
+#define SFW_TEST_WI_MAX 2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions */
+#define SFW_TEST_WI_EXTRA 64
+
+/* number of server workitems (mini-thread) for framework service */
+#define SFW_FRWK_WI_MIN 16
+#define SFW_FRWK_WI_MAX 256
+
+typedef struct srpc_service {
+ int sv_id; /* service id */
+ const char *sv_name; /* human readable name */
+ int sv_wi_total; /* total server workitems */
+ int sv_shuttingdown;
+ int sv_ncpts;
+ /* percpt data for srpc_service */
+ struct srpc_service_cd **sv_cpt_data;
+ /* Service callbacks:
+ * - sv_handler: process incoming RPC request
+ * - sv_bulk_ready: notify bulk data
+ */
+ int (*sv_handler) (srpc_server_rpc_t *);
+ int (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+ struct list_head sn_list; /* chain on fw_zombie_sessions */
+ lst_sid_t sn_id; /* unique identifier */
+ unsigned int sn_timeout; /* # seconds' inactivity to expire */
+ int sn_timer_active;
+ unsigned int sn_features;
+ stt_timer_t sn_timer;
+ struct list_head sn_batches; /* list of batches */
+ char sn_name[LST_NAME_SIZE];
+ atomic_t sn_refcount;
+ atomic_t sn_brw_errors;
+ atomic_t sn_ping_errors;
+ cfs_time_t sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \
+ (sid0).ses_stamp == (sid1).ses_stamp)
+
+typedef struct {
+ struct list_head bat_list; /* chain on sn_batches */
+ lst_bid_t bat_id; /* batch id */
+ int bat_error; /* error code of batch */
+ sfw_session_t *bat_session; /* batch's session */
+ atomic_t bat_nactive; /* # of active tests */
+ struct list_head bat_tests; /* test instances */
+} sfw_batch_t;
+
+typedef struct {
+ int (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */
+ void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+ int (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+ lnet_process_id_t dest,
+ srpc_client_rpc_t **rpc); /* prep a tests rpc */
+ void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+ srpc_client_rpc_t *rpc); /* done a test rpc */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+ struct list_head tsi_list; /* chain on batch */
+ int tsi_service; /* test type */
+ sfw_batch_t *tsi_batch; /* batch */
+ sfw_test_client_ops_t *tsi_ops; /* test client operations */
+
+ /* public parameter for all test units */
+ unsigned int tsi_is_client:1; /* is test client */
+ unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */
+ int tsi_concur; /* concurrency */
+ int tsi_loop; /* loop count */
+
+ /* status of test instance */
+ spinlock_t tsi_lock; /* serialize */
+ unsigned int tsi_stopping:1; /* test is stopping */
+ atomic_t tsi_nactive; /* # of active test unit */
+ struct list_head tsi_units; /* test units */
+ struct list_head tsi_free_rpcs; /* free rpcs */
+ struct list_head tsi_active_rpcs; /* active rpcs */
+
+ union {
+ test_ping_req_t ping; /* ping parameter */
+ test_bulk_req_t bulk_v0; /* bulk parameter */
+ test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */
+ } tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at
+ * the end of pages are not used */
+#define SFW_MAX_CONCUR LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
+
+typedef struct sfw_test_unit {
+ struct list_head tsu_list; /* chain on lst_test_instance */
+ lnet_process_id_t tsu_dest; /* id of dest node */
+ int tsu_loop; /* loop count of the test */
+ sfw_test_instance_t *tsu_instance; /* pointer to test instance */
+ void *tsu_private; /* private data */
+ swi_workitem_t tsu_worker; /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+ struct list_head tsc_list; /* chain on fw_tests */
+ srpc_service_t *tsc_srv_service; /* test service */
+ sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+ unsigned features, int nbulkiov, int bulklen,
+ void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+ lnet_process_id_t peer, unsigned features,
+ int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+ int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+ int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+ int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+ return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+ swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem);
+
+ return swi->swi_action(swi);
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+ swi_action_t action, struct cfs_wi_sched *sched)
+{
+ swi->swi_sched = sched;
+ swi->swi_action = action;
+ swi->swi_state = SWI_STATE_NEWBORN;
+ cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+ cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem);
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+ cfs_wi_exit(swi->swi_sched, &swi->swi_workitem);
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+ return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem);
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+ LASSERT (rpc != NULL);
+ LASSERT (!srpc_event_pending(rpc));
+ LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+ if (rpc->crpc_fini == NULL) {
+ LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+ } else {
+ (*rpc->crpc_fini) (rpc);
+ }
+
+ return;
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+ int service, int nbulkiov, int bulklen,
+ void (*rpc_done)(srpc_client_rpc_t *),
+ void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+ LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+ memset(rpc, 0, offsetof(srpc_client_rpc_t,
+ crpc_bulk.bk_iovs[nbulkiov]));
+
+ INIT_LIST_HEAD(&rpc->crpc_list);
+ swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+ lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+ spin_lock_init(&rpc->crpc_lock);
+ atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+
+ rpc->crpc_dest = peer;
+ rpc->crpc_priv = priv;
+ rpc->crpc_service = service;
+ rpc->crpc_bulk.bk_len = bulklen;
+ rpc->crpc_bulk.bk_niov = nbulkiov;
+ rpc->crpc_done = rpc_done;
+ rpc->crpc_fini = rpc_fini;
+ LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+ LNetInvalidateHandle(&rpc->crpc_replymdh);
+ LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+
+ /* no event is expected at this point */
+ rpc->crpc_bulkev.ev_fired =
+ rpc->crpc_reqstev.ev_fired =
+ rpc->crpc_replyev.ev_fired = 1;
+
+ rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC;
+ rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+ rpc->crpc_reqstmsg.msg_type = srpc_service2request(service);
+ return;
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+ switch(state) {
+ default:
+ LBUG();
+ STATE2STR(SWI_STATE_NEWBORN);
+ STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+ STATE2STR(SWI_STATE_REPLY_SENT);
+ STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+ STATE2STR(SWI_STATE_REQUEST_SENT);
+ STATE2STR(SWI_STATE_REPLY_RECEIVED);
+ STATE2STR(SWI_STATE_BULK_STARTED);
+ STATE2STR(SWI_STATE_DONE);
+ }
+#undef STATE2STR
+}
+
+#define UNUSED(x) ( (void)(x) )
+
+
+#define selftest_wait_events() cfs_pause(cfs_time_seconds(1) / 10)
+
+
+#define lst_wait_until(cond, lock, fmt, ...) \
+do { \
+ int __I = 2; \
+ while (!(cond)) { \
+ CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET, \
+ fmt, ## __VA_ARGS__); \
+ spin_unlock(&(lock)); \
+ \
+ selftest_wait_events(); \
+ \
+ spin_lock(&(lock)); \
+ } \
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+ int i = 2;
+
+ LASSERT(sv->sv_shuttingdown);
+
+ while (srpc_finish_service(sv) == 0) {
+ i++;
+ CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+ "Waiting for %s service to shutdown...\n",
+ sv->sv_name);
+ selftest_wait_events();
+ }
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644
index 000000000000..2c078550277b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.c
@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS (1 << 7)
+#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+ (STTIMER_NSLOTS - 1))])
+
+struct st_timer_data {
+ spinlock_t stt_lock;
+ /* start time of the slot processed previously */
+ cfs_time_t stt_prev_slot;
+ struct list_head stt_hash[STTIMER_NSLOTS];
+ int stt_shuttingdown;
+ wait_queue_head_t stt_waitq;
+ int stt_nthreads;
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+ struct list_head *pos;
+
+ spin_lock(&stt_data.stt_lock);
+
+ LASSERT (stt_data.stt_nthreads > 0);
+ LASSERT (!stt_data.stt_shuttingdown);
+ LASSERT (timer->stt_func != NULL);
+ LASSERT (list_empty(&timer->stt_list));
+ LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+ /* a simple insertion sort */
+ list_for_each_prev (pos, STTIMER_SLOT(timer->stt_expires)) {
+ stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list);
+
+ if (cfs_time_aftereq(timer->stt_expires, old->stt_expires))
+ break;
+ }
+ list_add(&timer->stt_list, pos);
+
+ spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+ int ret = 0;
+
+ spin_lock(&stt_data.stt_lock);
+
+ LASSERT (stt_data.stt_nthreads > 0);
+ LASSERT (!stt_data.stt_shuttingdown);
+
+ if (!list_empty(&timer->stt_list)) {
+ ret = 1;
+ list_del_init(&timer->stt_list);
+ }
+
+ spin_unlock(&stt_data.stt_lock);
+ return ret;
+}
+
+/* called with stt_data.stt_lock held */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+ int expired = 0;
+ stt_timer_t *timer;
+
+ while (!list_empty(slot)) {
+ timer = list_entry(slot->next, stt_timer_t, stt_list);
+
+ if (cfs_time_after(timer->stt_expires, now))
+ break;
+
+ list_del_init(&timer->stt_list);
+ spin_unlock(&stt_data.stt_lock);
+
+ expired++;
+ (*timer->stt_func) (timer->stt_data);
+
+ spin_lock(&stt_data.stt_lock);
+ }
+
+ return expired;
+}
+
+int
+stt_check_timers (cfs_time_t *last)
+{
+ int expired = 0;
+ cfs_time_t now;
+ cfs_time_t this_slot;
+
+ now = cfs_time_current_sec();
+ this_slot = now & STTIMER_SLOTTIMEMASK;
+
+ spin_lock(&stt_data.stt_lock);
+
+ while (cfs_time_aftereq(this_slot, *last)) {
+ expired += stt_expire_list(STTIMER_SLOT(this_slot), now);
+ this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME);
+ }
+
+ *last = now & STTIMER_SLOTTIMEMASK;
+ spin_unlock(&stt_data.stt_lock);
+ return expired;
+}
+
+
+int
+stt_timer_main (void *arg)
+{
+ int rc = 0;
+ UNUSED(arg);
+
+ SET_BUT_UNUSED(rc);
+
+ cfs_block_allsigs();
+
+ while (!stt_data.stt_shuttingdown) {
+ stt_check_timers(&stt_data.stt_prev_slot);
+
+ rc = wait_event_timeout(stt_data.stt_waitq,
+ stt_data.stt_shuttingdown,
+ cfs_time_seconds(STTIMER_SLOTTIME));
+ }
+
+ spin_lock(&stt_data.stt_lock);
+ stt_data.stt_nthreads--;
+ spin_unlock(&stt_data.stt_lock);
+ return 0;
+}
+
+int
+stt_start_timer_thread (void)
+{
+ task_t *task;
+
+ LASSERT(!stt_data.stt_shuttingdown);
+
+ task = kthread_run(stt_timer_main, NULL, "st_timer");
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ spin_lock(&stt_data.stt_lock);
+ stt_data.stt_nthreads++;
+ spin_unlock(&stt_data.stt_lock);
+ return 0;
+}
+
+
+int
+stt_startup (void)
+{
+ int rc = 0;
+ int i;
+
+ stt_data.stt_shuttingdown = 0;
+ stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+ spin_lock_init(&stt_data.stt_lock);
+ for (i = 0; i < STTIMER_NSLOTS; i++)
+ INIT_LIST_HEAD(&stt_data.stt_hash[i]);
+
+ stt_data.stt_nthreads = 0;
+ init_waitqueue_head(&stt_data.stt_waitq);
+ rc = stt_start_timer_thread();
+ if (rc != 0)
+ CERROR ("Can't spawn timer thread: %d\n", rc);
+
+ return rc;
+}
+
+void
+stt_shutdown (void)
+{
+ int i;
+
+ spin_lock(&stt_data.stt_lock);
+
+ for (i = 0; i < STTIMER_NSLOTS; i++)
+ LASSERT (list_empty(&stt_data.stt_hash[i]));
+
+ stt_data.stt_shuttingdown = 1;
+
+ wake_up(&stt_data.stt_waitq);
+ lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+ "waiting for %d threads to terminate\n",
+ stt_data.stt_nthreads);
+
+ spin_unlock(&stt_data.stt_lock);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644
index 000000000000..56dbfe5ea1e5
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+ struct list_head stt_list;
+ cfs_time_t stt_expires;
+ void (*stt_func) (void *);
+ void *stt_data;
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */