staging: add Lustre file system client support

Lustre is the most deployed distributed file system in the HPC (High Performance Computing) world. The patch adds its client side support. The code is not very clean and needs to live in drivers/staging for some time for continuing cleanup work. See drivers/staging/lustre/TODO for details. The code is based on Lustre master commit faefbfc04 commit faefbfc0460bc00f2ee4c1c1c86aa1e39b9eea49 Author: Alex Zhuravlev <alexey.zhuravlev@intel.com> Date: Tue Apr 30 23:05:21 2013 +0400 LU-3244 utils: tunefs.lustre should preserve virgin label Plus a few under-review patches on Whamcloud gerrit: 3.8 kernel support: http://review.whamcloud.com/#change,5973 http://review.whamcloud.com/#change,5974 http://review.whamcloud.com/#change,5768 http://review.whamcloud.com/#change,5781 http://review.whamcloud.com/#change,5763 http://review.whamcloud.com/#change,5613 http://review.whamcloud.com/#change,5655 3.9 kernel support: http://review.whamcloud.com/#change,5898 http://review.whamcloud.com/#change,5899 Kconfig/Kbuild: http://review.whamcloud.com/#change,4646 http://review.whamcloud.com/#change,4644 libcfs cleanup: http://review.whamcloud.com/#change,2831 http://review.whamcloud.com/#change,4775 http://review.whamcloud.com/#change,4776 http://review.whamcloud.com/#change,4777 http://review.whamcloud.com/#change,4778 http://review.whamcloud.com/#change,4779 http://review.whamcloud.com/#change,4780 All starting/trailing whitespaces are removed, to match kernel coding style. Also ran scripts/cleanfile on all lustre source files. [maked the Kconfig depend on BROKEN as the recent procfs changes causes this to fail - gregkh] Signed-off-by: Peng Tao <tao.peng@emc.com> Signed-off-by: Andreas Dilger <andreas.dilger@intel.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
author: Peng Tao <bergwolf@gmail.com> 2013-05-02 16:46:55 +0800
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2013-05-14 13:54:50 -0400
commit: d7e09d0397e84eefbabfd9cb353221f3c6448d83 (patch)
tree: caeeaffa2967acfe0988dc65cf04089c0a27c7aa /drivers/staging/lustre/lnet
parent: staging: vt6656: remove unused definitions (diff)
download: linux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.tar.xz
linux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.zip
47 files changed, 39366 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig
new file mode 100644
index 000000000000..00850eeb6a8c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Kconfig
@@ -0,0 +1,40 @@
+config LNET
+	tristate "Lustre networking subsystem"
+	depends on LUSTRE_FS
+
+config LNET_MAX_PAYLOAD
+	int "Lustre lnet max transfer payload (default 2MB)"
+	depends on LUSTRE_FS
+	default "1048576"
+	help
+	  This option defines the maximum size of payload in bytes that lnet
+	  can put into its transport.
+
+	  If unsure, use default.
+
+config LNET_SELFTEST
+	tristate "Lustre networking self testing"
+	depends on LNET
+	help
+	  Choose Y here if you want to do lnet self testing. To compile this
+	  as a module, choose M here: the module will be called lnet_selftest.
+
+	  To compile this as a kernel modules, choose M here and it will be
+	  called lnet_selftest.
+
+	  If unsure, say N.
+
+	  See also http://wiki.lustre.org/
+
+config LNET_XPRT_IB
+	tristate "LNET infiniband support"
+	depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS
+	default LNET && INFINIBAND
+	help
+	  This option allows the LNET users to use infiniband as an
+	  RDMA-enabled transport.
+
+	  To compile this as a kernel module, choose M here and it will be
+	  called ko2iblnd.
+
+	  If unsure, say N.
diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile
new file mode 100644
index 000000000000..374212b1555a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/
diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile
new file mode 100644
index 000000000000..c23e4f67f837
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_LNET) += o2iblnd/  socklnd/
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
new file mode 100644
index 000000000000..71b7d8418357
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o
+ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644
index 000000000000..f4b958bbe5a3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -0,0 +1,3256 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+lnd_t the_o2iblnd = {
+	.lnd_type       = O2IBLND,
+	.lnd_startup    = kiblnd_startup,
+	.lnd_shutdown   = kiblnd_shutdown,
+	.lnd_ctl	= kiblnd_ctl,
+	.lnd_query      = kiblnd_query,
+	.lnd_send       = kiblnd_send,
+	.lnd_recv       = kiblnd_recv,
+};
+
+kib_data_t	      kiblnd_data;
+
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+	char  *c  = ptr;
+	__u32  sum = 0;
+
+	while (nob-- > 0)
+		sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+	/* ensure I don't return 0 (== no checksum) */
+	return (sum == 0) ? 1 : sum;
+}
+
+static char *
+kiblnd_msgtype2str(int type)
+{
+	switch (type) {
+	case IBLND_MSG_CONNREQ:
+		return "CONNREQ";
+
+	case IBLND_MSG_CONNACK:
+		return "CONNACK";
+
+	case IBLND_MSG_NOOP:
+		return "NOOP";
+
+	case IBLND_MSG_IMMEDIATE:
+		return "IMMEDIATE";
+
+	case IBLND_MSG_PUT_REQ:
+		return "PUT_REQ";
+
+	case IBLND_MSG_PUT_NAK:
+		return "PUT_NAK";
+
+	case IBLND_MSG_PUT_ACK:
+		return "PUT_ACK";
+
+	case IBLND_MSG_PUT_DONE:
+		return "PUT_DONE";
+
+	case IBLND_MSG_GET_REQ:
+		return "GET_REQ";
+
+	case IBLND_MSG_GET_DONE:
+		return "GET_DONE";
+
+	default:
+		return "???";
+	}
+}
+
+static int
+kiblnd_msgtype2size(int type)
+{
+	const int hdr_size = offsetof(kib_msg_t, ibm_u);
+
+	switch (type) {
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		return hdr_size + sizeof(kib_connparams_t);
+
+	case IBLND_MSG_NOOP:
+		return hdr_size;
+
+	case IBLND_MSG_IMMEDIATE:
+		return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
+
+	case IBLND_MSG_PUT_REQ:
+		return hdr_size + sizeof(kib_putreq_msg_t);
+
+	case IBLND_MSG_PUT_ACK:
+		return hdr_size + sizeof(kib_putack_msg_t);
+
+	case IBLND_MSG_GET_REQ:
+		return hdr_size + sizeof(kib_get_msg_t);
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		return hdr_size + sizeof(kib_completion_msg_t);
+	default:
+		return -1;
+	}
+}
+
+static int
+kiblnd_unpack_rd(kib_msg_t *msg, int flip)
+{
+	kib_rdma_desc_t   *rd;
+	int		nob;
+	int		n;
+	int		i;
+
+	LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
+		 msg->ibm_type == IBLND_MSG_PUT_ACK);
+
+	rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
+			      &msg->ibm_u.get.ibgm_rd :
+			      &msg->ibm_u.putack.ibpam_rd;
+
+	if (flip) {
+		__swab32s(&rd->rd_key);
+		__swab32s(&rd->rd_nfrags);
+	}
+
+	n = rd->rd_nfrags;
+
+	if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+		CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
+		       n, IBLND_MAX_RDMA_FRAGS);
+		return 1;
+	}
+
+	nob = offsetof (kib_msg_t, ibm_u) +
+	      kiblnd_rd_msg_size(rd, msg->ibm_type, n);
+
+	if (msg->ibm_nob < nob) {
+		CERROR("Short %s: %d(%d)\n",
+		       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
+		return 1;
+	}
+
+	if (!flip)
+		return 0;
+
+	for (i = 0; i < n; i++) {
+		__swab32s(&rd->rd_frags[i].rf_nob);
+		__swab64s(&rd->rd_frags[i].rf_addr);
+	}
+
+	return 0;
+}
+
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		 int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+	kib_net_t *net = ni->ni_data;
+
+	/* CAVEAT EMPTOR! all message fields not set here should have been
+	 * initialised previously. */
+	msg->ibm_magic    = IBLND_MSG_MAGIC;
+	msg->ibm_version  = version;
+	/*   ibm_type */
+	msg->ibm_credits  = credits;
+	/*   ibm_nob */
+	msg->ibm_cksum    = 0;
+	msg->ibm_srcnid   = ni->ni_nid;
+	msg->ibm_srcstamp = net->ibn_incarnation;
+	msg->ibm_dstnid   = dstnid;
+	msg->ibm_dststamp = dststamp;
+
+	if (*kiblnd_tunables.kib_cksum) {
+		/* NB ibm_cksum zero while computing cksum */
+		msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+	}
+}
+
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+	const int hdr_size = offsetof(kib_msg_t, ibm_u);
+	__u32     msg_cksum;
+	__u16     version;
+	int       msg_nob;
+	int       flip;
+
+	/* 6 bytes are enough to have received magic + version */
+	if (nob < 6) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+		flip = 0;
+	} else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+		flip = 1;
+	} else {
+		CERROR("Bad magic: %08x\n", msg->ibm_magic);
+		return -EPROTO;
+	}
+
+	version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+	if (version != IBLND_MSG_VERSION &&
+	    version != IBLND_MSG_VERSION_1) {
+		CERROR("Bad version: %x\n", version);
+		return -EPROTO;
+	}
+
+	if (nob < hdr_size) {
+		CERROR("Short message: %d\n", nob);
+		return -EPROTO;
+	}
+
+	msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+	if (msg_nob > nob) {
+		CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+		return -EPROTO;
+	}
+
+	/* checksum must be computed with ibm_cksum zero and BEFORE anything
+	 * gets flipped */
+	msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+	msg->ibm_cksum = 0;
+	if (msg_cksum != 0 &&
+	    msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+		CERROR("Bad checksum\n");
+		return -EPROTO;
+	}
+
+	msg->ibm_cksum = msg_cksum;
+
+	if (flip) {
+		/* leave magic unflipped as a clue to peer endianness */
+		msg->ibm_version = version;
+		CLASSERT (sizeof(msg->ibm_type) == 1);
+		CLASSERT (sizeof(msg->ibm_credits) == 1);
+		msg->ibm_nob     = msg_nob;
+		__swab64s(&msg->ibm_srcnid);
+		__swab64s(&msg->ibm_srcstamp);
+		__swab64s(&msg->ibm_dstnid);
+		__swab64s(&msg->ibm_dststamp);
+	}
+
+	if (msg->ibm_srcnid == LNET_NID_ANY) {
+		CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+		return -EPROTO;
+	}
+
+	if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
+		CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
+		       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
+		return -EPROTO;
+	}
+
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Unknown message type %x\n", msg->ibm_type);
+		return -EPROTO;
+
+	case IBLND_MSG_NOOP:
+	case IBLND_MSG_IMMEDIATE:
+	case IBLND_MSG_PUT_REQ:
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_GET_REQ:
+		if (kiblnd_unpack_rd(msg, flip))
+			return -EPROTO;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		if (flip)
+			__swab32s(&msg->ibm_u.completion.ibcm_status);
+		break;
+
+	case IBLND_MSG_CONNREQ:
+	case IBLND_MSG_CONNACK:
+		if (flip) {
+			__swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+			__swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+			__swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+		}
+		break;
+	}
+	return 0;
+}
+
+int
+kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_net_t	*net = ni->ni_data;
+	int		cpt = lnet_cpt_of_nid(nid);
+	unsigned long   flags;
+
+	LASSERT(net != NULL);
+	LASSERT(nid != LNET_NID_ANY);
+
+	LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
+	if (peer == NULL) {
+		CERROR("Cannot allocate peer\n");
+		return -ENOMEM;
+	}
+
+	memset(peer, 0, sizeof(*peer));	 /* zero flags etc */
+
+	peer->ibp_ni = ni;
+	peer->ibp_nid = nid;
+	peer->ibp_error = 0;
+	peer->ibp_last_alive = 0;
+	atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */
+
+	INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
+	INIT_LIST_HEAD(&peer->ibp_conns);
+	INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (net->ibn_shutdown == 0);
+
+	/* npeers only grows with the global lock held */
+	atomic_inc(&net->ibn_npeers);
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	*peerp = peer;
+	return 0;
+}
+
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+	kib_net_t *net = peer->ibp_ni->ni_data;
+
+	LASSERT (net != NULL);
+	LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+	LASSERT (!kiblnd_peer_active(peer));
+	LASSERT (peer->ibp_connecting == 0);
+	LASSERT (peer->ibp_accepting == 0);
+	LASSERT (list_empty(&peer->ibp_conns));
+	LASSERT (list_empty(&peer->ibp_tx_queue));
+
+	LIBCFS_FREE(peer, sizeof(*peer));
+
+	/* NB a peer's connections keep a reference on their peer until
+	 * they are destroyed, so we can be assured that _all_ state to do
+	 * with this peer has been cleaned up when its refcount drops to
+	 * zero. */
+	atomic_dec(&net->ibn_npeers);
+}
+
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+	/* the caller is responsible for accounting the additional reference
+	 * that this creates */
+	struct list_head       *peer_list = kiblnd_nid2peerlist(nid);
+	struct list_head       *tmp;
+	kib_peer_t       *peer;
+
+	list_for_each (tmp, peer_list) {
+
+		peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+		LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+			 peer->ibp_accepting > 0 ||
+			 !list_empty(&peer->ibp_conns));  /* active conn */
+
+		if (peer->ibp_nid != nid)
+			continue;
+
+		CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
+		       peer, libcfs_nid2str(nid),
+		       atomic_read(&peer->ibp_refcount),
+		       peer->ibp_version);
+		return peer;
+	}
+	return NULL;
+}
+
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+	LASSERT (list_empty(&peer->ibp_conns));
+
+	LASSERT (kiblnd_peer_active(peer));
+	list_del_init(&peer->ibp_list);
+	/* lose peerlist's ref */
+	kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+		      lnet_nid_t *nidp, int *count)
+{
+	kib_peer_t	    *peer;
+	struct list_head	    *ptmp;
+	int		    i;
+	unsigned long	  flags;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+
+		list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+			peer = list_entry(ptmp, kib_peer_t, ibp_list);
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (index-- > 0)
+				continue;
+
+			*nidp = peer->ibp_nid;
+			*count = atomic_read(&peer->ibp_refcount);
+
+			read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					       flags);
+			return 0;
+		}
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	return -ENOENT;
+}
+
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+	struct list_head	   *ctmp;
+	struct list_head	   *cnxt;
+	kib_conn_t	   *conn;
+
+	if (list_empty(&peer->ibp_conns)) {
+		kiblnd_unlink_peer_locked(peer);
+	} else {
+		list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+			conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+			kiblnd_close_conn_locked(conn, 0);
+		}
+		/* NB closing peer's last conn unlinked it. */
+	}
+	/* NB peer now unlinked; might even be freed if the peer table had the
+	 * last ref on it. */
+}
+
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+	LIST_HEAD	 (zombies);
+	struct list_head	    *ptmp;
+	struct list_head	    *pnxt;
+	kib_peer_t	    *peer;
+	int		    lo;
+	int		    hi;
+	int		    i;
+	unsigned long	  flags;
+	int		    rc = -ENOENT;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (nid != LNET_NID_ANY) {
+		lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+	} else {
+		lo = 0;
+		hi = kiblnd_data.kib_peer_hash_size - 1;
+	}
+
+	for (i = lo; i <= hi; i++) {
+		list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+			peer = list_entry(ptmp, kib_peer_t, ibp_list);
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+				continue;
+
+			if (!list_empty(&peer->ibp_tx_queue)) {
+				LASSERT (list_empty(&peer->ibp_conns));
+
+				list_splice_init(&peer->ibp_tx_queue,
+						     &zombies);
+			}
+
+			kiblnd_del_peer_locked(peer);
+			rc = 0;	 /* matched something */
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_txlist_done(ni, &zombies, -EIO);
+
+	return rc;
+}
+
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+	kib_peer_t	    *peer;
+	struct list_head	    *ptmp;
+	kib_conn_t	    *conn;
+	struct list_head	    *ctmp;
+	int		    i;
+	unsigned long	  flags;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+		list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+
+			peer = list_entry(ptmp, kib_peer_t, ibp_list);
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			list_for_each (ctmp, &peer->ibp_conns) {
+				if (index-- > 0)
+					continue;
+
+				conn = list_entry(ctmp, kib_conn_t,
+						      ibc_list);
+				kiblnd_conn_addref(conn);
+				read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+						       flags);
+				return conn;
+			}
+		}
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+	return NULL;
+}
+
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+	CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+	       rx, rx->rx_status, rx->rx_msg->ibm_type,
+	       rx->rx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+	CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+	       "cookie "LPX64" msg %s%s type %x cred %d\n",
+	       tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+	       tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+	       tx->tx_lntmsg[0] == NULL ? "-" : "!",
+	       tx->tx_lntmsg[1] == NULL ? "-" : "!",
+	       tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+	struct list_head	*tmp;
+	int		i;
+
+	spin_lock(&conn->ibc_lock);
+
+	CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
+	       atomic_read(&conn->ibc_refcount), conn,
+	       conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+	CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
+	       conn->ibc_state, conn->ibc_noops_posted,
+	       conn->ibc_nsends_posted, conn->ibc_credits,
+	       conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+	CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+	CDEBUG(D_CONSOLE, "   early_rxs:\n");
+	list_for_each(tmp, &conn->ibc_early_rxs)
+		kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+	CDEBUG(D_CONSOLE, "   tx_noops:\n");
+	list_for_each(tmp, &conn->ibc_tx_noops)
+		kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+	CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+	list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+		kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+	CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+	list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+		kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+	CDEBUG(D_CONSOLE, "   tx_queue:\n");
+	list_for_each(tmp, &conn->ibc_tx_queue)
+		kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+	CDEBUG(D_CONSOLE, "   active_txs:\n");
+	list_for_each(tmp, &conn->ibc_active_txs)
+		kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+	CDEBUG(D_CONSOLE, "   rxs:\n");
+	for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
+		kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+	spin_unlock(&conn->ibc_lock);
+}
+
+int
+kiblnd_translate_mtu(int value)
+{
+	switch (value) {
+	default:
+		return -1;
+	case 0:
+		return 0;
+	case 256:
+		return IB_MTU_256;
+	case 512:
+		return IB_MTU_512;
+	case 1024:
+		return IB_MTU_1024;
+	case 2048:
+		return IB_MTU_2048;
+	case 4096:
+		return IB_MTU_4096;
+	}
+}
+
+static void
+kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
+{
+	int	   mtu;
+
+	/* XXX There is no path record for iWARP, set by netdev->change_mtu? */
+	if (cmid->route.path_rec == NULL)
+		return;
+
+	mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
+	LASSERT (mtu >= 0);
+	if (mtu != 0)
+		cmid->route.path_rec->mtu = mtu;
+}
+
+static int
+kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
+{
+	cpumask_t	*mask;
+	int		vectors;
+	int		off;
+	int		i;
+
+	vectors = conn->ibc_cmid->device->num_comp_vectors;
+	if (vectors <= 1)
+		return 0;
+
+	mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
+
+	/* hash NID to CPU id in this partition... */
+	off = conn->ibc_peer->ibp_nid % cpus_weight(*mask);
+	for_each_cpu_mask(i, *mask) {
+		if (off-- == 0)
+			return i % vectors;
+	}
+
+	LBUG();
+	return 1;
+}
+
+kib_conn_t *
+kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
+		   int state, int version)
+{
+	/* CAVEAT EMPTOR:
+	 * If the new conn is created successfully it takes over the caller's
+	 * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+	 * is destroyed.  On failure, the caller's ref on 'peer' remains and
+	 * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+	 * to destroy 'cmid' here since I'm called from the CM which still has
+	 * its ref on 'cmid'). */
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_net_t	      *net = peer->ibp_ni->ni_data;
+	kib_dev_t	      *dev;
+	struct ib_qp_init_attr *init_qp_attr;
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	struct ib_cq		*cq;
+	unsigned long		flags;
+	int			cpt;
+	int			rc;
+	int			i;
+
+	LASSERT(net != NULL);
+	LASSERT(!in_interrupt());
+
+	dev = net->ibn_dev;
+
+	cpt = lnet_cpt_of_nid(peer->ibp_nid);
+	sched = kiblnd_data.kib_scheds[cpt];
+
+	LASSERT(sched->ibs_nthreads > 0);
+
+	LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
+			 sizeof(*init_qp_attr));
+	if (init_qp_attr == NULL) {
+		CERROR("Can't allocate qp_attr for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_0;
+	}
+
+	LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
+	if (conn == NULL) {
+		CERROR("Can't allocate connection for %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		goto failed_1;
+	}
+
+	conn->ibc_state = IBLND_CONN_INIT;
+	conn->ibc_version = version;
+	conn->ibc_peer = peer;		  /* I take the caller's ref */
+	cmid->context = conn;		   /* for future CM callbacks */
+	conn->ibc_cmid = cmid;
+
+	INIT_LIST_HEAD(&conn->ibc_early_rxs);
+	INIT_LIST_HEAD(&conn->ibc_tx_noops);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+	INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+	INIT_LIST_HEAD(&conn->ibc_active_txs);
+	spin_lock_init(&conn->ibc_lock);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
+			 sizeof(*conn->ibc_connvars));
+	if (conn->ibc_connvars == NULL) {
+		CERROR("Can't allocate in-progress connection state\n");
+		goto failed_2;
+	}
+
+	write_lock_irqsave(glock, flags);
+	if (dev->ibd_failover) {
+		write_unlock_irqrestore(glock, flags);
+		CERROR("%s: failover in progress\n", dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
+		/* wakeup failover thread and teardown connection */
+		if (kiblnd_dev_can_failover(dev)) {
+			list_add_tail(&dev->ibd_fail_list,
+				      &kiblnd_data.kib_failed_devs);
+			wake_up(&kiblnd_data.kib_failover_waitq);
+		}
+
+		write_unlock_irqrestore(glock, flags);
+		CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
+		       cmid->device->name, dev->ibd_ifname);
+		goto failed_2;
+	}
+
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	conn->ibc_hdev = dev->ibd_hdev;
+
+	kiblnd_setup_mtu_locked(cmid);
+
+	write_unlock_irqrestore(glock, flags);
+
+	LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
+			 IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
+	if (conn->ibc_rxs == NULL) {
+		CERROR("Cannot allocate RX buffers\n");
+		goto failed_2;
+	}
+
+	rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
+				IBLND_RX_MSG_PAGES(version));
+	if (rc != 0)
+		goto failed_2;
+
+	kiblnd_map_rx_descs(conn);
+
+	cq = ib_create_cq(cmid->device,
+			  kiblnd_cq_completion, kiblnd_cq_event, conn,
+			  IBLND_CQ_ENTRIES(version),
+			  kiblnd_get_completion_vector(conn, cpt));
+	if (IS_ERR(cq)) {
+		CERROR("Can't create CQ: %ld, cqe: %d\n",
+		       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
+		goto failed_2;
+	}
+
+	conn->ibc_cq = cq;
+
+	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (rc != 0) {
+		CERROR("Can't request completion notificiation: %d\n", rc);
+		goto failed_2;
+	}
+
+	init_qp_attr->event_handler = kiblnd_qp_event;
+	init_qp_attr->qp_context = conn;
+	init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
+	init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
+	init_qp_attr->cap.max_send_sge = 1;
+	init_qp_attr->cap.max_recv_sge = 1;
+	init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_qp_attr->qp_type = IB_QPT_RC;
+	init_qp_attr->send_cq = cq;
+	init_qp_attr->recv_cq = cq;
+
+	conn->ibc_sched = sched;
+
+	rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
+	if (rc != 0) {
+		CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
+		       rc, init_qp_attr->cap.max_send_wr,
+		       init_qp_attr->cap.max_recv_wr);
+		goto failed_2;
+	}
+
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+	/* 1 ref for caller and each rxmsg */
+	atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
+	conn->ibc_nrx = IBLND_RX_MSGS(version);
+
+	/* post receives */
+	for (i = 0; i < IBLND_RX_MSGS(version); i++) {
+		rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+				    IBLND_POSTRX_NO_CREDIT);
+		if (rc != 0) {
+			CERROR("Can't post rxmsg: %d\n", rc);
+
+			/* Make posted receives complete */
+			kiblnd_abort_receives(conn);
+
+			/* correct # of posted buffers
+			 * NB locking needed now I'm racing with completion */
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+			conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			/* cmid will be destroyed by CM(ofed) after cm_callback
+			 * returned, so we can't refer it anymore
+			 * (by kiblnd_connd()->kiblnd_destroy_conn) */
+			rdma_destroy_qp(conn->ibc_cmid);
+			conn->ibc_cmid = NULL;
+
+			/* Drop my own and unused rxbuffer refcounts */
+			while (i++ <= IBLND_RX_MSGS(version))
+				kiblnd_conn_decref(conn);
+
+			return NULL;
+		}
+	}
+
+	/* Init successful! */
+	LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+		 state == IBLND_CONN_PASSIVE_WAIT);
+	conn->ibc_state = state;
+
+	/* 1 more conn */
+	atomic_inc(&net->ibn_nconns);
+	return conn;
+
+ failed_2:
+	kiblnd_destroy_conn(conn);
+ failed_1:
+	LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+	return NULL;
+}
+
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+	struct rdma_cm_id *cmid = conn->ibc_cmid;
+	kib_peer_t	*peer = conn->ibc_peer;
+	int		rc;
+
+	LASSERT (!in_interrupt());
+	LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+	LASSERT (list_empty(&conn->ibc_early_rxs));
+	LASSERT (list_empty(&conn->ibc_tx_noops));
+	LASSERT (list_empty(&conn->ibc_tx_queue));
+	LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+	LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+	LASSERT (list_empty(&conn->ibc_active_txs));
+	LASSERT (conn->ibc_noops_posted == 0);
+	LASSERT (conn->ibc_nsends_posted == 0);
+
+	switch (conn->ibc_state) {
+	default:
+		/* conn must be completely disengaged from the network */
+		LBUG();
+
+	case IBLND_CONN_DISCONNECTED:
+		/* connvars should have been freed already */
+		LASSERT (conn->ibc_connvars == NULL);
+		break;
+
+	case IBLND_CONN_INIT:
+		break;
+	}
+
+	/* conn->ibc_cmid might be destroyed by CM already */
+	if (cmid != NULL && cmid->qp != NULL)
+		rdma_destroy_qp(cmid);
+
+	if (conn->ibc_cq != NULL) {
+		rc = ib_destroy_cq(conn->ibc_cq);
+		if (rc != 0)
+			CWARN("Error destroying CQ: %d\n", rc);
+	}
+
+	if (conn->ibc_rx_pages != NULL)
+		kiblnd_unmap_rx_descs(conn);
+
+	if (conn->ibc_rxs != NULL) {
+		LIBCFS_FREE(conn->ibc_rxs,
+			    IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
+	}
+
+	if (conn->ibc_connvars != NULL)
+		LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+	if (conn->ibc_hdev != NULL)
+		kiblnd_hdev_decref(conn->ibc_hdev);
+
+	/* See CAVEAT EMPTOR above in kiblnd_create_conn */
+	if (conn->ibc_state != IBLND_CONN_INIT) {
+		kib_net_t *net = peer->ibp_ni->ni_data;
+
+		kiblnd_peer_decref(peer);
+		rdma_destroy_id(cmid);
+		atomic_dec(&net->ibn_nconns);
+	}
+
+	LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+	kib_conn_t	     *conn;
+	struct list_head	     *ctmp;
+	struct list_head	     *cnxt;
+	int		     count = 0;
+
+	list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+		conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+		CDEBUG(D_NET, "Closing conn -> %s, "
+			      "version: %x, reason: %d\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, why);
+
+		kiblnd_close_conn_locked(conn, why);
+		count++;
+	}
+
+	return count;
+}
+
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+				 int version, __u64 incarnation)
+{
+	kib_conn_t	     *conn;
+	struct list_head	     *ctmp;
+	struct list_head	     *cnxt;
+	int		     count = 0;
+
+	list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+		conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+		if (conn->ibc_version     == version &&
+		    conn->ibc_incarnation == incarnation)
+			continue;
+
+		CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
+			      "incarnation:"LPX64"(%x, "LPX64")\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_version, conn->ibc_incarnation,
+		       version, incarnation);
+
+		kiblnd_close_conn_locked(conn, -ESTALE);
+		count++;
+	}
+
+	return count;
+}
+
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+	kib_peer_t	     *peer;
+	struct list_head	     *ptmp;
+	struct list_head	     *pnxt;
+	int		     lo;
+	int		     hi;
+	int		     i;
+	unsigned long	   flags;
+	int		     count = 0;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (nid != LNET_NID_ANY)
+		lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+	else {
+		lo = 0;
+		hi = kiblnd_data.kib_peer_hash_size - 1;
+	}
+
+	for (i = lo; i <= hi; i++) {
+		list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+
+			peer = list_entry(ptmp, kib_peer_t, ibp_list);
+			LASSERT (peer->ibp_connecting > 0 ||
+				 peer->ibp_accepting > 0 ||
+				 !list_empty(&peer->ibp_conns));
+
+			if (peer->ibp_ni != ni)
+				continue;
+
+			if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
+				continue;
+
+			count += kiblnd_close_peer_conns_locked(peer, 0);
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* wildcards always succeed */
+	if (nid == LNET_NID_ANY)
+		return 0;
+
+	return (count == 0) ? -ENOENT : 0;
+}
+
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+	int		       rc = -EINVAL;
+
+	switch(cmd) {
+	case IOC_LIBCFS_GET_PEER: {
+		lnet_nid_t   nid = 0;
+		int	  count = 0;
+
+		rc = kiblnd_get_peer_info(ni, data->ioc_count,
+					  &nid, &count);
+		data->ioc_nid    = nid;
+		data->ioc_count  = count;
+		break;
+	}
+
+	case IOC_LIBCFS_DEL_PEER: {
+		rc = kiblnd_del_peer(ni, data->ioc_nid);
+		break;
+	}
+	case IOC_LIBCFS_GET_CONN: {
+		kib_conn_t *conn;
+
+		rc = 0;
+		conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+		if (conn == NULL) {
+			rc = -ENOENT;
+			break;
+		}
+
+		LASSERT (conn->ibc_cmid != NULL);
+		data->ioc_nid = conn->ibc_peer->ibp_nid;
+		if (conn->ibc_cmid->route.path_rec == NULL)
+			data->ioc_u32[0] = 0; /* iWarp has no path MTU */
+		else
+			data->ioc_u32[0] =
+			ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
+		kiblnd_conn_decref(conn);
+		break;
+	}
+	case IOC_LIBCFS_CLOSE_CONNECTION: {
+		rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	return rc;
+}
+
+void
+kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	cfs_time_t	last_alive = 0;
+	cfs_time_t	now = cfs_time_current();
+	rwlock_t	*glock = &kiblnd_data.kib_global_lock;
+	kib_peer_t	*peer;
+	unsigned long	flags;
+
+	read_lock_irqsave(glock, flags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		LASSERT (peer->ibp_connecting > 0 || /* creating conns */
+			 peer->ibp_accepting > 0 ||
+			 !list_empty(&peer->ibp_conns));  /* active conn */
+		last_alive = peer->ibp_last_alive;
+	}
+
+	read_unlock_irqrestore(glock, flags);
+
+	if (last_alive != 0)
+		*when = last_alive;
+
+	/* peer is not persistent in hash, trigger peer creation
+	 * and connection establishment with a NULL tx */
+	if (peer == NULL)
+		kiblnd_launch_tx(ni, NULL, nid);
+
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
+	       libcfs_nid2str(nid), peer,
+	       last_alive ? cfs_duration_sec(now - last_alive) : -1);
+	return;
+}
+
+void
+kiblnd_free_pages(kib_pages_t *p)
+{
+	int	npages = p->ibp_npages;
+	int	i;
+
+	for (i = 0; i < npages; i++) {
+		if (p->ibp_pages[i] != NULL)
+			__free_page(p->ibp_pages[i]);
+	}
+
+	LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
+{
+	kib_pages_t	*p;
+	int		i;
+
+	LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
+			 offsetof(kib_pages_t, ibp_pages[npages]));
+	if (p == NULL) {
+		CERROR("Can't allocate descriptor for %d pages\n", npages);
+		return -ENOMEM;
+	}
+
+	memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+	p->ibp_npages = npages;
+
+	for (i = 0; i < npages; i++) {
+		p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+						     __GFP_IO);
+		if (p->ibp_pages[i] == NULL) {
+			CERROR("Can't allocate page %d of %d\n", i, npages);
+			kiblnd_free_pages(p);
+			return -ENOMEM;
+		}
+	}
+
+	*pp = p;
+	return 0;
+}
+
+void
+kiblnd_unmap_rx_descs(kib_conn_t *conn)
+{
+	kib_rx_t *rx;
+	int       i;
+
+	LASSERT (conn->ibc_rxs != NULL);
+	LASSERT (conn->ibc_hdev != NULL);
+
+	for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+		rx = &conn->ibc_rxs[i];
+
+		LASSERT (rx->rx_nob >= 0); /* not posted */
+
+		kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
+					KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
+							  rx->rx_msgaddr),
+					IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+	}
+
+	kiblnd_free_pages(conn->ibc_rx_pages);
+
+	conn->ibc_rx_pages = NULL;
+}
+
+void
+kiblnd_map_rx_descs(kib_conn_t *conn)
+{
+	kib_rx_t       *rx;
+	struct page    *pg;
+	int	     pg_off;
+	int	     ipg;
+	int	     i;
+
+	for (pg_off = ipg = i = 0;
+	     i < IBLND_RX_MSGS(conn->ibc_version); i++) {
+		pg = conn->ibc_rx_pages->ibp_pages[ipg];
+		rx = &conn->ibc_rxs[i];
+
+		rx->rx_conn = conn;
+		rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
+
+		rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
+						       rx->rx_msg, IBLND_MSG_SIZE,
+						       DMA_FROM_DEVICE);
+		LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
+						   rx->rx_msgaddr));
+		KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
+
+		CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
+		       i, rx->rx_msg, rx->rx_msgaddr,
+		       lnet_page2phys(pg) + pg_off);
+
+		pg_off += IBLND_MSG_SIZE;
+		LASSERT (pg_off <= PAGE_SIZE);
+
+		if (pg_off == PAGE_SIZE) {
+			pg_off = 0;
+			ipg++;
+			LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
+		}
+	}
+}
+
+static void
+kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_hca_dev_t  *hdev = tpo->tpo_hdev;
+	kib_tx_t       *tx;
+	int	     i;
+
+	LASSERT (tpo->tpo_pool.po_allocated == 0);
+
+	if (hdev == NULL)
+		return;
+
+	for (i = 0; i < tpo->tpo_pool.po_size; i++) {
+		tx = &tpo->tpo_tx_descs[i];
+		kiblnd_dma_unmap_single(hdev->ibh_ibdev,
+					KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
+							  tx->tx_msgaddr),
+					IBLND_MSG_SIZE, DMA_TO_DEVICE);
+	}
+
+	kiblnd_hdev_decref(hdev);
+	tpo->tpo_hdev = NULL;
+}
+
+static kib_hca_dev_t *
+kiblnd_current_hdev(kib_dev_t *dev)
+{
+	kib_hca_dev_t *hdev;
+	unsigned long  flags;
+	int	    i = 0;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	while (dev->ibd_failover) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+		if (i++ % 50 == 0)
+			CDEBUG(D_NET, "%s: Wait for failover\n",
+			       dev->ibd_ifname);
+		schedule_timeout(cfs_time_seconds(1) / 100);
+
+		read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	}
+
+	kiblnd_hdev_addref_locked(dev->ibd_hdev);
+	hdev = dev->ibd_hdev;
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	return hdev;
+}
+
+static void
+kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
+{
+	kib_pages_t    *txpgs = tpo->tpo_tx_pages;
+	kib_pool_t     *pool  = &tpo->tpo_pool;
+	kib_net_t      *net   = pool->po_owner->ps_net;
+	kib_dev_t      *dev;
+	struct page    *page;
+	kib_tx_t       *tx;
+	int	     page_offset;
+	int	     ipage;
+	int	     i;
+
+	LASSERT (net != NULL);
+
+	dev = net->ibn_dev;
+
+	/* pre-mapped messages are not bigger than 1 page */
+	CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+	/* No fancy arithmetic when we do the buffer calculations */
+	CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+	tpo->tpo_hdev = kiblnd_current_hdev(dev);
+
+	for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
+		page = txpgs->ibp_pages[ipage];
+		tx = &tpo->tpo_tx_descs[i];
+
+		tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+					   page_offset);
+
+		tx->tx_msgaddr = kiblnd_dma_map_single(
+			tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
+			IBLND_MSG_SIZE, DMA_TO_DEVICE);
+		LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
+						   tx->tx_msgaddr));
+		KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
+
+		list_add(&tx->tx_list, &pool->po_free_list);
+
+		page_offset += IBLND_MSG_SIZE;
+		LASSERT (page_offset <= PAGE_SIZE);
+
+		if (page_offset == PAGE_SIZE) {
+			page_offset = 0;
+			ipage++;
+			LASSERT (ipage <= txpgs->ibp_npages);
+		}
+	}
+}
+
+struct ib_mr *
+kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
+{
+	__u64   index;
+
+	LASSERT (hdev->ibh_mrs[0] != NULL);
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	index = addr >> hdev->ibh_mr_shift;
+
+	if (index <  hdev->ibh_nmrs &&
+	    index == ((addr + size - 1) >> hdev->ibh_mr_shift))
+		return hdev->ibh_mrs[index];
+
+	return NULL;
+}
+
+struct ib_mr *
+kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
+{
+	struct ib_mr *prev_mr;
+	struct ib_mr *mr;
+	int	   i;
+
+	LASSERT (hdev->ibh_mrs[0] != NULL);
+
+	if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+	    *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
+		return NULL;
+
+	if (hdev->ibh_nmrs == 1)
+		return hdev->ibh_mrs[0];
+
+	for (i = 0, mr = prev_mr = NULL;
+	     i < rd->rd_nfrags; i++) {
+		mr = kiblnd_find_dma_mr(hdev,
+					rd->rd_frags[i].rf_addr,
+					rd->rd_frags[i].rf_nob);
+		if (prev_mr == NULL)
+			prev_mr = mr;
+
+		if (mr == NULL || prev_mr != mr) {
+			/* Can't covered by one single MR */
+			mr = NULL;
+			break;
+		}
+	}
+
+	return mr;
+}
+
+void
+kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+{
+	LASSERT (pool->fpo_map_count == 0);
+
+	if (pool->fpo_fmr_pool != NULL)
+		ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+
+	if (pool->fpo_hdev != NULL)
+		kiblnd_hdev_decref(pool->fpo_hdev);
+
+	LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
+}
+
+void
+kiblnd_destroy_fmr_pool_list(struct list_head *head)
+{
+	kib_fmr_pool_t *pool;
+
+	while (!list_empty(head)) {
+		pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
+		list_del(&pool->fpo_list);
+		kiblnd_destroy_fmr_pool(pool);
+	}
+}
+
+static int kiblnd_fmr_pool_size(int ncpts)
+{
+	int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+
+	return max(IBLND_FMR_POOL, size);
+}
+
+static int kiblnd_fmr_flush_trigger(int ncpts)
+{
+	int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+
+	return max(IBLND_FMR_POOL_FLUSH, size);
+}
+
+int
+kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
+{
+	/* FMR pool for RDMA */
+	kib_dev_t	       *dev = fps->fps_net->ibn_dev;
+	kib_fmr_pool_t	  *fpo;
+	struct ib_fmr_pool_param param = {
+		.max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+		.page_shift	= PAGE_SHIFT,
+		.access	    = (IB_ACCESS_LOCAL_WRITE |
+				      IB_ACCESS_REMOTE_WRITE),
+		.pool_size	   = fps->fps_pool_size,
+		.dirty_watermark   = fps->fps_flush_trigger,
+		.flush_function    = NULL,
+		.flush_arg	 = NULL,
+		.cache	     = !!*kiblnd_tunables.kib_fmr_cache};
+	int rc;
+
+	LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
+	if (fpo == NULL)
+		return -ENOMEM;
+
+	fpo->fpo_hdev = kiblnd_current_hdev(dev);
+
+	fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
+	if (IS_ERR(fpo->fpo_fmr_pool)) {
+		rc = PTR_ERR(fpo->fpo_fmr_pool);
+		CERROR("Failed to create FMR pool: %d\n", rc);
+
+		kiblnd_hdev_decref(fpo->fpo_hdev);
+		LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
+		return rc;
+	}
+
+	fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+	fpo->fpo_owner    = fps;
+	*pp_fpo = fpo;
+
+	return 0;
+}
+
+static void
+kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
+{
+	if (fps->fps_net == NULL) /* intialized? */
+		return;
+
+	spin_lock(&fps->fps_lock);
+
+	while (!list_empty(&fps->fps_pool_list)) {
+		kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
+						 kib_fmr_pool_t, fpo_list);
+		fpo->fpo_failed = 1;
+		list_del(&fpo->fpo_list);
+		if (fpo->fpo_map_count == 0)
+			list_add(&fpo->fpo_list, zombies);
+		else
+			list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
+	}
+
+	spin_unlock(&fps->fps_lock);
+}
+
+static void
+kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
+{
+	if (fps->fps_net != NULL) { /* initialized? */
+		kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
+		kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
+	}
+}
+
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
+			int pool_size, int flush_trigger)
+{
+	kib_fmr_pool_t *fpo;
+	int	     rc;
+
+	memset(fps, 0, sizeof(kib_fmr_poolset_t));
+
+	fps->fps_net = net;
+	fps->fps_cpt = cpt;
+	fps->fps_pool_size = pool_size;
+	fps->fps_flush_trigger = flush_trigger;
+	spin_lock_init(&fps->fps_lock);
+	INIT_LIST_HEAD(&fps->fps_pool_list);
+	INIT_LIST_HEAD(&fps->fps_failed_pool_list);
+
+	rc = kiblnd_create_fmr_pool(fps, &fpo);
+	if (rc == 0)
+		list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+
+	return rc;
+}
+
+static int
+kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
+{
+	if (fpo->fpo_map_count != 0) /* still in use */
+		return 0;
+	if (fpo->fpo_failed)
+		return 1;
+	return cfs_time_aftereq(now, fpo->fpo_deadline);
+}
+
+void
+kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
+{
+	LIST_HEAD     (zombies);
+	kib_fmr_pool_t    *fpo = fmr->fmr_pool;
+	kib_fmr_poolset_t *fps = fpo->fpo_owner;
+	cfs_time_t	 now = cfs_time_current();
+	kib_fmr_pool_t    *tmp;
+	int		rc;
+
+	rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+	LASSERT (rc == 0);
+
+	if (status != 0) {
+		rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
+		LASSERT (rc == 0);
+	}
+
+	fmr->fmr_pool = NULL;
+	fmr->fmr_pfmr = NULL;
+
+	spin_lock(&fps->fps_lock);
+	fpo->fpo_map_count --;  /* decref the pool */
+
+	list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
+		/* the first pool is persistent */
+		if (fps->fps_pool_list.next == &fpo->fpo_list)
+			continue;
+
+		if (kiblnd_fmr_pool_is_idle(fpo, now)) {
+			list_move(&fpo->fpo_list, &zombies);
+			fps->fps_version ++;
+		}
+	}
+	spin_unlock(&fps->fps_lock);
+
+	if (!list_empty(&zombies))
+		kiblnd_destroy_fmr_pool_list(&zombies);
+}
+
+int
+kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
+		    __u64 iov, kib_fmr_t *fmr)
+{
+	struct ib_pool_fmr *pfmr;
+	kib_fmr_pool_t     *fpo;
+	__u64	       version;
+	int		 rc;
+
+ again:
+	spin_lock(&fps->fps_lock);
+	version = fps->fps_version;
+	list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
+		fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		fpo->fpo_map_count++;
+		spin_unlock(&fps->fps_lock);
+
+		pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
+					    pages, npages, iov);
+		if (likely(!IS_ERR(pfmr))) {
+			fmr->fmr_pool = fpo;
+			fmr->fmr_pfmr = pfmr;
+			return 0;
+		}
+
+		spin_lock(&fps->fps_lock);
+		fpo->fpo_map_count--;
+		if (PTR_ERR(pfmr) != -EAGAIN) {
+			spin_unlock(&fps->fps_lock);
+			return PTR_ERR(pfmr);
+		}
+
+		/* EAGAIN and ... */
+		if (version != fps->fps_version) {
+			spin_unlock(&fps->fps_lock);
+			goto again;
+		}
+	}
+
+	if (fps->fps_increasing) {
+		spin_unlock(&fps->fps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "FMR pool, waiting for her to complete\n");
+		schedule();
+		goto again;
+
+	}
+
+	if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&fps->fps_lock);
+		return -EAGAIN;
+	}
+
+	fps->fps_increasing = 1;
+	spin_unlock(&fps->fps_lock);
+
+	CDEBUG(D_NET, "Allocate new FMR pool\n");
+	rc = kiblnd_create_fmr_pool(fps, &fpo);
+	spin_lock(&fps->fps_lock);
+	fps->fps_increasing = 0;
+	if (rc == 0) {
+		fps->fps_version++;
+		list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
+	} else {
+		fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+	}
+	spin_unlock(&fps->fps_lock);
+
+	goto again;
+}
+
+static void
+kiblnd_fini_pool(kib_pool_t *pool)
+{
+	LASSERT (list_empty(&pool->po_free_list));
+	LASSERT (pool->po_allocated == 0);
+
+	CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
+}
+
+static void
+kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
+{
+	CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
+
+	memset(pool, 0, sizeof(kib_pool_t));
+	INIT_LIST_HEAD(&pool->po_free_list);
+	pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+	pool->po_owner    = ps;
+	pool->po_size     = size;
+}
+
+void
+kiblnd_destroy_pool_list(struct list_head *head)
+{
+	kib_pool_t *pool;
+
+	while (!list_empty(head)) {
+		pool = list_entry(head->next, kib_pool_t, po_list);
+		list_del(&pool->po_list);
+
+		LASSERT (pool->po_owner != NULL);
+		pool->po_owner->ps_pool_destroy(pool);
+	}
+}
+
+static void
+kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
+{
+	if (ps->ps_net == NULL) /* intialized? */
+		return;
+
+	spin_lock(&ps->ps_lock);
+	while (!list_empty(&ps->ps_pool_list)) {
+		kib_pool_t *po = list_entry(ps->ps_pool_list.next,
+					    kib_pool_t, po_list);
+		po->po_failed = 1;
+		list_del(&po->po_list);
+		if (po->po_allocated == 0)
+			list_add(&po->po_list, zombies);
+		else
+			list_add(&po->po_list, &ps->ps_failed_pool_list);
+	}
+	spin_unlock(&ps->ps_lock);
+}
+
+static void
+kiblnd_fini_poolset(kib_poolset_t *ps)
+{
+	if (ps->ps_net != NULL) { /* initialized? */
+		kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
+		kiblnd_destroy_pool_list(&ps->ps_pool_list);
+	}
+}
+
+static int
+kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
+		    kib_net_t *net, char *name, int size,
+		    kib_ps_pool_create_t po_create,
+		    kib_ps_pool_destroy_t po_destroy,
+		    kib_ps_node_init_t nd_init,
+		    kib_ps_node_fini_t nd_fini)
+{
+	kib_pool_t	*pool;
+	int		rc;
+
+	memset(ps, 0, sizeof(kib_poolset_t));
+
+	ps->ps_cpt	    = cpt;
+	ps->ps_net	  = net;
+	ps->ps_pool_create  = po_create;
+	ps->ps_pool_destroy = po_destroy;
+	ps->ps_node_init    = nd_init;
+	ps->ps_node_fini    = nd_fini;
+	ps->ps_pool_size    = size;
+	if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
+	    >= sizeof(ps->ps_name))
+		return -E2BIG;
+	spin_lock_init(&ps->ps_lock);
+	INIT_LIST_HEAD(&ps->ps_pool_list);
+	INIT_LIST_HEAD(&ps->ps_failed_pool_list);
+
+	rc = ps->ps_pool_create(ps, size, &pool);
+	if (rc == 0)
+		list_add(&pool->po_list, &ps->ps_pool_list);
+	else
+		CERROR("Failed to create the first pool for %s\n", ps->ps_name);
+
+	return rc;
+}
+
+static int
+kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
+{
+	if (pool->po_allocated != 0) /* still in use */
+		return 0;
+	if (pool->po_failed)
+		return 1;
+	return cfs_time_aftereq(now, pool->po_deadline);
+}
+
+void
+kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
+{
+	LIST_HEAD  (zombies);
+	kib_poolset_t  *ps = pool->po_owner;
+	kib_pool_t     *tmp;
+	cfs_time_t      now = cfs_time_current();
+
+	spin_lock(&ps->ps_lock);
+
+	if (ps->ps_node_fini != NULL)
+		ps->ps_node_fini(pool, node);
+
+	LASSERT (pool->po_allocated > 0);
+	list_add(node, &pool->po_free_list);
+	pool->po_allocated --;
+
+	list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
+		/* the first pool is persistent */
+		if (ps->ps_pool_list.next == &pool->po_list)
+			continue;
+
+		if (kiblnd_pool_is_idle(pool, now))
+			list_move(&pool->po_list, &zombies);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	if (!list_empty(&zombies))
+		kiblnd_destroy_pool_list(&zombies);
+}
+
+struct list_head *
+kiblnd_pool_alloc_node(kib_poolset_t *ps)
+{
+	struct list_head	    *node;
+	kib_pool_t	    *pool;
+	int		    rc;
+
+ again:
+	spin_lock(&ps->ps_lock);
+	list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
+		if (list_empty(&pool->po_free_list))
+			continue;
+
+		pool->po_allocated ++;
+		pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
+		node = pool->po_free_list.next;
+		list_del(node);
+
+		if (ps->ps_node_init != NULL) {
+			/* still hold the lock */
+			ps->ps_node_init(pool, node);
+		}
+		spin_unlock(&ps->ps_lock);
+		return node;
+	}
+
+	/* no available tx pool and ... */
+	if (ps->ps_increasing) {
+		/* another thread is allocating a new pool */
+		spin_unlock(&ps->ps_lock);
+		CDEBUG(D_NET, "Another thread is allocating new "
+		       "%s pool, waiting for her to complete\n",
+		       ps->ps_name);
+		schedule();
+		goto again;
+	}
+
+	if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
+		/* someone failed recently */
+		spin_unlock(&ps->ps_lock);
+		return NULL;
+	}
+
+	ps->ps_increasing = 1;
+	spin_unlock(&ps->ps_lock);
+
+	CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
+
+	rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
+
+	spin_lock(&ps->ps_lock);
+	ps->ps_increasing = 0;
+	if (rc == 0) {
+		list_add_tail(&pool->po_list, &ps->ps_pool_list);
+	} else {
+		ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
+		CERROR("Can't allocate new %s pool because out of memory\n",
+		       ps->ps_name);
+	}
+	spin_unlock(&ps->ps_lock);
+
+	goto again;
+}
+
+void
+kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
+{
+	kib_pmr_pool_t      *ppo = pmr->pmr_pool;
+	struct ib_mr	*mr  = pmr->pmr_mr;
+
+	pmr->pmr_mr = NULL;
+	kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
+	if (mr != NULL)
+		ib_dereg_mr(mr);
+}
+
+int
+kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+		    kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
+{
+	kib_phys_mr_t *pmr;
+	struct list_head    *node;
+	int	    rc;
+	int	    i;
+
+	node = kiblnd_pool_alloc_node(&pps->pps_poolset);
+	if (node == NULL) {
+		CERROR("Failed to allocate PMR descriptor\n");
+		return -ENOMEM;
+	}
+
+	pmr = container_of(node, kib_phys_mr_t, pmr_list);
+	if (pmr->pmr_pool->ppo_hdev != hdev) {
+		kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+		return -EAGAIN;
+	}
+
+	for (i = 0; i < rd->rd_nfrags; i ++) {
+		pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
+		pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
+	}
+
+	pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
+				     pmr->pmr_ipb, rd->rd_nfrags,
+				     IB_ACCESS_LOCAL_WRITE |
+				     IB_ACCESS_REMOTE_WRITE,
+				     iova);
+	if (!IS_ERR(pmr->pmr_mr)) {
+		pmr->pmr_iova = *iova;
+		*pp_pmr = pmr;
+		return 0;
+	}
+
+	rc = PTR_ERR(pmr->pmr_mr);
+	CERROR("Failed ib_reg_phys_mr: %d\n", rc);
+
+	pmr->pmr_mr = NULL;
+	kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
+
+	return rc;
+}
+
+static void
+kiblnd_destroy_pmr_pool(kib_pool_t *pool)
+{
+	kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
+	kib_phys_mr_t  *pmr;
+
+	LASSERT (pool->po_allocated == 0);
+
+	while (!list_empty(&pool->po_free_list)) {
+		pmr = list_entry(pool->po_free_list.next,
+				     kib_phys_mr_t, pmr_list);
+
+		LASSERT (pmr->pmr_mr == NULL);
+		list_del(&pmr->pmr_list);
+
+		if (pmr->pmr_ipb != NULL) {
+			LIBCFS_FREE(pmr->pmr_ipb,
+				    IBLND_MAX_RDMA_FRAGS *
+				    sizeof(struct ib_phys_buf));
+		}
+
+		LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
+	}
+
+	kiblnd_fini_pool(pool);
+	if (ppo->ppo_hdev != NULL)
+		kiblnd_hdev_decref(ppo->ppo_hdev);
+
+	LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
+}
+
+static inline int kiblnd_pmr_pool_size(int ncpts)
+{
+	int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
+
+	return max(IBLND_PMR_POOL, size);
+}
+
+static int
+kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	struct kib_pmr_pool	*ppo;
+	struct kib_pool		*pool;
+	kib_phys_mr_t		*pmr;
+	int			i;
+
+	LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
+			 ps->ps_cpt, sizeof(kib_pmr_pool_t));
+	if (ppo == NULL) {
+		CERROR("Failed to allocate PMR pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &ppo->ppo_pool;
+	kiblnd_init_pool(ps, pool, size);
+
+	for (i = 0; i < size; i++) {
+		LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
+				 ps->ps_cpt, sizeof(kib_phys_mr_t));
+		if (pmr == NULL)
+			break;
+
+		pmr->pmr_pool = ppo;
+		LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
+		if (pmr->pmr_ipb == NULL)
+			break;
+
+		list_add(&pmr->pmr_list, &pool->po_free_list);
+	}
+
+	if (i < size) {
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
+	*pp_po = pool;
+	return 0;
+}
+
+static void
+kiblnd_destroy_tx_pool(kib_pool_t *pool)
+{
+	kib_tx_pool_t  *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
+	int	     i;
+
+	LASSERT (pool->po_allocated == 0);
+
+	if (tpo->tpo_tx_pages != NULL) {
+		kiblnd_unmap_tx_pool(tpo);
+		kiblnd_free_pages(tpo->tpo_tx_pages);
+	}
+
+	if (tpo->tpo_tx_descs == NULL)
+		goto out;
+
+	for (i = 0; i < pool->po_size; i++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+		list_del(&tx->tx_list);
+		if (tx->tx_pages != NULL)
+			LIBCFS_FREE(tx->tx_pages,
+				    LNET_MAX_IOV *
+				    sizeof(*tx->tx_pages));
+		if (tx->tx_frags != NULL)
+			LIBCFS_FREE(tx->tx_frags,
+				    IBLND_MAX_RDMA_FRAGS *
+					    sizeof(*tx->tx_frags));
+		if (tx->tx_wrq != NULL)
+			LIBCFS_FREE(tx->tx_wrq,
+				    (1 + IBLND_MAX_RDMA_FRAGS) *
+				    sizeof(*tx->tx_wrq));
+		if (tx->tx_sge != NULL)
+			LIBCFS_FREE(tx->tx_sge,
+				    (1 + IBLND_MAX_RDMA_FRAGS) *
+				    sizeof(*tx->tx_sge));
+		if (tx->tx_rd != NULL)
+			LIBCFS_FREE(tx->tx_rd,
+				    offsetof(kib_rdma_desc_t,
+					     rd_frags[IBLND_MAX_RDMA_FRAGS]));
+	}
+
+	LIBCFS_FREE(tpo->tpo_tx_descs,
+		    pool->po_size * sizeof(kib_tx_t));
+out:
+	kiblnd_fini_pool(pool);
+	LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+}
+
+static int kiblnd_tx_pool_size(int ncpts)
+{
+	int ntx = *kiblnd_tunables.kib_ntx / ncpts;
+
+	return max(IBLND_TX_POOL, ntx);
+}
+
+static int
+kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
+{
+	int	    i;
+	int	    npg;
+	kib_pool_t    *pool;
+	kib_tx_pool_t *tpo;
+
+	LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
+	if (tpo == NULL) {
+		CERROR("Failed to allocate TX pool\n");
+		return -ENOMEM;
+	}
+
+	pool = &tpo->tpo_pool;
+	kiblnd_init_pool(ps, pool, size);
+	tpo->tpo_tx_descs = NULL;
+	tpo->tpo_tx_pages = NULL;
+
+	npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
+	if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
+		CERROR("Can't allocate tx pages: %d\n", npg);
+		LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
+		return -ENOMEM;
+	}
+
+	LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
+			 size * sizeof(kib_tx_t));
+	if (tpo->tpo_tx_descs == NULL) {
+		CERROR("Can't allocate %d tx descriptors\n", size);
+		ps->ps_pool_destroy(pool);
+		return -ENOMEM;
+	}
+
+	memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
+
+	for (i = 0; i < size; i++) {
+		kib_tx_t *tx = &tpo->tpo_tx_descs[i];
+
+		tx->tx_pool = tpo;
+		if (ps->ps_net->ibn_fmr_ps != NULL) {
+			LIBCFS_CPT_ALLOC(tx->tx_pages,
+					 lnet_cpt_table(), ps->ps_cpt,
+					 LNET_MAX_IOV * sizeof(*tx->tx_pages));
+			if (tx->tx_pages == NULL)
+				break;
+		}
+
+		LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
+				 IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
+		if (tx->tx_frags == NULL)
+			break;
+
+		sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
+
+		LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_wrq));
+		if (tx->tx_wrq == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
+				 (1 + IBLND_MAX_RDMA_FRAGS) *
+				 sizeof(*tx->tx_sge));
+		if (tx->tx_sge == NULL)
+			break;
+
+		LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
+				 offsetof(kib_rdma_desc_t,
+					  rd_frags[IBLND_MAX_RDMA_FRAGS]));
+		if (tx->tx_rd == NULL)
+			break;
+	}
+
+	if (i == size) {
+		kiblnd_map_tx_pool(tpo);
+		*pp_po = pool;
+		return 0;
+	}
+
+	ps->ps_pool_destroy(pool);
+	return -ENOMEM;
+}
+
+static void
+kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
+{
+	kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
+					     tps_poolset);
+	kib_tx_t	 *tx  = list_entry(node, kib_tx_t, tx_list);
+
+	tx->tx_cookie = tps->tps_next_tx_cookie ++;
+}
+
+void
+kiblnd_net_fini_pools(kib_net_t *net)
+{
+	int	i;
+
+	cfs_cpt_for_each(i, lnet_cpt_table()) {
+		kib_tx_poolset_t	*tps;
+		kib_fmr_poolset_t	*fps;
+		kib_pmr_poolset_t	*pps;
+
+		if (net->ibn_tx_ps != NULL) {
+			tps = net->ibn_tx_ps[i];
+			kiblnd_fini_poolset(&tps->tps_poolset);
+		}
+
+		if (net->ibn_fmr_ps != NULL) {
+			fps = net->ibn_fmr_ps[i];
+			kiblnd_fini_fmr_poolset(fps);
+		}
+
+		if (net->ibn_pmr_ps != NULL) {
+			pps = net->ibn_pmr_ps[i];
+			kiblnd_fini_poolset(&pps->pps_poolset);
+		}
+	}
+
+	if (net->ibn_tx_ps != NULL) {
+		cfs_percpt_free(net->ibn_tx_ps);
+		net->ibn_tx_ps = NULL;
+	}
+
+	if (net->ibn_fmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_fmr_ps);
+		net->ibn_fmr_ps = NULL;
+	}
+
+	if (net->ibn_pmr_ps != NULL) {
+		cfs_percpt_free(net->ibn_pmr_ps);
+		net->ibn_pmr_ps = NULL;
+	}
+}
+
+int
+kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+{
+	unsigned long	flags;
+	int		cpt;
+	int		rc;
+	int		i;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	if (*kiblnd_tunables.kib_map_on_demand == 0 &&
+	    net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
+		read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					   flags);
+		goto create_tx_pool;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (*kiblnd_tunables.kib_fmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_fmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	/* TX pool must be created later than FMR/PMR, see LU-2268
+	 * for details */
+	LASSERT(net->ibn_tx_ps == NULL);
+
+	/* premapping can fail if ibd_nmr > 1, so we always create
+	 * FMR/PMR pool and map-on-demand if premapping failed */
+
+	net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_fmr_poolset_t));
+	if (net->ibn_fmr_ps == NULL) {
+		CERROR("Failed to allocate FMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
+					     kiblnd_fmr_pool_size(ncpts),
+					     kiblnd_fmr_flush_trigger(ncpts));
+		if (rc == -ENOSYS && i == 0) /* no FMR */
+			break; /* create PMR pool */
+
+		if (rc != 0) { /* a real error */
+			CERROR("Can't initialize FMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	if (i > 0) {
+		LASSERT(i == ncpts);
+		goto create_tx_pool;
+	}
+
+	cfs_percpt_free(net->ibn_fmr_ps);
+	net->ibn_fmr_ps = NULL;
+
+	CWARN("Device does not support FMR, failing back to PMR\n");
+
+	if (*kiblnd_tunables.kib_pmr_pool_size <
+	    *kiblnd_tunables.kib_ntx / 4) {
+		CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n",
+		       *kiblnd_tunables.kib_pmr_pool_size,
+		       *kiblnd_tunables.kib_ntx / 4);
+		rc = -EINVAL;
+		goto failed;
+	}
+
+	net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					   sizeof(kib_pmr_poolset_t));
+	if (net->ibn_pmr_ps == NULL) {
+		CERROR("Failed to allocate PMR pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
+					 cpt, net, "PMR",
+					 kiblnd_pmr_pool_size(ncpts),
+					 kiblnd_create_pmr_pool,
+					 kiblnd_destroy_pmr_pool, NULL, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize PMR pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+ create_tx_pool:
+	net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
+					  sizeof(kib_tx_poolset_t));
+	if (net->ibn_tx_ps == NULL) {
+		CERROR("Failed to allocate tx pool array\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	for (i = 0; i < ncpts; i++) {
+		cpt = (cpts == NULL) ? i : cpts[i];
+		rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
+					 cpt, net, "TX",
+					 kiblnd_tx_pool_size(ncpts),
+					 kiblnd_create_tx_pool,
+					 kiblnd_destroy_tx_pool,
+					 kiblnd_tx_init, NULL);
+		if (rc != 0) {
+			CERROR("Can't initialize TX pool for CPT %d: %d\n",
+			       cpt, rc);
+			goto failed;
+		}
+	}
+
+	return 0;
+ failed:
+	kiblnd_net_fini_pools(net);
+	LASSERT(rc != 0);
+	return rc;
+}
+
+static int
+kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
+{
+	struct ib_device_attr *attr;
+	int		    rc;
+
+	/* It's safe to assume a HCA can handle a page size
+	 * matching that of the native system */
+	hdev->ibh_page_shift = PAGE_SHIFT;
+	hdev->ibh_page_size  = 1 << PAGE_SHIFT;
+	hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
+
+	LIBCFS_ALLOC(attr, sizeof(*attr));
+	if (attr == NULL) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	rc = ib_query_device(hdev->ibh_ibdev, attr);
+	if (rc == 0)
+		hdev->ibh_mr_size = attr->max_mr_size;
+
+	LIBCFS_FREE(attr, sizeof(*attr));
+
+	if (rc != 0) {
+		CERROR("Failed to query IB device: %d\n", rc);
+		return rc;
+	}
+
+	if (hdev->ibh_mr_size == ~0ULL) {
+		hdev->ibh_mr_shift = 64;
+		return 0;
+	}
+
+	for (hdev->ibh_mr_shift = 0;
+	     hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift ++) {
+		if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
+		    hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
+			return 0;
+	}
+
+	CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
+	return -EINVAL;
+}
+
+void
+kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
+{
+	int     i;
+
+	if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
+		return;
+
+	for (i = 0; i < hdev->ibh_nmrs; i++) {
+		if (hdev->ibh_mrs[i] == NULL)
+			break;
+
+		ib_dereg_mr(hdev->ibh_mrs[i]);
+	}
+
+	LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	hdev->ibh_mrs  = NULL;
+	hdev->ibh_nmrs = 0;
+}
+
+void
+kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
+{
+	kiblnd_hdev_cleanup_mrs(hdev);
+
+	if (hdev->ibh_pd != NULL)
+		ib_dealloc_pd(hdev->ibh_pd);
+
+	if (hdev->ibh_cmid != NULL)
+		rdma_destroy_id(hdev->ibh_cmid);
+
+	LIBCFS_FREE(hdev, sizeof(*hdev));
+}
+
+int
+kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
+{
+	struct ib_mr *mr;
+	int	   i;
+	int	   rc;
+	__u64	 mm_size;
+	__u64	 mr_size;
+	int	   acflags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE;
+
+	rc = kiblnd_hdev_get_attr(hdev);
+	if (rc != 0)
+		return rc;
+
+	if (hdev->ibh_mr_shift == 64) {
+		LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
+		if (hdev->ibh_mrs == NULL) {
+			CERROR("Failed to allocate MRs table\n");
+			return -ENOMEM;
+		}
+
+		hdev->ibh_mrs[0] = NULL;
+		hdev->ibh_nmrs   = 1;
+
+		mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		hdev->ibh_mrs[0] = mr;
+
+		goto out;
+	}
+
+	mr_size = (1ULL << hdev->ibh_mr_shift);
+	mm_size = (unsigned long)high_memory - PAGE_OFFSET;
+
+	hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
+
+	if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
+		/* it's 4T..., assume we will re-code at that time */
+		CERROR("Can't support memory size: x"LPX64
+		       " with MR size: x"LPX64"\n", mm_size, mr_size);
+		return -EINVAL;
+	}
+
+	/* create an array of MRs to cover all memory */
+	LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+	if (hdev->ibh_mrs == NULL) {
+		CERROR("Failed to allocate MRs' table\n");
+		return -ENOMEM;
+	}
+
+	memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
+
+	for (i = 0; i < hdev->ibh_nmrs; i++) {
+		struct ib_phys_buf ipb;
+		__u64	      iova;
+
+		ipb.size = hdev->ibh_mr_size;
+		ipb.addr = i * mr_size;
+		iova     = ipb.addr;
+
+		mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
+		if (IS_ERR(mr)) {
+			CERROR("Failed ib_reg_phys_mr addr "LPX64
+			       " size "LPX64" : %ld\n",
+			       ipb.addr, ipb.size, PTR_ERR(mr));
+			kiblnd_hdev_cleanup_mrs(hdev);
+			return PTR_ERR(mr);
+		}
+
+		LASSERT (iova == ipb.addr);
+
+		hdev->ibh_mrs[i] = mr;
+	}
+
+out:
+	if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
+		LCONSOLE_INFO("Register global MR array, MR size: "
+			      LPX64", array size: %d\n",
+			      hdev->ibh_mr_size, hdev->ibh_nmrs);
+	return 0;
+}
+
+static int
+kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{       /* DUMMY */
+	return 0;
+}
+
+static int
+kiblnd_dev_need_failover(kib_dev_t *dev)
+{
+	struct rdma_cm_id  *cmid;
+	struct sockaddr_in  srcaddr;
+	struct sockaddr_in  dstaddr;
+	int		 rc;
+
+	if (dev->ibd_hdev == NULL || /* initializing */
+	    dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
+	    *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
+		return 1;
+
+	/* XXX: it's UGLY, but I don't have better way to find
+	 * ib-bonding HCA failover because:
+	 *
+	 * a. no reliable CM event for HCA failover...
+	 * b. no OFED API to get ib_device for current net_device...
+	 *
+	 * We have only two choices at this point:
+	 *
+	 * a. rdma_bind_addr(), it will conflict with listener cmid
+	 * b. rdma_resolve_addr() to zero addr */
+	cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		return rc;
+	}
+
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family      = AF_INET;
+	srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
+			       (struct sockaddr *)&dstaddr, 1);
+	if (rc != 0 || cmid->device == NULL) {
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		return rc;
+	}
+
+	if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
+		/* don't need device failover */
+		rdma_destroy_id(cmid);
+		return 0;
+	}
+
+	return 1;
+}
+
+int
+kiblnd_dev_failover(kib_dev_t *dev)
+{
+	LIST_HEAD      (zombie_tpo);
+	LIST_HEAD      (zombie_ppo);
+	LIST_HEAD      (zombie_fpo);
+	struct rdma_cm_id  *cmid  = NULL;
+	kib_hca_dev_t      *hdev  = NULL;
+	kib_hca_dev_t      *old;
+	struct ib_pd       *pd;
+	kib_net_t	  *net;
+	struct sockaddr_in  addr;
+	unsigned long       flags;
+	int		 rc = 0;
+	int		    i;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
+		 dev->ibd_can_failover ||
+		 dev->ibd_hdev == NULL);
+
+	rc = kiblnd_dev_need_failover(dev);
+	if (rc <= 0)
+		goto out;
+
+	if (dev->ibd_hdev != NULL &&
+	    dev->ibd_hdev->ibh_cmid != NULL) {
+		/* XXX it's not good to close old listener at here,
+		 * because we can fail to create new listener.
+		 * But we have to close it now, otherwise rdma_bind_addr
+		 * will return EADDRINUSE... How crap! */
+		write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+		cmid = dev->ibd_hdev->ibh_cmid;
+		/* make next schedule of kiblnd_dev_need_failover()
+		 * return 1 for me */
+		dev->ibd_hdev->ibh_cmid  = NULL;
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		rdma_destroy_id(cmid);
+	}
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
+				     IB_QPT_RC);
+	if (IS_ERR(cmid)) {
+		rc = PTR_ERR(cmid);
+		CERROR("Failed to create cmid for failover: %d\n", rc);
+		goto out;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sin_family      = AF_INET;
+	addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
+	addr.sin_port	= htons(*kiblnd_tunables.kib_service);
+
+	/* Bind to failover device or port */
+	rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
+	if (rc != 0 || cmid->device == NULL) {
+		CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n",
+		       dev->ibd_ifname, HIPQUAD(dev->ibd_ifip),
+		       cmid->device, rc);
+		rdma_destroy_id(cmid);
+		goto out;
+	}
+
+	LIBCFS_ALLOC(hdev, sizeof(*hdev));
+	if (hdev == NULL) {
+		CERROR("Failed to allocate kib_hca_dev\n");
+		rdma_destroy_id(cmid);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	atomic_set(&hdev->ibh_ref, 1);
+	hdev->ibh_dev   = dev;
+	hdev->ibh_cmid  = cmid;
+	hdev->ibh_ibdev = cmid->device;
+
+	pd = ib_alloc_pd(cmid->device);
+	if (IS_ERR(pd)) {
+		rc = PTR_ERR(pd);
+		CERROR("Can't allocate PD: %d\n", rc);
+		goto out;
+	}
+
+	hdev->ibh_pd = pd;
+
+	rc = rdma_listen(cmid, 0);
+	if (rc != 0) {
+		CERROR("Can't start new listener: %d\n", rc);
+		goto out;
+	}
+
+	rc = kiblnd_hdev_setup_mrs(hdev);
+	if (rc != 0) {
+		CERROR("Can't setup device: %d\n", rc);
+		goto out;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	old = dev->ibd_hdev;
+	dev->ibd_hdev = hdev; /* take over the refcount */
+	hdev = old;
+
+	list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
+		cfs_cpt_for_each(i, lnet_cpt_table()) {
+			kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
+					    &zombie_tpo);
+
+			if (net->ibn_fmr_ps != NULL) {
+				kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
+							&zombie_fpo);
+
+			} else if (net->ibn_pmr_ps != NULL) {
+				kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
+						    pps_poolset, &zombie_ppo);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+ out:
+	if (!list_empty(&zombie_tpo))
+		kiblnd_destroy_pool_list(&zombie_tpo);
+	if (!list_empty(&zombie_ppo))
+		kiblnd_destroy_pool_list(&zombie_ppo);
+	if (!list_empty(&zombie_fpo))
+		kiblnd_destroy_fmr_pool_list(&zombie_fpo);
+	if (hdev != NULL)
+		kiblnd_hdev_decref(hdev);
+
+	if (rc != 0)
+		dev->ibd_failed_failover++;
+	else
+		dev->ibd_failed_failover = 0;
+
+	return rc;
+}
+
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+	LASSERT (dev->ibd_nnets == 0);
+	LASSERT (list_empty(&dev->ibd_nets));
+
+	list_del(&dev->ibd_fail_list);
+	list_del(&dev->ibd_list);
+
+	if (dev->ibd_hdev != NULL)
+		kiblnd_hdev_decref(dev->ibd_hdev);
+
+	LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+kib_dev_t *
+kiblnd_create_dev(char *ifname)
+{
+	struct net_device *netdev;
+	kib_dev_t	 *dev;
+	__u32	      netmask;
+	__u32	      ip;
+	int		up;
+	int		rc;
+
+	rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+	if (rc != 0) {
+		CERROR("Can't query IPoIB interface %s: %d\n",
+		       ifname, rc);
+		return NULL;
+	}
+
+	if (!up) {
+		CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(dev, sizeof(*dev));
+	if (dev == NULL)
+		return NULL;
+
+	memset(dev, 0, sizeof(*dev));
+	netdev = dev_get_by_name(&init_net, ifname);
+	if (netdev == NULL) {
+		dev->ibd_can_failover = 0;
+	} else {
+		dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
+		dev_put(netdev);
+	}
+
+	INIT_LIST_HEAD(&dev->ibd_nets);
+	INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
+	INIT_LIST_HEAD(&dev->ibd_fail_list);
+	dev->ibd_ifip = ip;
+	strcpy(&dev->ibd_ifname[0], ifname);
+
+	/* initialize the device */
+	rc = kiblnd_dev_failover(dev);
+	if (rc != 0) {
+		CERROR("Can't initialize device: %d\n", rc);
+		LIBCFS_FREE(dev, sizeof(*dev));
+		return NULL;
+	}
+
+	list_add_tail(&dev->ibd_list,
+			  &kiblnd_data.kib_devs);
+	return dev;
+}
+
+void
+kiblnd_base_shutdown(void)
+{
+	struct kib_sched_info	*sched;
+	int			i;
+
+	LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+	CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	switch (kiblnd_data.kib_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+	case IBLND_INIT_DATA:
+		LASSERT (kiblnd_data.kib_peers != NULL);
+		for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+			LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+		}
+		LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+		LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+		/* flag threads to terminate; wake and wait for them to die */
+		kiblnd_data.kib_shutdown = 1;
+
+		/* NB: we really want to stop scheduler threads net by net
+		 * instead of the whole module, this should be improved
+		 * with dynamic configuration LNet */
+		cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
+			wake_up_all(&sched->ibs_waitq);
+
+		wake_up_all(&kiblnd_data.kib_connd_waitq);
+		wake_up_all(&kiblnd_data.kib_failover_waitq);
+
+		i = 2;
+		while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+			       "Waiting for %d threads to terminate\n",
+			       atomic_read(&kiblnd_data.kib_nthreads));
+			cfs_pause(cfs_time_seconds(1));
+		}
+
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		break;
+	}
+
+	if (kiblnd_data.kib_peers != NULL) {
+		LIBCFS_FREE(kiblnd_data.kib_peers,
+			    sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	}
+
+	if (kiblnd_data.kib_scheds != NULL)
+		cfs_percpt_free(kiblnd_data.kib_scheds);
+
+	CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+	module_put(THIS_MODULE);
+}
+
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+	kib_net_t	*net = ni->ni_data;
+	rwlock_t     *g_lock = &kiblnd_data.kib_global_lock;
+	int	       i;
+	unsigned long     flags;
+
+	LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+	if (net == NULL)
+		goto out;
+
+	CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	write_lock_irqsave(g_lock, flags);
+	net->ibn_shutdown = 1;
+	write_unlock_irqrestore(g_lock, flags);
+
+	switch (net->ibn_init) {
+	default:
+		LBUG();
+
+	case IBLND_INIT_ALL:
+		/* nuke all existing peers within this net */
+		kiblnd_del_peer(ni, LNET_NID_ANY);
+
+		/* Wait for all peer state to clean up */
+		i = 2;
+		while (atomic_read(&net->ibn_npeers) != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+			       "%s: waiting for %d peers to disconnect\n",
+			       libcfs_nid2str(ni->ni_nid),
+			       atomic_read(&net->ibn_npeers));
+			cfs_pause(cfs_time_seconds(1));
+		}
+
+		kiblnd_net_fini_pools(net);
+
+		write_lock_irqsave(g_lock, flags);
+		LASSERT(net->ibn_dev->ibd_nnets > 0);
+		net->ibn_dev->ibd_nnets--;
+		list_del(&net->ibn_list);
+		write_unlock_irqrestore(g_lock, flags);
+
+		/* fall through */
+
+	case IBLND_INIT_NOTHING:
+		LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+		if (net->ibn_dev != NULL &&
+		    net->ibn_dev->ibd_nnets == 0)
+			kiblnd_destroy_dev(net->ibn_dev);
+
+		break;
+	}
+
+	CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+	       atomic_read(&libcfs_kmemory));
+
+	net->ibn_init = IBLND_INIT_NOTHING;
+	ni->ni_data = NULL;
+
+	LIBCFS_FREE(net, sizeof(*net));
+
+out:
+	if (list_empty(&kiblnd_data.kib_devs))
+		kiblnd_base_shutdown();
+	return;
+}
+
+int
+kiblnd_base_startup(void)
+{
+	struct kib_sched_info	*sched;
+	int			rc;
+	int			i;
+
+	LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+	try_module_get(THIS_MODULE);
+	memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+	rwlock_init(&kiblnd_data.kib_global_lock);
+
+	INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+	INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
+
+	kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+	LIBCFS_ALLOC(kiblnd_data.kib_peers,
+		     sizeof(struct list_head) *
+			    kiblnd_data.kib_peer_hash_size);
+	if (kiblnd_data.kib_peers == NULL) {
+		goto failed;
+	}
+	for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+		INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+	spin_lock_init(&kiblnd_data.kib_connd_lock);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+	INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+	init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+	init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
+
+	kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
+						  sizeof(*sched));
+	if (kiblnd_data.kib_scheds == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
+		int	nthrs;
+
+		spin_lock_init(&sched->ibs_lock);
+		INIT_LIST_HEAD(&sched->ibs_conns);
+		init_waitqueue_head(&sched->ibs_waitq);
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
+		} else {
+			/* max to half of CPUs, another half is reserved for
+			 * upper layer modules */
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+		}
+
+		sched->ibs_nthreads_max = nthrs;
+		sched->ibs_cpt = i;
+	}
+
+	kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+	/* lists/ptrs/locks initialised */
+	kiblnd_data.kib_init = IBLND_INIT_DATA;
+	/*****************************************************/
+
+	rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+		goto failed;
+	}
+
+	if (*kiblnd_tunables.kib_dev_failover != 0)
+		rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
+					 "kiblnd_failover");
+
+	if (rc != 0) {
+		CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	kiblnd_data.kib_init = IBLND_INIT_ALL;
+	/*****************************************************/
+
+	return 0;
+
+ failed:
+	kiblnd_base_shutdown();
+	return -ENETDOWN;
+}
+
+int
+kiblnd_start_schedulers(struct kib_sched_info *sched)
+{
+	int	rc = 0;
+	int	nthrs;
+	int	i;
+
+	if (sched->ibs_nthreads == 0) {
+		if (*kiblnd_tunables.kib_nscheds > 0) {
+			nthrs = sched->ibs_nthreads_max;
+		} else {
+			nthrs = cfs_cpt_weight(lnet_cpt_table(),
+					       sched->ibs_cpt);
+			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
+			nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
+		}
+	} else {
+		LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
+		/* increase one thread if there is new interface */
+		nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
+	}
+
+	for (i = 0; i < nthrs; i++) {
+		long	id;
+		char	name[20];
+		id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
+		snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+			 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
+		rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
+		if (rc == 0)
+			continue;
+
+		CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+		       sched->ibs_cpt, sched->ibs_nthreads + i, rc);
+		break;
+	}
+
+	sched->ibs_nthreads += i;
+	return rc;
+}
+
+int
+kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
+{
+	int	cpt;
+	int	rc;
+	int	i;
+
+	for (i = 0; i < ncpts; i++) {
+		struct kib_sched_info *sched;
+
+		cpt = (cpts == NULL) ? i : cpts[i];
+		sched = kiblnd_data.kib_scheds[cpt];
+
+		if (!newdev && sched->ibs_nthreads > 0)
+			continue;
+
+		rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
+		if (rc != 0) {
+			CERROR("Failed to start scheduler threads for %s\n",
+			       dev->ibd_ifname);
+			return rc;
+		}
+	}
+	return 0;
+}
+
+kib_dev_t *
+kiblnd_dev_search(char *ifname)
+{
+	kib_dev_t	*alias = NULL;
+	kib_dev_t	*dev;
+	char		*colon;
+	char		*colon2;
+
+	colon = strchr(ifname, ':');
+	list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			return dev;
+
+		if (alias != NULL)
+			continue;
+
+		colon2 = strchr(dev->ibd_ifname, ':');
+		if (colon != NULL)
+			*colon = 0;
+		if (colon2 != NULL)
+			*colon2 = 0;
+
+		if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
+			alias = dev;
+
+		if (colon != NULL)
+			*colon = ':';
+		if (colon2 != NULL)
+			*colon2 = ':';
+	}
+	return alias;
+}
+
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+	char		     *ifname;
+	kib_dev_t		*ibdev = NULL;
+	kib_net_t		*net;
+	struct timeval	    tv;
+	unsigned long	     flags;
+	int		       rc;
+	int			  newdev;
+
+	LASSERT (ni->ni_lnd == &the_o2iblnd);
+
+	if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+		rc = kiblnd_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	ni->ni_data = net;
+	if (net == NULL)
+		goto failed;
+
+	memset(net, 0, sizeof(*net));
+
+	do_gettimeofday(&tv);
+	net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+	ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
+	ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
+	ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
+	ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] != NULL) {
+		/* Use the IPoIB interface specified in 'networks=' */
+
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Multiple interfaces not supported\n");
+			goto failed;
+		}
+
+		ifname = ni->ni_interfaces[0];
+	} else {
+		ifname = *kiblnd_tunables.kib_default_ipif;
+	}
+
+	if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+		CERROR("IPoIB interface name too long: %s\n", ifname);
+		goto failed;
+	}
+
+	ibdev = kiblnd_dev_search(ifname);
+
+	newdev = ibdev == NULL;
+	/* hmm...create kib_dev even for alias */
+	if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
+		ibdev = kiblnd_create_dev(ifname);
+
+	if (ibdev == NULL)
+		goto failed;
+
+	net->ibn_dev = ibdev;
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+
+	rc = kiblnd_dev_start_threads(ibdev, newdev,
+				      ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto failed;
+
+	rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0) {
+		CERROR("Failed to initialize NI pools: %d\n", rc);
+		goto failed;
+	}
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	ibdev->ibd_nnets++;
+	list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	net->ibn_init = IBLND_INIT_ALL;
+
+	return 0;
+
+failed:
+	if (net->ibn_dev == NULL && ibdev != NULL)
+		kiblnd_destroy_dev(ibdev);
+
+	kiblnd_shutdown(ni);
+
+	CDEBUG(D_NET, "kiblnd_startup failed\n");
+	return -ENETDOWN;
+}
+
+void __exit
+kiblnd_module_fini (void)
+{
+	lnet_unregister_lnd(&the_o2iblnd);
+	kiblnd_tunables_fini();
+}
+
+int __init
+kiblnd_module_init (void)
+{
+	int    rc;
+
+	CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+	CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+		  <= IBLND_MSG_SIZE);
+
+	rc = kiblnd_tunables_init();
+	if (rc != 0)
+		return rc;
+
+	lnet_register_lnd(&the_o2iblnd);
+
+	return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644
index 000000000000..e4626bf82fc7
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -0,0 +1,1057 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd.h
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/pci.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+#define IBLND_PEER_HASH_SIZE		101	/* # peer lists */
+/* # scheduler loops before reschedule */
+#define IBLND_RESCHED			100
+
+#define IBLND_N_SCHED			2
+#define IBLND_N_SCHED_HIGH		4
+
+typedef struct
+{
+	int	      *kib_dev_failover;     /* HCA failover */
+	unsigned int     *kib_service;	  /* IB service number */
+	int	      *kib_min_reconnect_interval; /* first failed connection retry... */
+	int	      *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+	int	      *kib_cksum;	    /* checksum kib_msg_t? */
+	int	      *kib_timeout;	  /* comms timeout (seconds) */
+	int	      *kib_keepalive;	/* keepalive timeout (seconds) */
+	int	      *kib_ntx;	      /* # tx descs */
+	int	      *kib_credits;	  /* # concurrent sends */
+	int	      *kib_peertxcredits;    /* # concurrent sends to 1 peer */
+	int	      *kib_peerrtrcredits;   /* # per-peer router buffer credits */
+	int	      *kib_peercredits_hiw;  /* # when eagerly to return credits */
+	int	      *kib_peertimeout;      /* seconds to consider peer dead */
+	char	    **kib_default_ipif;     /* default IPoIB interface */
+	int	      *kib_retry_count;
+	int	      *kib_rnr_retry_count;
+	int	      *kib_concurrent_sends; /* send work queue sizing */
+	int		 *kib_ib_mtu;		/* IB MTU */
+	int	      *kib_map_on_demand;    /* map-on-demand if RD has more fragments
+						 * than this value, 0 disable map-on-demand */
+	int	      *kib_pmr_pool_size;    /* # physical MR in pool */
+	int	      *kib_fmr_pool_size;    /* # FMRs in pool */
+	int	      *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+	int	      *kib_fmr_cache;	/* enable FMR pool cache? */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *kib_sysctl;  /* sysctl interface */
+#endif
+	int	      *kib_require_priv_port;/* accept only privileged ports */
+	int	      *kib_use_priv_port;    /* use privileged port for active connect */
+	/* # threads on each CPT */
+	int		 *kib_nscheds;
+} kib_tunables_t;
+
+extern kib_tunables_t  kiblnd_tunables;
+
+#define IBLND_MSG_QUEUE_SIZE_V1      8	  /* V1 only : # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER_V1    7	  /* V1 only : when eagerly to return credits */
+
+#define IBLND_CREDITS_DEFAULT	8	  /* default # of peer credits */
+#define IBLND_CREDITS_MAX	  ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1)  /* Max # of peer credits */
+
+#define IBLND_MSG_QUEUE_SIZE(v)    ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MSG_QUEUE_SIZE_V1 :   \
+				     *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
+#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_CREDIT_HIGHWATER_V1 : \
+				     *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+
+static inline int
+kiblnd_concurrent_sends_v1(void)
+{
+	if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+		return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+	if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+		return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+
+	return *kiblnd_tunables.kib_concurrent_sends;
+}
+
+#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
+				     kiblnd_concurrent_sends_v1() : \
+				     *kiblnd_tunables.kib_concurrent_sends)
+/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
+#define IBLND_OOB_CAPABLE(v)       ((v) != IBLND_MSG_VERSION_1)
+#define IBLND_OOB_MSGS(v)	   (IBLND_OOB_CAPABLE(v) ? 2 : 0)
+
+#define IBLND_MSG_SIZE	      (4<<10)		 /* max size of queued messages (inc hdr) */
+#define IBLND_MAX_RDMA_FRAGS	 LNET_MAX_IOV	   /* max # of fragments supported */
+#define IBLND_CFG_RDMA_FRAGS       (*kiblnd_tunables.kib_map_on_demand != 0 ? \
+				    *kiblnd_tunables.kib_map_on_demand :      \
+				     IBLND_MAX_RDMA_FRAGS)  /* max # of fragments configured by user */
+#define IBLND_RDMA_FRAGS(v)	((v) == IBLND_MSG_VERSION_1 ? \
+				     IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
+
+/************************/
+/* derived constants... */
+/* Pools (shared by connections on each CPT) */
+/* These pools can grow at runtime, so don't need give a very large value */
+#define IBLND_TX_POOL			256
+#define IBLND_PMR_POOL			256
+#define IBLND_FMR_POOL			256
+#define IBLND_FMR_POOL_FLUSH		192
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()	    (*kiblnd_tunables.kib_ntx)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS(v)	    (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v))
+#define IBLND_RX_MSG_BYTES(v)       (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES(v)      ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE)
+
+/* WRs and CQEs (per connection) */
+#define IBLND_RECV_WRS(v)	    IBLND_RX_MSGS(v)
+#define IBLND_SEND_WRS(v)	  ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v))
+#define IBLND_CQ_ENTRIES(v)	 (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v))
+
+struct kib_hca_dev;
+
+/* o2iblnd can run over aliased interface */
+#ifdef IFALIASZ
+#define KIB_IFNAME_SIZE	      IFALIASZ
+#else
+#define KIB_IFNAME_SIZE	      256
+#endif
+
+typedef struct
+{
+	struct list_head	   ibd_list;	  /* chain on kib_devs */
+	struct list_head	   ibd_fail_list;     /* chain on kib_failed_devs */
+	__u32		ibd_ifip;	  /* IPoIB interface IP */
+	/** IPoIB interface name */
+	char		 ibd_ifname[KIB_IFNAME_SIZE];
+	int		  ibd_nnets;	 /* # nets extant */
+
+	cfs_time_t	   ibd_next_failover;
+	int		  ibd_failed_failover; /* # failover failures */
+	unsigned int	 ibd_failover;      /* failover in progress */
+	unsigned int	 ibd_can_failover;  /* IPoIB interface is a bonding master */
+	struct list_head	   ibd_nets;
+	struct kib_hca_dev  *ibd_hdev;
+} kib_dev_t;
+
+typedef struct kib_hca_dev
+{
+	struct rdma_cm_id   *ibh_cmid;	  /* listener cmid */
+	struct ib_device    *ibh_ibdev;	 /* IB device */
+	int		  ibh_page_shift;    /* page shift of current HCA */
+	int		  ibh_page_size;     /* page size of current HCA */
+	__u64		ibh_page_mask;     /* page mask of current HCA */
+	int		  ibh_mr_shift;      /* bits shift of max MR size */
+	__u64		ibh_mr_size;       /* size of MR */
+	int		  ibh_nmrs;	  /* # of global MRs */
+	struct ib_mr       **ibh_mrs;	   /* global MR */
+	struct ib_pd	*ibh_pd;	    /* PD */
+	kib_dev_t	   *ibh_dev;	   /* owner */
+	atomic_t	 ibh_ref;	   /* refcount */
+} kib_hca_dev_t;
+
+/** # of seconds to keep pool alive */
+#define IBLND_POOL_DEADLINE     300
+/** # of seconds to retry if allocation failed */
+#define IBLND_POOL_RETRY	1
+
+typedef struct
+{
+	int		     ibp_npages;	     /* # pages */
+	struct page	    *ibp_pages[0];	   /* page array */
+} kib_pages_t;
+
+struct kib_pmr_pool;
+
+typedef struct {
+	struct list_head	      pmr_list;	       /* chain node */
+	struct ib_phys_buf     *pmr_ipb;		/* physical buffer */
+	struct ib_mr	   *pmr_mr;		 /* IB MR */
+	struct kib_pmr_pool    *pmr_pool;	       /* owner of this MR */
+	__u64		   pmr_iova;	       /* Virtual I/O address */
+	int		     pmr_refcount;	   /* reference count */
+} kib_phys_mr_t;
+
+struct kib_pool;
+struct kib_poolset;
+
+typedef int  (*kib_ps_pool_create_t)(struct kib_poolset *ps,
+				     int inc, struct kib_pool **pp_po);
+typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po);
+typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node);
+typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node);
+
+struct kib_net;
+
+#define IBLND_POOL_NAME_LEN     32
+
+typedef struct kib_poolset
+{
+	spinlock_t		ps_lock;		/* serialize */
+	struct kib_net	 *ps_net;		 /* network it belongs to */
+	char		    ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */
+	struct list_head	      ps_pool_list;	   /* list of pools */
+	struct list_head	      ps_failed_pool_list;    /* failed pool list */
+	cfs_time_t	      ps_next_retry;	  /* time stamp for retry if failed to allocate */
+	int		     ps_increasing;	  /* is allocating new pool */
+	int		     ps_pool_size;	   /* new pool size */
+	int			ps_cpt;			/* CPT id */
+
+	kib_ps_pool_create_t    ps_pool_create;	 /* create a new pool */
+	kib_ps_pool_destroy_t   ps_pool_destroy;	/* destroy a pool */
+	kib_ps_node_init_t      ps_node_init;	   /* initialize new allocated node */
+	kib_ps_node_fini_t      ps_node_fini;	   /* finalize node */
+} kib_poolset_t;
+
+typedef struct kib_pool
+{
+	struct list_head	      po_list;		/* chain on pool list */
+	struct list_head	      po_free_list;	   /* pre-allocated node */
+	kib_poolset_t	  *po_owner;	       /* pool_set of this pool */
+	cfs_time_t	      po_deadline;	    /* deadline of this pool */
+	int		     po_allocated;	   /* # of elements in use */
+	int		     po_failed;	      /* pool is created on failed HCA */
+	int		     po_size;		/* # of pre-allocated elements */
+} kib_pool_t;
+
+typedef struct {
+	kib_poolset_t	   tps_poolset;	    /* pool-set */
+	__u64		   tps_next_tx_cookie;     /* cookie of TX */
+} kib_tx_poolset_t;
+
+typedef struct {
+	kib_pool_t	      tpo_pool;	       /* pool */
+	struct kib_hca_dev     *tpo_hdev;	       /* device for this pool */
+	struct kib_tx	  *tpo_tx_descs;	   /* all the tx descriptors */
+	kib_pages_t	    *tpo_tx_pages;	   /* premapped tx msg pages */
+} kib_tx_pool_t;
+
+typedef struct {
+	kib_poolset_t	   pps_poolset;	    /* pool-set */
+} kib_pmr_poolset_t;
+
+typedef struct kib_pmr_pool {
+	struct kib_hca_dev     *ppo_hdev;	       /* device for this pool */
+	kib_pool_t	      ppo_pool;	       /* pool */
+} kib_pmr_pool_t;
+
+typedef struct
+{
+	spinlock_t		fps_lock;		/* serialize */
+	struct kib_net	 *fps_net;		/* IB network */
+	struct list_head	      fps_pool_list;	  /* FMR pool list */
+	struct list_head	      fps_failed_pool_list;   /* FMR pool list */
+	__u64		   fps_version;	    /* validity stamp */
+	int			fps_cpt;		/* CPT id */
+	int			fps_pool_size;
+	int			fps_flush_trigger;
+	/* is allocating new pool */
+	int			fps_increasing;
+	/* time stamp for retry if failed to allocate */
+	cfs_time_t		fps_next_retry;
+} kib_fmr_poolset_t;
+
+typedef struct
+{
+	struct list_head	      fpo_list;	       /* chain on pool list */
+	struct kib_hca_dev     *fpo_hdev;	       /* device for this pool */
+	kib_fmr_poolset_t      *fpo_owner;	      /* owner of this pool */
+	struct ib_fmr_pool     *fpo_fmr_pool;	   /* IB FMR pool */
+	cfs_time_t	      fpo_deadline;	   /* deadline of this pool */
+	int		     fpo_failed;	     /* fmr pool is failed */
+	int		     fpo_map_count;	  /* # of mapped FMR */
+} kib_fmr_pool_t;
+
+typedef struct {
+	struct ib_pool_fmr     *fmr_pfmr;	       /* IB pool fmr */
+	kib_fmr_pool_t	 *fmr_pool;	       /* pool of FMR */
+} kib_fmr_t;
+
+typedef struct kib_net
+{
+	struct list_head	   ibn_list;	  /* chain on kib_dev_t::ibd_nets */
+	__u64		ibn_incarnation;   /* my epoch */
+	int		  ibn_init;	  /* initialisation state */
+	int		  ibn_shutdown;      /* shutting down? */
+
+	atomic_t		ibn_npeers;	/* # peers extant */
+	atomic_t		ibn_nconns;	/* # connections extant */
+
+	kib_tx_poolset_t	**ibn_tx_ps;	/* tx pool-set */
+	kib_fmr_poolset_t	**ibn_fmr_ps;	/* fmr pool-set */
+	kib_pmr_poolset_t	**ibn_pmr_ps;	/* pmr pool-set */
+
+	kib_dev_t		*ibn_dev;	/* underlying IB device */
+} kib_net_t;
+
+#define KIB_THREAD_SHIFT		16
+#define KIB_THREAD_ID(cpt, tid)		((cpt) << KIB_THREAD_SHIFT | (tid))
+#define KIB_THREAD_CPT(id)		((id) >> KIB_THREAD_SHIFT)
+#define KIB_THREAD_TID(id)		((id) & ((1UL << KIB_THREAD_SHIFT) - 1))
+
+struct kib_sched_info {
+	/* serialise */
+	spinlock_t		ibs_lock;
+	/* schedulers sleep here */
+	wait_queue_head_t		ibs_waitq;
+	/* conns to check for rx completions */
+	struct list_head		ibs_conns;
+	/* number of scheduler threads */
+	int			ibs_nthreads;
+	/* max allowed scheduler threads */
+	int			ibs_nthreads_max;
+	int			ibs_cpt;	/* CPT id */
+};
+
+typedef struct
+{
+	int			kib_init;	/* initialisation state */
+	int			kib_shutdown;	/* shut down? */
+	struct list_head		kib_devs;	/* IB devices extant */
+	/* list head of failed devices */
+	struct list_head		kib_failed_devs;
+	/* schedulers sleep here */
+	wait_queue_head_t		kib_failover_waitq;
+	atomic_t		kib_nthreads;	/* # live threads */
+	/* stabilize net/dev/peer/conn ops */
+	rwlock_t		kib_global_lock;
+	/* hash table of all my known peers */
+	struct list_head		*kib_peers;
+	/* size of kib_peers */
+	int			kib_peer_hash_size;
+	/* the connd task (serialisation assertions) */
+	void			*kib_connd;
+	/* connections to setup/teardown */
+	struct list_head		kib_connd_conns;
+	/* connections with zero refcount */
+	struct list_head		kib_connd_zombies;
+	/* connection daemon sleeps here */
+	wait_queue_head_t		kib_connd_waitq;
+	spinlock_t		kib_connd_lock;	/* serialise */
+	struct ib_qp_attr	kib_error_qpa;	/* QP->ERROR */
+	/* percpt data for schedulers */
+	struct kib_sched_info	**kib_scheds;
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING	 0
+#define IBLND_INIT_DATA	    1
+#define IBLND_INIT_ALL	     2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+typedef struct kib_connparams
+{
+	__u16	     ibcp_queue_depth;
+	__u16	     ibcp_max_frags;
+	__u32	     ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibim_hdr;	     /* portals header */
+	char	      ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+typedef struct
+{
+	__u32	     rf_nob;	       /* # bytes this frag */
+	__u64	     rf_addr;	      /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+	__u32	     rd_key;	       /* local/remote key */
+	__u32	     rd_nfrags;	    /* # fragments */
+	kib_rdma_frag_t   rd_frags[0];	  /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibprm_hdr;	    /* portals header */
+	__u64	     ibprm_cookie;	 /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+	__u64	     ibpam_src_cookie;     /* reflected completion cookie */
+	__u64	     ibpam_dst_cookie;     /* opaque completion cookie */
+	kib_rdma_desc_t   ibpam_rd;	     /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+	lnet_hdr_t	ibgm_hdr;	     /* portals header */
+	__u64	     ibgm_cookie;	  /* opaque completion cookie */
+	kib_rdma_desc_t   ibgm_rd;	      /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+	__u64	     ibcm_cookie;	  /* opaque completion cookie */
+	__s32	     ibcm_status;	  /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+	/* First 2 fields fixed FOR ALL TIME */
+	__u32	     ibm_magic;	    /* I'm an ibnal message */
+	__u16	     ibm_version;	  /* this is my version number */
+
+	__u8	      ibm_type;	     /* msg type */
+	__u8	      ibm_credits;	  /* returned credits */
+	__u32	     ibm_nob;	      /* # bytes in whole message */
+	__u32	     ibm_cksum;	    /* checksum (0 == no checksum) */
+	__u64	     ibm_srcnid;	   /* sender's NID */
+	__u64	     ibm_srcstamp;	 /* sender's incarnation */
+	__u64	     ibm_dstnid;	   /* destination's NID */
+	__u64	     ibm_dststamp;	 /* destination's incarnation */
+
+	union {
+		kib_connparams_t      connparams;
+		kib_immediate_msg_t   immediate;
+		kib_putreq_msg_t      putreq;
+		kib_putack_msg_t      putack;
+		kib_get_msg_t	 get;
+		kib_completion_msg_t  completion;
+	} WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC	/* unique magic */
+
+#define IBLND_MSG_VERSION_1	 0x11
+#define IBLND_MSG_VERSION_2	 0x12
+#define IBLND_MSG_VERSION	   IBLND_MSG_VERSION_2
+
+#define IBLND_MSG_CONNREQ	   0xc0	/* connection request */
+#define IBLND_MSG_CONNACK	   0xc1	/* connection acknowledge */
+#define IBLND_MSG_NOOP	      0xd0	/* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE	 0xd1	/* immediate */
+#define IBLND_MSG_PUT_REQ	   0xd2	/* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK	   0xd3	/* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK	   0xd4	/* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE	  0xd5	/* completion (src->sink) */
+#define IBLND_MSG_GET_REQ	   0xd6	/* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE	  0xd7	/* completion (src->sink: all OK) */
+
+typedef struct {
+	__u32	    ibr_magic;	     /* sender's magic */
+	__u16	    ibr_version;	   /* sender's version */
+	__u8	     ibr_why;	       /* reject reason */
+	__u8	     ibr_padding;	   /* padding */
+	__u64	    ibr_incarnation;       /* incarnation of peer */
+	kib_connparams_t ibr_cp;		/* connection parameters */
+} WIRE_ATTR kib_rej_t;
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE       1	  /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2	  /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL	   3	  /* Anything else */
+
+#define IBLND_REJECT_CONN_UNCOMPAT   4	  /* incompatible version peer */
+#define IBLND_REJECT_CONN_STALE      5	  /* stale peer */
+
+#define IBLND_REJECT_RDMA_FRAGS      6	  /* Fatal: peer's rdma frags can't match mine */
+#define IBLND_REJECT_MSG_QUEUE_SIZE  7	  /* Fatal: peer's msg queue size can't match mine */
+
+/***********************************************************************/
+
+typedef struct kib_rx			   /* receive message */
+{
+	struct list_head		rx_list;      /* queue for attention */
+	struct kib_conn	  *rx_conn;      /* owning conn */
+	int		       rx_nob;       /* # bytes received (-1 while posted) */
+	enum ib_wc_status	 rx_status;    /* completion status */
+	kib_msg_t		*rx_msg;       /* message buffer (host vaddr) */
+	__u64		     rx_msgaddr;   /* message buffer (I/O addr) */
+	DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+	struct ib_recv_wr	 rx_wrq;       /* receive work item... */
+	struct ib_sge	     rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0	     /* don't post */
+#define IBLND_POSTRX_NO_CREDIT    1	     /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2	     /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3	     /* post: give myself back 1 reserved credit */
+
+typedef struct kib_tx			   /* transmit message */
+{
+	struct list_head		tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+	kib_tx_pool_t	    *tx_pool;      /* pool I'm from */
+	struct kib_conn	  *tx_conn;      /* owning conn */
+	short		     tx_sending;   /* # tx callbacks outstanding */
+	short		     tx_queued;    /* queued for sending */
+	short		     tx_waiting;   /* waiting for peer */
+	int		       tx_status;    /* LNET completion status */
+	unsigned long	     tx_deadline;  /* completion deadline */
+	__u64		     tx_cookie;    /* completion cookie */
+	lnet_msg_t	       *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+	kib_msg_t		*tx_msg;       /* message buffer (host vaddr) */
+	__u64		     tx_msgaddr;   /* message buffer (I/O addr) */
+	DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+	int		       tx_nwrq;      /* # send work items */
+	struct ib_send_wr	*tx_wrq;       /* send work items... */
+	struct ib_sge	    *tx_sge;       /* ...and their memory */
+	kib_rdma_desc_t	  *tx_rd;	/* rdma descriptor */
+	int		       tx_nfrags;    /* # entries in... */
+	struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+	__u64		    *tx_pages;     /* rdma phys page addrs */
+	union {
+		kib_phys_mr_t      *pmr;	/* MR for physical buffer */
+		kib_fmr_t	   fmr;	/* FMR */
+	}			 tx_u;
+	int		       tx_dmadir;    /* dma direction */
+} kib_tx_t;
+
+typedef struct kib_connvars
+{
+	/* connection-in-progress variables */
+	kib_msg_t		 cv_msg;
+} kib_connvars_t;
+
+typedef struct kib_conn
+{
+	struct kib_sched_info *ibc_sched;	/* scheduler information */
+	struct kib_peer     *ibc_peer;	  /* owning peer */
+	kib_hca_dev_t       *ibc_hdev;	  /* HCA bound on */
+	struct list_head	   ibc_list;	  /* stash on peer's conn list */
+	struct list_head	   ibc_sched_list;    /* schedule for attention */
+	__u16		ibc_version;       /* version of connection */
+	__u64		ibc_incarnation;   /* which instance of the peer */
+	atomic_t	 ibc_refcount;      /* # users */
+	int		  ibc_state;	 /* what's happening */
+	int		  ibc_nsends_posted; /* # uncompleted sends */
+	int		  ibc_noops_posted;  /* # uncompleted NOOPs */
+	int		  ibc_credits;       /* # credits I have */
+	int		  ibc_outstanding_credits; /* # credits to return */
+	int		  ibc_reserved_credits;/* # ACK/DONE msg credits */
+	int		  ibc_comms_error;   /* set on comms error */
+	unsigned int	     ibc_nrx:16;	/* receive buffers owned */
+	unsigned int	     ibc_scheduled:1;   /* scheduled for attention */
+	unsigned int	     ibc_ready:1;       /* CQ callback fired */
+	/* time of last send */
+	unsigned long	ibc_last_send;
+	/** link chain for kiblnd_check_conns only */
+	struct list_head	   ibc_connd_list;
+	/** rxs completed before ESTABLISHED */
+	struct list_head	   ibc_early_rxs;
+	/** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */
+	struct list_head	   ibc_tx_noops;
+	struct list_head	   ibc_tx_queue;       /* sends that need a credit */
+	struct list_head	   ibc_tx_queue_nocred;/* sends that don't need a credit */
+	struct list_head	   ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+	struct list_head	   ibc_active_txs;     /* active tx awaiting completion */
+	spinlock_t	     ibc_lock;		 /* serialise */
+	kib_rx_t	    *ibc_rxs;	    /* the rx descs */
+	kib_pages_t	 *ibc_rx_pages;       /* premapped rx msg pages */
+
+	struct rdma_cm_id   *ibc_cmid;	   /* CM id */
+	struct ib_cq	*ibc_cq;	     /* completion queue */
+
+	kib_connvars_t      *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+#define IBLND_CONN_INIT	       0	 /* being initialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1	 /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2	 /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED	3	 /* connection established */
+#define IBLND_CONN_CLOSING	    4	 /* being closed */
+#define IBLND_CONN_DISCONNECTED       5	 /* disconnected */
+
+typedef struct kib_peer
+{
+	struct list_head	   ibp_list;	   /* stash on global peer list */
+	lnet_nid_t	   ibp_nid;	    /* who's on the other end(s) */
+	lnet_ni_t	   *ibp_ni;	     /* LNet interface */
+	atomic_t	 ibp_refcount;       /* # users */
+	struct list_head	   ibp_conns;	  /* all active connections */
+	struct list_head	   ibp_tx_queue;       /* msgs waiting for a conn */
+	__u16		ibp_version;	/* version of peer */
+	__u64		ibp_incarnation;    /* incarnation of peer */
+	int		  ibp_connecting;     /* current active connection attempts */
+	int		  ibp_accepting;      /* current passive connection attempts */
+	int		  ibp_error;	  /* errno on closing this peer */
+	cfs_time_t	   ibp_last_alive;     /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+extern kib_data_t      kiblnd_data;
+
+extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+
+static inline void
+kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+	atomic_inc(&hdev->ibh_ref);
+}
+
+static inline void
+kiblnd_hdev_decref(kib_hca_dev_t *hdev)
+{
+	LASSERT (atomic_read(&hdev->ibh_ref) > 0);
+	if (atomic_dec_and_test(&hdev->ibh_ref))
+		kiblnd_hdev_destroy(hdev);
+}
+
+static inline int
+kiblnd_dev_can_failover(kib_dev_t *dev)
+{
+	if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */
+		return 0;
+
+	if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */
+		return 0;
+
+	if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */
+		return 1;
+
+	return dev->ibd_can_failover;
+}
+
+#define kiblnd_conn_addref(conn)				\
+do {							    \
+	CDEBUG(D_NET, "conn[%p] (%d)++\n",		      \
+	       (conn), atomic_read(&(conn)->ibc_refcount)); \
+	atomic_inc(&(conn)->ibc_refcount);		  \
+} while (0)
+
+#define kiblnd_conn_decref(conn)					\
+do {									\
+	unsigned long flags;						\
+									\
+	CDEBUG(D_NET, "conn[%p] (%d)--\n",				\
+	       (conn), atomic_read(&(conn)->ibc_refcount));		\
+	LASSERT_ATOMIC_POS(&(conn)->ibc_refcount);			\
+	if (atomic_dec_and_test(&(conn)->ibc_refcount)) {		\
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);	\
+		list_add_tail(&(conn)->ibc_list,			\
+				  &kiblnd_data.kib_connd_zombies);	\
+		wake_up(&kiblnd_data.kib_connd_waitq);		\
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\
+	}								\
+} while (0)
+
+#define kiblnd_peer_addref(peer)				\
+do {							    \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	atomic_inc(&(peer)->ibp_refcount);		  \
+} while (0)
+
+#define kiblnd_peer_decref(peer)				\
+do {							    \
+	CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",		\
+	       (peer), libcfs_nid2str((peer)->ibp_nid),	 \
+	       atomic_read (&(peer)->ibp_refcount));	\
+	LASSERT_ATOMIC_POS(&(peer)->ibp_refcount);	      \
+	if (atomic_dec_and_test(&(peer)->ibp_refcount))     \
+		kiblnd_destroy_peer(peer);		      \
+} while (0)
+
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+	unsigned int hash =
+		((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
+
+	return (&kiblnd_data.kib_peers [hash]);
+}
+
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+	/* Am I in the peer hash table? */
+	return (!list_empty(&peer->ibp_list));
+}
+
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+	LASSERT (!list_empty(&peer->ibp_conns));
+
+	/* just return the first connection */
+	return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list);
+}
+
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+	return (*kiblnd_tunables.kib_keepalive > 0) &&
+		cfs_time_after(jiffies, conn->ibc_last_send +
+			       *kiblnd_tunables.kib_keepalive*HZ);
+}
+
+static inline int
+kiblnd_need_noop(kib_conn_t *conn)
+{
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	if (conn->ibc_outstanding_credits <
+	    IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+	    !kiblnd_send_keepalive(conn))
+		return 0; /* No need to send NOOP */
+
+	if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+		if (!list_empty(&conn->ibc_tx_queue_nocred))
+			return 0; /* NOOP can be piggybacked */
+
+		/* No tx to piggyback NOOP onto or no credit to send a tx */
+		return (list_empty(&conn->ibc_tx_queue) ||
+			conn->ibc_credits == 0);
+	}
+
+	if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */
+	    !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */
+	    conn->ibc_credits == 0)		    /* no credit */
+		return 0;
+
+	if (conn->ibc_credits == 1 &&      /* last credit reserved for */
+	    conn->ibc_outstanding_credits == 0) /* giving back credits */
+		return 0;
+
+	/* No tx to piggyback NOOP onto or no credit to send a tx */
+	return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1);
+}
+
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+	ib_modify_qp(conn->ibc_cmid->qp,
+		     &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+static inline const char *
+kiblnd_queue2str (kib_conn_t *conn, struct list_head *q)
+{
+	if (q == &conn->ibc_tx_queue)
+		return "tx_queue";
+
+	if (q == &conn->ibc_tx_queue_rsrvd)
+		return "tx_queue_rsrvd";
+
+	if (q == &conn->ibc_tx_queue_nocred)
+		return "tx_queue_nocred";
+
+	if (q == &conn->ibc_active_txs)
+		return "active_txs";
+
+	LBUG();
+	return NULL;
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
+
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+	unsigned long lptr = (unsigned long)ptr;
+
+	LASSERT ((lptr & IBLND_WID_MASK) == 0);
+	LASSERT ((type & ~IBLND_WID_MASK) == 0);
+	return (__u64)(lptr | type);
+}
+
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+	return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK);
+}
+
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+	return (wreqid & IBLND_WID_MASK);
+}
+
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+	conn->ibc_state = state;
+	mb();
+}
+
+static inline void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+	msg->ibm_type = type;
+	msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+	int   i;
+	int   size;
+
+	for (i = size = 0; i < rd->rd_nfrags; i++)
+		size += rd->rd_frags[i].rf_nob;
+
+	return size;
+}
+
+static inline __u64
+kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_frags[index].rf_addr;
+}
+
+static inline __u32
+kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_frags[index].rf_nob;
+}
+
+static inline __u32
+kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index)
+{
+	return rd->rd_key;
+}
+
+static inline int
+kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob)
+{
+	if (nob < rd->rd_frags[index].rf_nob) {
+		rd->rd_frags[index].rf_addr += nob;
+		rd->rd_frags[index].rf_nob  -= nob;
+	} else {
+		index ++;
+	}
+
+	return index;
+}
+
+static inline int
+kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n)
+{
+	LASSERT (msgtype == IBLND_MSG_GET_REQ ||
+		 msgtype == IBLND_MSG_PUT_ACK);
+
+	return msgtype == IBLND_MSG_GET_REQ ?
+	       offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) :
+	       offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+}
+
+
+static inline __u64
+kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return ib_dma_mapping_error(dev, dma_addr);
+}
+
+static inline __u64 kiblnd_dma_map_single(struct ib_device *dev,
+					  void *msg, size_t size,
+					  enum dma_data_direction direction)
+{
+	return ib_dma_map_single(dev, msg, size, direction);
+}
+
+static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
+					   __u64 addr, size_t size,
+					  enum dma_data_direction direction)
+{
+	ib_dma_unmap_single(dev, addr, size, direction);
+}
+
+#define KIBLND_UNMAP_ADDR_SET(p, m, a)  do {} while (0)
+#define KIBLND_UNMAP_ADDR(p, m, a)      (a)
+
+static inline int kiblnd_dma_map_sg(struct ib_device *dev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction)
+{
+	return ib_dma_map_sg(dev, sg, nents, direction);
+}
+
+static inline void kiblnd_dma_unmap_sg(struct ib_device *dev,
+				       struct scatterlist *sg, int nents,
+				       enum dma_data_direction direction)
+{
+	ib_dma_unmap_sg(dev, sg, nents, direction);
+}
+
+static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev,
+					  struct scatterlist *sg)
+{
+	return ib_sg_dma_address(dev, sg);
+}
+
+static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
+					     struct scatterlist *sg)
+{
+	return ib_sg_dma_len(dev, sg);
+}
+
+/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly
+ * right because OFED1.2 defines it as const, to use it we have to add
+ * (void *) cast to overcome "const" */
+
+#define KIBLND_CONN_PARAM(e)	    ((e)->param.conn.private_data)
+#define KIBLND_CONN_PARAM_LEN(e)	((e)->param.conn.private_data_len)
+
+
+struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
+				    kib_rdma_desc_t *rd);
+struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev,
+				 __u64 addr, __u64 size);
+void kiblnd_map_rx_descs(kib_conn_t *conn);
+void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+		  kib_rdma_desc_t *rd, int nfrags);
+void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx);
+void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
+struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
+
+int  kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
+			 int npages, __u64 iov, kib_fmr_t *fmr);
+void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
+
+int  kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
+			 kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr);
+void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr);
+
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name);
+int  kiblnd_failover_thread (void *arg);
+
+int  kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+			struct rdma_cm_event *event);
+int  kiblnd_translate_mtu(int value);
+
+int  kiblnd_dev_failover(kib_dev_t *dev);
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer,
+				      int version, __u64 incarnation);
+int  kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why);
+
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+				int state, int version);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+int  kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid);
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+			 int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
+		      int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+		 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+		 unsigned int offset, unsigned int mlen, unsigned int rlen);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644
index 000000000000..cc6232126dd0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -0,0 +1,3529 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_cb.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+	lnet_msg_t *lntmsg[2];
+	kib_net_t  *net = ni->ni_data;
+	int	 rc;
+	int	 i;
+
+	LASSERT (net != NULL);
+	LASSERT (!in_interrupt());
+	LASSERT (!tx->tx_queued);	       /* mustn't be queued for sending */
+	LASSERT (tx->tx_sending == 0);	  /* mustn't be awaiting sent callback */
+	LASSERT (!tx->tx_waiting);	      /* mustn't be awaiting peer response */
+	LASSERT (tx->tx_pool != NULL);
+
+	kiblnd_unmap_tx(ni, tx);
+
+	/* tx may have up to 2 lnet msgs to finalise */
+	lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+	lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+	rc = tx->tx_status;
+
+	if (tx->tx_conn != NULL) {
+		LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+		kiblnd_conn_decref(tx->tx_conn);
+		tx->tx_conn = NULL;
+	}
+
+	tx->tx_nwrq = 0;
+	tx->tx_status = 0;
+
+	kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list);
+
+	/* delay finalize until my descs have been freed */
+	for (i = 0; i < 2; i++) {
+		if (lntmsg[i] == NULL)
+			continue;
+
+		lnet_finalize(ni, lntmsg[i], rc);
+	}
+}
+
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+	kib_tx_t *tx;
+
+	while (!list_empty (txlist)) {
+		tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+		list_del(&tx->tx_list);
+		/* complete now */
+		tx->tx_waiting = 0;
+		tx->tx_status = status;
+		kiblnd_tx_done(ni, tx);
+	}
+}
+
+kib_tx_t *
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
+{
+	kib_net_t		*net = (kib_net_t *)ni->ni_data;
+	struct list_head		*node;
+	kib_tx_t		*tx;
+	kib_tx_poolset_t	*tps;
+
+	tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+	node = kiblnd_pool_alloc_node(&tps->tps_poolset);
+	if (node == NULL)
+		return NULL;
+	tx = container_of(node, kib_tx_t, tx_list);
+
+	LASSERT (tx->tx_nwrq == 0);
+	LASSERT (!tx->tx_queued);
+	LASSERT (tx->tx_sending == 0);
+	LASSERT (!tx->tx_waiting);
+	LASSERT (tx->tx_status == 0);
+	LASSERT (tx->tx_conn == NULL);
+	LASSERT (tx->tx_lntmsg[0] == NULL);
+	LASSERT (tx->tx_lntmsg[1] == NULL);
+	LASSERT (tx->tx_u.pmr == NULL);
+	LASSERT (tx->tx_nfrags == 0);
+
+	return tx;
+}
+
+void
+kiblnd_drop_rx(kib_rx_t *rx)
+{
+	kib_conn_t		*conn	= rx->rx_conn;
+	struct kib_sched_info	*sched	= conn->ibc_sched;
+	unsigned long		flags;
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+	LASSERT(conn->ibc_nrx > 0);
+	conn->ibc_nrx--;
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+	kiblnd_conn_decref(conn);
+}
+
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+	kib_conn_t	 *conn = rx->rx_conn;
+	kib_net_t	  *net = conn->ibc_peer->ibp_ni->ni_data;
+	struct ib_recv_wr  *bad_wrq = NULL;
+	struct ib_mr       *mr;
+	int		 rc;
+
+	LASSERT (net != NULL);
+	LASSERT (!in_interrupt());
+	LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+		 credit == IBLND_POSTRX_PEER_CREDIT ||
+		 credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+	mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE);
+	LASSERT (mr != NULL);
+
+	rx->rx_sge.lkey   = mr->lkey;
+	rx->rx_sge.addr   = rx->rx_msgaddr;
+	rx->rx_sge.length = IBLND_MSG_SIZE;
+
+	rx->rx_wrq.next = NULL;
+	rx->rx_wrq.sg_list = &rx->rx_sge;
+	rx->rx_wrq.num_sge = 1;
+	rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+	LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+	LASSERT (rx->rx_nob >= 0);	      /* not posted */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		return 0;
+	}
+
+	rx->rx_nob = -1;			/* flag posted */
+
+	rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+	if (rc != 0) {
+		CERROR("Can't post rx for %s: %d, bad_wrq: %p\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq);
+		rx->rx_nob = 0;
+	}
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+		return rc;
+
+	if (rc != 0) {
+		kiblnd_close_conn(conn, rc);
+		kiblnd_drop_rx(rx);	     /* No more posts for this rx */
+		return rc;
+	}
+
+	if (credit == IBLND_POSTRX_NO_CREDIT)
+		return 0;
+
+	spin_lock(&conn->ibc_lock);
+	if (credit == IBLND_POSTRX_PEER_CREDIT)
+		conn->ibc_outstanding_credits++;
+	else
+		conn->ibc_reserved_credits++;
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+	return 0;
+}
+
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+	struct list_head   *tmp;
+
+	list_for_each(tmp, &conn->ibc_active_txs) {
+		kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+		LASSERT (!tx->tx_queued);
+		LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+		if (tx->tx_cookie != cookie)
+			continue;
+
+		if (tx->tx_waiting &&
+		    tx->tx_msg->ibm_type == txtype)
+			return tx;
+
+		CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+		      tx->tx_waiting ? "" : "NOT ",
+		      tx->tx_msg->ibm_type, txtype);
+	}
+	return NULL;
+}
+
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+	kib_tx_t    *tx;
+	lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+	int	  idle;
+
+	spin_lock(&conn->ibc_lock);
+
+	tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+	if (tx == NULL) {
+		spin_unlock(&conn->ibc_lock);
+
+		CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+		      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		kiblnd_close_conn(conn, -EPROTO);
+		return;
+	}
+
+	if (tx->tx_status == 0) {	       /* success so far */
+		if (status < 0) {	       /* failed? */
+			tx->tx_status = status;
+		} else if (txtype == IBLND_MSG_GET_REQ) {
+			lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+		}
+	}
+
+	tx->tx_waiting = 0;
+
+	idle = !tx->tx_queued && (tx->tx_sending == 0);
+	if (idle)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (idle)
+		kiblnd_tx_done(ni, tx);
+}
+
+void
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+	lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t    *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+
+	if (tx == NULL) {
+		CERROR("Can't get tx for completion %x for %s\n",
+		       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	tx->tx_msg->ibm_u.completion.ibcm_status = status;
+	tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+	kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+	kiblnd_queue_tx(tx, conn);
+}
+
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	int	   credits = msg->ibm_credits;
+	kib_tx_t     *tx;
+	int	   rc = 0;
+	int	   rc2;
+	int	   post_credit;
+
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	CDEBUG (D_NET, "Received %x[%d] from %s\n",
+		msg->ibm_type, credits,
+		libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+	if (credits != 0) {
+		/* Have I received credits that will let me send? */
+		spin_lock(&conn->ibc_lock);
+
+		if (conn->ibc_credits + credits >
+		    IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) {
+			rc2 = conn->ibc_credits;
+			spin_unlock(&conn->ibc_lock);
+
+			CERROR("Bad credits from %s: %d + %d > %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       rc2, credits,
+			       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+
+			kiblnd_close_conn(conn, -EPROTO);
+			kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+			return;
+		}
+
+		conn->ibc_credits += credits;
+
+		/* This ensures the credit taken by NOOP can be returned */
+		if (msg->ibm_type == IBLND_MSG_NOOP &&
+		    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+			conn->ibc_outstanding_credits++;
+
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_check_sends(conn);
+	}
+
+	switch (msg->ibm_type) {
+	default:
+		CERROR("Bad IBLND message type %x from %s\n",
+		       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		rc = -EPROTO;
+		break;
+
+	case IBLND_MSG_NOOP:
+		if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+			break;
+		}
+
+		if (credits != 0) /* credit already posted */
+			post_credit = IBLND_POSTRX_NO_CREDIT;
+		else	      /* a keepalive NOOP */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+				msg->ibm_srcnid, rx, 0);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+		CWARN ("PUT_NACK from %s\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_PUT_ACK:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+		spin_lock(&conn->ibc_lock);
+		tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+					msg->ibm_u.putack.ibpam_src_cookie);
+		if (tx != NULL)
+			list_del(&tx->tx_list);
+		spin_unlock(&conn->ibc_lock);
+
+		if (tx == NULL) {
+			CERROR("Unmatched PUT_ACK from %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			rc = -EPROTO;
+			break;
+		}
+
+		LASSERT (tx->tx_waiting);
+		/* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+		 * (a) I can overwrite tx_msg since my peer has received it!
+		 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+		tx->tx_nwrq = 0;		/* overwrite PUT_REQ */
+
+		rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE,
+				       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+				       &msg->ibm_u.putack.ibpam_rd,
+				       msg->ibm_u.putack.ibpam_dst_cookie);
+		if (rc2 < 0)
+			CERROR("Can't setup rdma for PUT to %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+		spin_lock(&conn->ibc_lock);
+		tx->tx_waiting = 0;	/* clear waiting and queue atomically */
+		kiblnd_queue_tx_locked(tx, conn);
+		spin_unlock(&conn->ibc_lock);
+		break;
+
+	case IBLND_MSG_PUT_DONE:
+		post_credit = IBLND_POSTRX_PEER_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		post_credit = IBLND_POSTRX_DONT_POST;
+		rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+				msg->ibm_srcnid, rx, 1);
+		if (rc < 0)		     /* repost on error */
+			post_credit = IBLND_POSTRX_PEER_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_DONE:
+		post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+		kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+					 msg->ibm_u.completion.ibcm_status,
+					 msg->ibm_u.completion.ibcm_cookie);
+		break;
+	}
+
+	if (rc < 0)			     /* protocol error */
+		kiblnd_close_conn(conn, rc);
+
+	if (post_credit != IBLND_POSTRX_DONT_POST)
+		kiblnd_post_rx(rx, post_credit);
+}
+
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+	kib_msg_t    *msg = rx->rx_msg;
+	kib_conn_t   *conn = rx->rx_conn;
+	lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+	kib_net_t    *net = ni->ni_data;
+	int	   rc;
+	int	   err = -EIO;
+
+	LASSERT (net != NULL);
+	LASSERT (rx->rx_nob < 0);	       /* was posted */
+	rx->rx_nob = 0;			 /* isn't now */
+
+	if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+		goto ignore;
+
+	if (status != IB_WC_SUCCESS) {
+		CNETERR("Rx from %s failed: %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+		goto failed;
+	}
+
+	LASSERT (nob >= 0);
+	rx->rx_nob = nob;
+
+	rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+	if (rc != 0) {
+		CERROR ("Error %d unpacking rx from %s\n",
+			rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		goto failed;
+	}
+
+	if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+	    msg->ibm_dstnid != ni->ni_nid ||
+	    msg->ibm_srcstamp != conn->ibc_incarnation ||
+	    msg->ibm_dststamp != net->ibn_incarnation) {
+		CERROR ("Stale rx from %s\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		err = -ESTALE;
+		goto failed;
+	}
+
+	/* set time last known alive */
+	kiblnd_peer_alive(conn->ibc_peer);
+
+	/* racing with connection establishment/teardown! */
+
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		rwlock_t  *g_lock = &kiblnd_data.kib_global_lock;
+		unsigned long  flags;
+
+		write_lock_irqsave(g_lock, flags);
+		/* must check holding global lock to eliminate race */
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+			list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+			write_unlock_irqrestore(g_lock, flags);
+			return;
+		}
+		write_unlock_irqrestore(g_lock, flags);
+	}
+	kiblnd_handle_rx(rx);
+	return;
+
+ failed:
+	CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+	kiblnd_close_conn(conn, err);
+ ignore:
+	kiblnd_drop_rx(rx);		     /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+	struct page *page;
+
+	if (vaddr >= VMALLOC_START &&
+	    vaddr < VMALLOC_END) {
+		page = vmalloc_to_page ((void *)vaddr);
+		LASSERT (page != NULL);
+		return page;
+	}
+#ifdef CONFIG_HIGHMEM
+	if (vaddr >= PKMAP_BASE &&
+	    vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+		/* No highmem pages only used for bulk (kiov) I/O */
+		CERROR("find page for address in highmem\n");
+		LBUG();
+	}
+#endif
+	page = virt_to_page (vaddr);
+	LASSERT (page != NULL);
+	return page;
+}
+
+static int
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	__u64			*pages = tx->tx_pages;
+	kib_fmr_poolset_t	*fps;
+	int			npages;
+	int			size;
+	int			cpt;
+	int			rc;
+	int			i;
+
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev  = tx->tx_pool->tpo_hdev;
+
+	for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+		for (size = 0; size <  rd->rd_frags[i].rf_nob;
+			       size += hdev->ibh_page_size) {
+			pages[npages ++] = (rd->rd_frags[i].rf_addr &
+					    hdev->ibh_page_mask) + size;
+		}
+	}
+
+	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+	fps = net->ibn_fmr_ps[cpt];
+	rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
+	if (rc != 0) {
+		CERROR ("Can't map %d pages: %d\n", npages, rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey :
+					 tx->tx_u.fmr.fmr_pfmr->fmr->lkey;
+	rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
+	rd->rd_frags[0].rf_nob   = nob;
+	rd->rd_nfrags = 1;
+
+	return 0;
+}
+
+static int
+kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+{
+	kib_hca_dev_t		*hdev;
+	kib_pmr_poolset_t	*pps;
+	__u64			iova;
+	int			cpt;
+	int			rc;
+
+	LASSERT(tx->tx_pool != NULL);
+	LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+	hdev = tx->tx_pool->tpo_hdev;
+
+	iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+
+	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+	pps = net->ibn_pmr_ps[cpt];
+	rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
+	if (rc != 0) {
+		CERROR("Failed to create MR by phybuf: %d\n", rc);
+		return rc;
+	}
+
+	/* If rd is not tx_rd, it's going to get sent to a peer, who will need
+	 * the rkey */
+	rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey :
+					 tx->tx_u.pmr->pmr_mr->lkey;
+	rd->rd_nfrags = 1;
+	rd->rd_frags[0].rf_addr = iova;
+	rd->rd_frags[0].rf_nob  = nob;
+
+	return 0;
+}
+
+void
+kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
+{
+	kib_net_t  *net = ni->ni_data;
+
+	LASSERT(net != NULL);
+
+	if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+		kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+		tx->tx_u.fmr.fmr_pfmr = NULL;
+
+	} else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+		kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+		tx->tx_u.pmr = NULL;
+	}
+
+	if (tx->tx_nfrags != 0) {
+		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
+				    tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+		tx->tx_nfrags = 0;
+	}
+}
+
+int
+kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx,
+	      kib_rdma_desc_t *rd, int nfrags)
+{
+	kib_hca_dev_t      *hdev  = tx->tx_pool->tpo_hdev;
+	kib_net_t	  *net   = ni->ni_data;
+	struct ib_mr       *mr    = NULL;
+	__u32	       nob;
+	int		 i;
+
+	/* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+	 * RDMA sink */
+	tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	tx->tx_nfrags = nfrags;
+
+	rd->rd_nfrags =
+		kiblnd_dma_map_sg(hdev->ibh_ibdev,
+				  tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+
+	for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
+		rd->rd_frags[i].rf_nob  = kiblnd_sg_dma_len(
+			hdev->ibh_ibdev, &tx->tx_frags[i]);
+		rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address(
+			hdev->ibh_ibdev, &tx->tx_frags[i]);
+		nob += rd->rd_frags[i].rf_nob;
+	}
+
+	/* looking for pre-mapping MR */
+	mr = kiblnd_find_rd_dma_mr(hdev, rd);
+	if (mr != NULL) {
+		/* found pre-mapping MR */
+		rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey;
+		return 0;
+	}
+
+	if (net->ibn_fmr_ps != NULL)
+		return kiblnd_fmr_map_tx(net, tx, rd, nob);
+	else if (net->ibn_pmr_ps != NULL)
+		return kiblnd_pmr_map_tx(net, tx, rd, nob);
+
+	return -EINVAL;
+}
+
+
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		    unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+	kib_net_t	  *net = ni->ni_data;
+	struct page	*page;
+	struct scatterlist *sg;
+	unsigned long       vaddr;
+	int		 fragnob;
+	int		 page_offset;
+
+	LASSERT (nob > 0);
+	LASSERT (niov > 0);
+	LASSERT (net != NULL);
+
+	while (offset >= iov->iov_len) {
+		offset -= iov->iov_len;
+		niov--;
+		iov++;
+		LASSERT (niov > 0);
+	}
+
+	sg = tx->tx_frags;
+	do {
+		LASSERT (niov > 0);
+
+		vaddr = ((unsigned long)iov->iov_base) + offset;
+		page_offset = vaddr & (PAGE_SIZE - 1);
+		page = kiblnd_kvaddr_to_page(vaddr);
+		if (page == NULL) {
+			CERROR ("Can't find page\n");
+			return -EFAULT;
+		}
+
+		fragnob = min((int)(iov->iov_len - offset), nob);
+		fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+		sg_set_page(sg, page, fragnob, page_offset);
+		sg++;
+
+		if (offset + fragnob < iov->iov_len) {
+			offset += fragnob;
+		} else {
+			offset = 0;
+			iov++;
+			niov--;
+		}
+		nob -= fragnob;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+		      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+	kib_net_t	  *net = ni->ni_data;
+	struct scatterlist *sg;
+	int		 fragnob;
+
+	CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+	LASSERT (nob > 0);
+	LASSERT (nkiov > 0);
+	LASSERT (net != NULL);
+
+	while (offset >= kiov->kiov_len) {
+		offset -= kiov->kiov_len;
+		nkiov--;
+		kiov++;
+		LASSERT (nkiov > 0);
+	}
+
+	sg = tx->tx_frags;
+	do {
+		LASSERT (nkiov > 0);
+
+		fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+		sg_set_page(sg, kiov->kiov_page, fragnob,
+			    kiov->kiov_offset + offset);
+		sg++;
+
+		offset = 0;
+		kiov++;
+		nkiov--;
+		nob -= fragnob;
+	} while (nob > 0);
+
+	return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags);
+}
+
+int
+kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
+{
+	kib_msg_t	 *msg = tx->tx_msg;
+	kib_peer_t	*peer = conn->ibc_peer;
+	int		ver = conn->ibc_version;
+	int		rc;
+	int		done;
+	struct ib_send_wr *bad_wrq;
+
+	LASSERT (tx->tx_queued);
+	/* We rely on this for QP sizing */
+	LASSERT (tx->tx_nwrq > 0);
+	LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver));
+
+	LASSERT (credit == 0 || credit == 1);
+	LASSERT (conn->ibc_outstanding_credits >= 0);
+	LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+	LASSERT (conn->ibc_credits >= 0);
+	LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver));
+
+	if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+		/* tx completions outstanding... */
+		CDEBUG(D_NET, "%s: posted enough\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && conn->ibc_credits == 0) {   /* no credits */
+		CDEBUG(D_NET, "%s: no credits\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
+	    conn->ibc_credits == 1 &&   /* last credit reserved */
+	    msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
+		CDEBUG(D_NET, "%s: not using last credit\n",
+		       libcfs_nid2str(peer->ibp_nid));
+		return -EAGAIN;
+	}
+
+	/* NB don't drop ibc_lock before bumping tx_sending */
+	list_del(&tx->tx_list);
+	tx->tx_queued = 0;
+
+	if (msg->ibm_type == IBLND_MSG_NOOP &&
+	    (!kiblnd_need_noop(conn) ||     /* redundant NOOP */
+	     (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
+	      conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
+		/* OK to drop when posted enough NOOPs, since
+		 * kiblnd_check_sends will queue NOOP again when
+		 * posted NOOPs complete */
+		spin_unlock(&conn->ibc_lock);
+		kiblnd_tx_done(peer->ibp_ni, tx);
+		spin_lock(&conn->ibc_lock);
+		CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       conn->ibc_noops_posted);
+		return 0;
+	}
+
+	kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits,
+			peer->ibp_nid, conn->ibc_incarnation);
+
+	conn->ibc_credits -= credit;
+	conn->ibc_outstanding_credits = 0;
+	conn->ibc_nsends_posted++;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted++;
+
+	/* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+	 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+	 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+	 * and then re-queued here.  It's (just) possible that
+	 * tx_sending is non-zero if we've not done the tx_complete()
+	 * from the first send; hence the ++ rather than = below. */
+	tx->tx_sending++;
+	list_add(&tx->tx_list, &conn->ibc_active_txs);
+
+	/* I'm still holding ibc_lock! */
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+		rc = -ECONNABORTED;
+	} else if (tx->tx_pool->tpo_pool.po_failed ||
+		 conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+		/* close_conn will launch failover */
+		rc = -ENETDOWN;
+	} else {
+		rc = ib_post_send(conn->ibc_cmid->qp,
+				  tx->tx_wrq, &bad_wrq);
+	}
+
+	conn->ibc_last_send = jiffies;
+
+	if (rc == 0)
+		return 0;
+
+	/* NB credits are transferred in the actual
+	 * message, which can only be the last work item */
+	conn->ibc_credits += credit;
+	conn->ibc_outstanding_credits += msg->ibm_credits;
+	conn->ibc_nsends_posted--;
+	if (msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+
+	tx->tx_status = rc;
+	tx->tx_waiting = 0;
+	tx->tx_sending--;
+
+	done = (tx->tx_sending == 0);
+	if (done)
+		list_del(&tx->tx_list);
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+		CERROR("Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+	else
+		CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+		       rc, libcfs_nid2str(peer->ibp_nid));
+
+	kiblnd_close_conn(conn, rc);
+
+	if (done)
+		kiblnd_tx_done(peer->ibp_ni, tx);
+
+	spin_lock(&conn->ibc_lock);
+
+	return -EIO;
+}
+
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+	int	ver = conn->ibc_version;
+	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+	kib_tx_t  *tx;
+
+	/* Don't send anything until after the connection is established */
+	if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+		CDEBUG(D_NET, "%s too soon\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+	LASSERT (!IBLND_OOB_CAPABLE(ver) ||
+		 conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
+	LASSERT (conn->ibc_reserved_credits >= 0);
+
+	while (conn->ibc_reserved_credits > 0 &&
+	       !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+		tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+				    kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+		list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+		conn->ibc_reserved_credits--;
+	}
+
+	if (kiblnd_need_noop(conn)) {
+		spin_unlock(&conn->ibc_lock);
+
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx != NULL)
+			kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+		spin_lock(&conn->ibc_lock);
+		if (tx != NULL)
+			kiblnd_queue_tx_locked(tx, conn);
+	}
+
+	kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */
+
+	for (;;) {
+		int credit;
+
+		if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+			credit = 0;
+			tx = list_entry(conn->ibc_tx_queue_nocred.next,
+					    kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_noops)) {
+			LASSERT (!IBLND_OOB_CAPABLE(ver));
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_noops.next,
+					kib_tx_t, tx_list);
+		} else if (!list_empty(&conn->ibc_tx_queue)) {
+			credit = 1;
+			tx = list_entry(conn->ibc_tx_queue.next,
+					    kib_tx_t, tx_list);
+		} else
+			break;
+
+		if (kiblnd_post_tx_locked(conn, tx, credit) != 0)
+			break;
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_conn_decref(conn); /* ...until here */
+}
+
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+	int	   failed = (status != IB_WC_SUCCESS);
+	kib_conn_t   *conn = tx->tx_conn;
+	int	   idle;
+
+	LASSERT (tx->tx_sending > 0);
+
+	if (failed) {
+		if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+			CNETERR("Tx -> %s cookie "LPX64
+				" sending %d waiting %d: failed %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+				status);
+
+		kiblnd_close_conn(conn, -EIO);
+	} else {
+		kiblnd_peer_alive(conn->ibc_peer);
+	}
+
+	spin_lock(&conn->ibc_lock);
+
+	/* I could be racing with rdma completion.  Whoever makes 'tx' idle
+	 * gets to free it, which also drops its ref on 'conn'. */
+
+	tx->tx_sending--;
+	conn->ibc_nsends_posted--;
+	if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP)
+		conn->ibc_noops_posted--;
+
+	if (failed) {
+		tx->tx_waiting = 0;	     /* don't wait for peer */
+		tx->tx_status = -EIO;
+	}
+
+	idle = (tx->tx_sending == 0) &&	 /* This is the final callback */
+	       !tx->tx_waiting &&	       /* Not waiting for peer */
+	       !tx->tx_queued;		  /* Not re-queued (PUT_DONE) */
+	if (idle)
+		list_del(&tx->tx_list);
+
+	kiblnd_conn_addref(conn);	       /* 1 ref for me.... */
+
+	spin_unlock(&conn->ibc_lock);
+
+	if (idle)
+		kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+	kiblnd_check_sends(conn);
+
+	kiblnd_conn_decref(conn);	       /* ...until here */
+}
+
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+	kib_hca_dev_t     *hdev = tx->tx_pool->tpo_hdev;
+	struct ib_sge     *sge = &tx->tx_sge[tx->tx_nwrq];
+	struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq];
+	int		nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+	struct ib_mr      *mr;
+
+	LASSERT (tx->tx_nwrq >= 0);
+	LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+	LASSERT (nob <= IBLND_MSG_SIZE);
+
+	kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+	mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob);
+	LASSERT (mr != NULL);
+
+	sge->lkey   = mr->lkey;
+	sge->addr   = tx->tx_msgaddr;
+	sge->length = nob;
+
+	memset(wrq, 0, sizeof(*wrq));
+
+	wrq->next       = NULL;
+	wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+	wrq->sg_list    = sge;
+	wrq->num_sge    = 1;
+	wrq->opcode     = IB_WR_SEND;
+	wrq->send_flags = IB_SEND_SIGNALED;
+
+	tx->tx_nwrq++;
+}
+
+int
+kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type,
+		  int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+	kib_msg_t	 *ibmsg = tx->tx_msg;
+	kib_rdma_desc_t   *srcrd = tx->tx_rd;
+	struct ib_sge     *sge = &tx->tx_sge[0];
+	struct ib_send_wr *wrq = &tx->tx_wrq[0];
+	int		rc  = resid;
+	int		srcidx;
+	int		dstidx;
+	int		wrknob;
+
+	LASSERT (!in_interrupt());
+	LASSERT (tx->tx_nwrq == 0);
+	LASSERT (type == IBLND_MSG_GET_DONE ||
+		 type == IBLND_MSG_PUT_DONE);
+
+	srcidx = dstidx = 0;
+
+	while (resid > 0) {
+		if (srcidx >= srcrd->rd_nfrags) {
+			CERROR("Src buffer exhausted: %d frags\n", srcidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (dstidx == dstrd->rd_nfrags) {
+			CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) {
+			CERROR("RDMA too fragmented for %s (%d): "
+			       "%d/%d src %d/%d dst frags\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+			       IBLND_RDMA_FRAGS(conn->ibc_version),
+			       srcidx, srcrd->rd_nfrags,
+			       dstidx, dstrd->rd_nfrags);
+			rc = -EMSGSIZE;
+			break;
+		}
+
+		wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx),
+				 kiblnd_rd_frag_size(dstrd, dstidx)), resid);
+
+		sge = &tx->tx_sge[tx->tx_nwrq];
+		sge->addr   = kiblnd_rd_frag_addr(srcrd, srcidx);
+		sge->lkey   = kiblnd_rd_frag_key(srcrd, srcidx);
+		sge->length = wrknob;
+
+		wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+		wrq->next       = wrq + 1;
+		wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+		wrq->sg_list    = sge;
+		wrq->num_sge    = 1;
+		wrq->opcode     = IB_WR_RDMA_WRITE;
+		wrq->send_flags = 0;
+
+		wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx);
+		wrq->wr.rdma.rkey	= kiblnd_rd_frag_key(dstrd, dstidx);
+
+		srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob);
+		dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob);
+
+		resid -= wrknob;
+
+		tx->tx_nwrq++;
+		wrq++;
+		sge++;
+	}
+
+	if (rc < 0)			     /* no RDMA if completing with failure */
+		tx->tx_nwrq = 0;
+
+	ibmsg->ibm_u.completion.ibcm_status = rc;
+	ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+	kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx,
+			   type, sizeof (kib_completion_msg_t));
+
+	return rc;
+}
+
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+	struct list_head   *q;
+
+	LASSERT (tx->tx_nwrq > 0);	      /* work items set up */
+	LASSERT (!tx->tx_queued);	       /* not queued for sending already */
+	LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	tx->tx_queued = 1;
+	tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+	if (tx->tx_conn == NULL) {
+		kiblnd_conn_addref(conn);
+		tx->tx_conn = conn;
+		LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+	} else {
+		/* PUT_DONE first attached to conn as a PUT_REQ */
+		LASSERT (tx->tx_conn == conn);
+		LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+	}
+
+	switch (tx->tx_msg->ibm_type) {
+	default:
+		LBUG();
+
+	case IBLND_MSG_PUT_REQ:
+	case IBLND_MSG_GET_REQ:
+		q = &conn->ibc_tx_queue_rsrvd;
+		break;
+
+	case IBLND_MSG_PUT_NAK:
+	case IBLND_MSG_PUT_ACK:
+	case IBLND_MSG_PUT_DONE:
+	case IBLND_MSG_GET_DONE:
+		q = &conn->ibc_tx_queue_nocred;
+		break;
+
+	case IBLND_MSG_NOOP:
+		if (IBLND_OOB_CAPABLE(conn->ibc_version))
+			q = &conn->ibc_tx_queue_nocred;
+		else
+			q = &conn->ibc_tx_noops;
+		break;
+
+	case IBLND_MSG_IMMEDIATE:
+		q = &conn->ibc_tx_queue;
+		break;
+	}
+
+	list_add_tail(&tx->tx_list, q);
+}
+
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+	spin_lock(&conn->ibc_lock);
+	kiblnd_queue_tx_locked(tx, conn);
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+}
+
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+			       struct sockaddr_in *srcaddr,
+			       struct sockaddr_in *dstaddr,
+			       int timeout_ms)
+{
+	unsigned short port;
+	int rc;
+
+	/* allow the port to be reused */
+	rc = rdma_set_reuseaddr(cmid, 1);
+	if (rc != 0) {
+		CERROR("Unable to set reuse on cmid: %d\n", rc);
+		return rc;
+	}
+
+	/* look for a free privileged port */
+	for (port = PROT_SOCK-1; port > 0; port--) {
+		srcaddr->sin_port = htons(port);
+		rc = rdma_resolve_addr(cmid,
+				       (struct sockaddr *)srcaddr,
+				       (struct sockaddr *)dstaddr,
+				       timeout_ms);
+		if (rc == 0) {
+			CDEBUG(D_NET, "bound to port %hu\n", port);
+			return 0;
+		} else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+			CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+			       port, rc);
+		} else {
+			return rc;
+		}
+	}
+
+	CERROR("Failed to bind to a free privileged port\n");
+	return rc;
+}
+
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+	struct rdma_cm_id *cmid;
+	kib_dev_t	 *dev;
+	kib_net_t	 *net = peer->ibp_ni->ni_data;
+	struct sockaddr_in srcaddr;
+	struct sockaddr_in dstaddr;
+	int		rc;
+
+	LASSERT (net != NULL);
+	LASSERT (peer->ibp_connecting > 0);
+
+	cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+				     IB_QPT_RC);
+
+	if (IS_ERR(cmid)) {
+		CERROR("Can't create CMID for %s: %ld\n",
+		       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+		rc = PTR_ERR(cmid);
+		goto failed;
+	}
+
+	dev = net->ibn_dev;
+	memset(&srcaddr, 0, sizeof(srcaddr));
+	srcaddr.sin_family = AF_INET;
+	srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip);
+
+	memset(&dstaddr, 0, sizeof(dstaddr));
+	dstaddr.sin_family = AF_INET;
+	dstaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+	dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+	kiblnd_peer_addref(peer);	       /* cmid's ref */
+
+	if (*kiblnd_tunables.kib_use_priv_port) {
+		rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+					 *kiblnd_tunables.kib_timeout * 1000);
+	} else {
+		rc = rdma_resolve_addr(cmid,
+				       (struct sockaddr *)&srcaddr,
+				       (struct sockaddr *)&dstaddr,
+				       *kiblnd_tunables.kib_timeout * 1000);
+	}
+	if (rc != 0) {
+		/* Can't initiate address resolution:  */
+		CERROR("Can't resolve addr for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+		goto failed2;
+	}
+
+	LASSERT (cmid->device != NULL);
+	CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+	       libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+	       HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+	return;
+
+ failed2:
+	kiblnd_peer_decref(peer);	       /* cmid's ref */
+	rdma_destroy_id(cmid);
+ failed:
+	kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+	kib_peer_t	*peer;
+	kib_peer_t	*peer2;
+	kib_conn_t	*conn;
+	rwlock_t	*g_lock = &kiblnd_data.kib_global_lock;
+	unsigned long      flags;
+	int		rc;
+
+	/* If I get here, I've committed to send, so I complete the tx with
+	 * failure on any problems */
+
+	LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */
+	LASSERT (tx == NULL || tx->tx_nwrq > 0);     /* work items have been set up */
+
+	/* First time, just use a read lock since I expect to find my peer
+	 * connected */
+	read_lock_irqsave(g_lock, flags);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+		/* Found a peer with an established connection */
+		conn = kiblnd_get_conn_locked(peer);
+		kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+		read_unlock_irqrestore(g_lock, flags);
+
+		if (tx != NULL)
+			kiblnd_queue_tx(tx, conn);
+		kiblnd_conn_decref(conn); /* ...to here */
+		return;
+	}
+
+	read_unlock(g_lock);
+	/* Re-try with a write lock */
+	write_lock(g_lock);
+
+	peer = kiblnd_find_peer_locked(nid);
+	if (peer != NULL) {
+		if (list_empty(&peer->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer->ibp_connecting != 0 ||
+				 peer->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+		return;
+	}
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	/* Allocate a peer ready to add to the peer table and retry */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+		if (tx != NULL) {
+			tx->tx_status = -EHOSTUNREACH;
+			tx->tx_waiting = 0;
+			kiblnd_tx_done(ni, tx);
+		}
+		return;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (list_empty(&peer2->ibp_conns)) {
+			/* found a peer, but it's still connecting... */
+			LASSERT (peer2->ibp_connecting != 0 ||
+				 peer2->ibp_accepting != 0);
+			if (tx != NULL)
+				list_add_tail(&tx->tx_list,
+						  &peer2->ibp_tx_queue);
+			write_unlock_irqrestore(g_lock, flags);
+		} else {
+			conn = kiblnd_get_conn_locked(peer2);
+			kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+			write_unlock_irqrestore(g_lock, flags);
+
+			if (tx != NULL)
+				kiblnd_queue_tx(tx, conn);
+			kiblnd_conn_decref(conn); /* ...to here */
+		}
+
+		kiblnd_peer_decref(peer);
+		return;
+	}
+
+	/* Brand new peer */
+	LASSERT (peer->ibp_connecting == 0);
+	peer->ibp_connecting = 1;
+
+	/* always called with a ref on ni, which prevents ni being shutdown */
+	LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
+	if (tx != NULL)
+		list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+	kiblnd_peer_addref(peer);
+	list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+	write_unlock_irqrestore(g_lock, flags);
+
+	kiblnd_connect_peer(peer);
+	kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+	int	       type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	int	       target_is_router = lntmsg->msg_target_is_router;
+	int	       routing = lntmsg->msg_routing;
+	unsigned int      payload_niov = lntmsg->msg_niov;
+	struct iovec     *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+	unsigned int      payload_offset = lntmsg->msg_offset;
+	unsigned int      payload_nob = lntmsg->msg_len;
+	kib_msg_t	*ibmsg;
+	kib_tx_t	 *tx;
+	int	       nob;
+	int	       rc;
+
+	/* NB 'private' is different depending on what we're sending.... */
+
+	CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT (payload_nob == 0 || payload_niov > 0);
+	LASSERT (payload_niov <= LNET_MAX_IOV);
+
+	/* Thread context */
+	LASSERT (!in_interrupt());
+	/* payload is either all vaddrs or all pages */
+	LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+	switch (type) {
+	default:
+		LBUG();
+		return (-EIO);
+
+	case LNET_MSG_ACK:
+		LASSERT (payload_nob == 0);
+		break;
+
+	case LNET_MSG_GET:
+		if (routing || target_is_router)
+			break;		  /* send IMMEDIATE */
+
+		/* is the REPLY message too small for RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate txd for GET to %s\n",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+
+		ibmsg = tx->tx_msg;
+
+		if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &ibmsg->ibm_u.get.ibgm_rd,
+						 lntmsg->msg_md->md_niov,
+						 lntmsg->msg_md->md_iov.iov,
+						 0, lntmsg->msg_md->md_length);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &ibmsg->ibm_u.get.ibgm_rd,
+						  lntmsg->msg_md->md_niov,
+						  lntmsg->msg_md->md_iov.kiov,
+						  0, lntmsg->msg_md->md_length);
+		if (rc != 0) {
+			CERROR("Can't setup GET sink for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+		ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+		ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+		tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+		if (tx->tx_lntmsg[1] == NULL) {
+			CERROR("Can't create reply for GET -> %s\n",
+			       libcfs_nid2str(target.nid));
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+		tx->tx_waiting = 1;	     /* waiting for GET_DONE */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+
+	case LNET_MSG_REPLY:
+	case LNET_MSG_PUT:
+		/* Is the payload small enough not to need RDMA? */
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+		if (nob <= IBLND_MSG_SIZE)
+			break;		  /* send IMMEDIATE */
+
+		tx = kiblnd_get_idle_tx(ni, target.nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate %s txd for %s\n",
+			       type == LNET_MSG_PUT ? "PUT" : "REPLY",
+			       libcfs_nid2str(target.nid));
+			return -ENOMEM;
+		}
+
+		if (payload_kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+						 payload_niov, payload_iov,
+						 payload_offset, payload_nob);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+						  payload_niov, payload_kiov,
+						  payload_offset, payload_nob);
+		if (rc != 0) {
+			CERROR("Can't setup PUT src for %s: %d\n",
+			       libcfs_nid2str(target.nid), rc);
+			kiblnd_tx_done(ni, tx);
+			return -EIO;
+		}
+
+		ibmsg = tx->tx_msg;
+		ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+		ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_{ACK,NAK} */
+		kiblnd_launch_tx(ni, tx, target.nid);
+		return 0;
+	}
+
+	/* send IMMEDIATE */
+
+	LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+		 <= IBLND_MSG_SIZE);
+
+	tx = kiblnd_get_idle_tx(ni, target.nid);
+	if (tx == NULL) {
+		CERROR ("Can't send %d to %s: tx descs exhausted\n",
+			type, libcfs_nid2str(target.nid));
+		return -ENOMEM;
+	}
+
+	ibmsg = tx->tx_msg;
+	ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+	if (payload_kiov != NULL)
+		lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+				    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				    payload_niov, payload_kiov,
+				    payload_offset, payload_nob);
+	else
+		lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+				   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+				   payload_niov, payload_iov,
+				   payload_offset, payload_nob);
+
+	nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+	kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+	tx->tx_lntmsg[0] = lntmsg;	      /* finalise lntmsg on completion */
+	kiblnd_launch_tx(ni, tx, target.nid);
+	return 0;
+}
+
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+	lnet_process_id_t target = lntmsg->msg_target;
+	unsigned int      niov = lntmsg->msg_niov;
+	struct iovec     *iov = lntmsg->msg_iov;
+	lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+	unsigned int      offset = lntmsg->msg_offset;
+	unsigned int      nob = lntmsg->msg_len;
+	kib_tx_t	 *tx;
+	int	       rc;
+
+	tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
+	if (tx == NULL) {
+		CERROR("Can't get tx for REPLY to %s\n",
+		       libcfs_nid2str(target.nid));
+		goto failed_0;
+	}
+
+	if (nob == 0)
+		rc = 0;
+	else if (kiov == NULL)
+		rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+					 niov, iov, offset, nob);
+	else
+		rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+					  niov, kiov, offset, nob);
+
+	if (rc != 0) {
+		CERROR("Can't setup GET src for %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	rc = kiblnd_init_rdma(rx->rx_conn, tx,
+			      IBLND_MSG_GET_DONE, nob,
+			      &rx->rx_msg->ibm_u.get.ibgm_rd,
+			      rx->rx_msg->ibm_u.get.ibgm_cookie);
+	if (rc < 0) {
+		CERROR("Can't setup rdma for GET from %s: %d\n",
+		       libcfs_nid2str(target.nid), rc);
+		goto failed_1;
+	}
+
+	if (nob == 0) {
+		/* No RDMA: local completion may happen now! */
+		lnet_finalize(ni, lntmsg, 0);
+	} else {
+		/* RDMA: lnet_finalize(lntmsg) when it
+		 * completes */
+		tx->tx_lntmsg[0] = lntmsg;
+	}
+
+	kiblnd_queue_tx(tx, rx->rx_conn);
+	return;
+
+ failed_1:
+	kiblnd_tx_done(ni, tx);
+ failed_0:
+	lnet_finalize(ni, lntmsg, -EIO);
+}
+
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+	     unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	kib_rx_t    *rx = private;
+	kib_msg_t   *rxmsg = rx->rx_msg;
+	kib_conn_t  *conn = rx->rx_conn;
+	kib_tx_t    *tx;
+	kib_msg_t   *txmsg;
+	int	  nob;
+	int	  post_credit = IBLND_POSTRX_PEER_CREDIT;
+	int	  rc = 0;
+
+	LASSERT (mlen <= rlen);
+	LASSERT (!in_interrupt());
+	/* Either all pages or all vaddrs */
+	LASSERT (!(kiov != NULL && iov != NULL));
+
+	switch (rxmsg->ibm_type) {
+	default:
+		LBUG();
+
+	case IBLND_MSG_IMMEDIATE:
+		nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+		if (nob > rx->rx_nob) {
+			CERROR ("Immediate message from %s too big: %d(%d)\n",
+				libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+				nob, rx->rx_nob);
+			rc = -EPROTO;
+			break;
+		}
+
+		if (kiov != NULL)
+			lnet_copy_flat2kiov(niov, kiov, offset,
+					    IBLND_MSG_SIZE, rxmsg,
+					    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					    mlen);
+		else
+			lnet_copy_flat2iov(niov, iov, offset,
+					   IBLND_MSG_SIZE, rxmsg,
+					   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+					   mlen);
+		lnet_finalize (ni, lntmsg, 0);
+		break;
+
+	case IBLND_MSG_PUT_REQ:
+		if (mlen == 0) {
+			lnet_finalize(ni, lntmsg, 0);
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+
+		tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
+		if (tx == NULL) {
+			CERROR("Can't allocate tx for %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			/* Not replying will break the connection */
+			rc = -ENOMEM;
+			break;
+		}
+
+		txmsg = tx->tx_msg;
+		if (kiov == NULL)
+			rc = kiblnd_setup_rd_iov(ni, tx,
+						 &txmsg->ibm_u.putack.ibpam_rd,
+						 niov, iov, offset, mlen);
+		else
+			rc = kiblnd_setup_rd_kiov(ni, tx,
+						  &txmsg->ibm_u.putack.ibpam_rd,
+						  niov, kiov, offset, mlen);
+		if (rc != 0) {
+			CERROR("Can't setup PUT sink for %s: %d\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+			kiblnd_tx_done(ni, tx);
+			/* tell peer it's over */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+					       rxmsg->ibm_u.putreq.ibprm_cookie);
+			break;
+		}
+
+		nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+		txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+		txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+
+		kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+		tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+		tx->tx_waiting = 1;	     /* waiting for PUT_DONE */
+		kiblnd_queue_tx(tx, conn);
+
+		/* reposted buffer reserved for PUT_DONE */
+		post_credit = IBLND_POSTRX_NO_CREDIT;
+		break;
+
+	case IBLND_MSG_GET_REQ:
+		if (lntmsg != NULL) {
+			/* Optimized GET; RDMA lntmsg's payload */
+			kiblnd_reply(ni, rx, lntmsg);
+		} else {
+			/* GET didn't match anything */
+			kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+					       -ENODATA,
+					       rxmsg->ibm_u.get.ibgm_cookie);
+		}
+		break;
+	}
+
+	kiblnd_post_rx(rx, post_credit);
+	return rc;
+}
+
+int
+kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	task_t *task = kthread_run(fn, arg, name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	atomic_inc(&kiblnd_data.kib_nthreads);
+	return 0;
+}
+
+void
+kiblnd_thread_fini (void)
+{
+	atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+	/* This is racy, but everyone's only writing cfs_time_current() */
+	peer->ibp_last_alive = cfs_time_current();
+	mb();
+}
+
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+	int	   error = 0;
+	cfs_time_t    last_alive = 0;
+	unsigned long flags;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (list_empty(&peer->ibp_conns) &&
+	    peer->ibp_accepting == 0 &&
+	    peer->ibp_connecting == 0 &&
+	    peer->ibp_error != 0) {
+		error = peer->ibp_error;
+		peer->ibp_error = 0;
+
+		last_alive = peer->ibp_last_alive;
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (error != 0)
+		lnet_notify(peer->ibp_ni,
+			    peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+	/* This just does the immediate housekeeping.  'error' is zero for a
+	 * normal shutdown which can happen only after the connection has been
+	 * established.  If the connection is established, schedule the
+	 * connection to be finished off by the connd.  Otherwise the connd is
+	 * already dealing with it (either to set it up or tear it down).
+	 * Caller holds kib_global_lock exclusively in irq context */
+	kib_peer_t       *peer = conn->ibc_peer;
+	kib_dev_t	*dev;
+	unsigned long     flags;
+
+	LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	if (error != 0 && conn->ibc_comms_error == 0)
+		conn->ibc_comms_error = error;
+
+	if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+		return; /* already being handled  */
+
+	if (error == 0 &&
+	    list_empty(&conn->ibc_tx_noops) &&
+	    list_empty(&conn->ibc_tx_queue) &&
+	    list_empty(&conn->ibc_tx_queue_rsrvd) &&
+	    list_empty(&conn->ibc_tx_queue_nocred) &&
+	    list_empty(&conn->ibc_active_txs)) {
+		CDEBUG(D_NET, "closing conn to %s\n",
+		       libcfs_nid2str(peer->ibp_nid));
+	} else {
+		CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
+		       libcfs_nid2str(peer->ibp_nid), error,
+		       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+		       list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
+		       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+		       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+		       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+	}
+
+	dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev;
+	list_del(&conn->ibc_list);
+	/* connd (see below) takes over ibc_list's ref */
+
+	if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+	    kiblnd_peer_active(peer)) {	 /* still in peer table */
+		kiblnd_unlink_peer_locked(peer);
+
+		/* set/clear error on last conn */
+		peer->ibp_error = conn->ibc_comms_error;
+	}
+
+	kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+	if (error != 0 &&
+	    kiblnd_dev_can_failover(dev)) {
+		list_add_tail(&dev->ibd_fail_list,
+			      &kiblnd_data.kib_failed_devs);
+		wake_up(&kiblnd_data.kib_failover_waitq);
+	}
+
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+	wake_up(&kiblnd_data.kib_connd_waitq);
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+void
+kiblnd_close_conn(kib_conn_t *conn, int error)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_close_conn_locked(conn, error);
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+	unsigned long    flags;
+	kib_rx_t	*rx;
+
+	LASSERT(!in_interrupt());
+	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	while (!list_empty(&conn->ibc_early_rxs)) {
+		rx = list_entry(conn->ibc_early_rxs.next,
+				    kib_rx_t, rx_list);
+		list_del(&rx->rx_list);
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		kiblnd_handle_rx(rx);
+
+		write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	}
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+	LIST_HEAD       (zombies);
+	struct list_head	  *tmp;
+	struct list_head	  *nxt;
+	kib_tx_t	    *tx;
+
+	spin_lock(&conn->ibc_lock);
+
+	list_for_each_safe (tmp, nxt, txs) {
+		tx = list_entry (tmp, kib_tx_t, tx_list);
+
+		if (txs == &conn->ibc_active_txs) {
+			LASSERT (!tx->tx_queued);
+			LASSERT (tx->tx_waiting ||
+				 tx->tx_sending != 0);
+		} else {
+			LASSERT (tx->tx_queued);
+		}
+
+		tx->tx_status = -ECONNABORTED;
+		tx->tx_waiting = 0;
+
+		if (tx->tx_sending == 0) {
+			tx->tx_queued = 0;
+			list_del (&tx->tx_list);
+			list_add (&tx->tx_list, &zombies);
+		}
+	}
+
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED);
+}
+
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+	kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+	/* abort_receives moves QP state to IB_QPS_ERR.  This is only required
+	 * for connections that didn't get as far as being connected, because
+	 * rdma_disconnect() does this for free. */
+	kiblnd_abort_receives(conn);
+
+	/* Complete all tx descs not waiting for sends to complete.
+	 * NB we should be safe from RDMA now that the QP has changed state */
+
+	kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
+	kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+	kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+	kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+	kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+	kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+	LIST_HEAD    (zombies);
+	unsigned long     flags;
+
+	LASSERT (error != 0);
+	LASSERT (!in_interrupt());
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	if (active) {
+		LASSERT (peer->ibp_connecting > 0);
+		peer->ibp_connecting--;
+	} else {
+		LASSERT (peer->ibp_accepting > 0);
+		peer->ibp_accepting--;
+	}
+
+	if (peer->ibp_connecting != 0 ||
+	    peer->ibp_accepting != 0) {
+		/* another connection attempt under way... */
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+					    flags);
+		return;
+	}
+
+	if (list_empty(&peer->ibp_conns)) {
+		/* Take peer's blocked transmits to complete with error */
+		list_add(&zombies, &peer->ibp_tx_queue);
+		list_del_init(&peer->ibp_tx_queue);
+
+		if (kiblnd_peer_active(peer))
+			kiblnd_unlink_peer_locked(peer);
+
+		peer->ibp_error = error;
+	} else {
+		/* Can't have blocked transmits if there are connections */
+		LASSERT (list_empty(&peer->ibp_tx_queue));
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	kiblnd_peer_notify(peer);
+
+	if (list_empty (&zombies))
+		return;
+
+	CNETERR("Deleting messages for %s: connection failed\n",
+		libcfs_nid2str(peer->ibp_nid));
+
+	kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+	kib_peer_t	*peer = conn->ibc_peer;
+	kib_tx_t	  *tx;
+	struct list_head	 txs;
+	unsigned long      flags;
+	int		active;
+
+	active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n",
+	       libcfs_nid2str(peer->ibp_nid), active,
+	       conn->ibc_version, status);
+
+	LASSERT (!in_interrupt());
+	LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+		  peer->ibp_connecting > 0) ||
+		 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+		  peer->ibp_accepting > 0));
+
+	LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+	conn->ibc_connvars = NULL;
+
+	if (status != 0) {
+		/* failed to establish connection */
+		kiblnd_peer_connect_failed(peer, active, status);
+		kiblnd_finalise_conn(conn);
+		return;
+	}
+
+	/* connection established */
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	conn->ibc_last_send = jiffies;
+	kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+	kiblnd_peer_alive(peer);
+
+	/* Add conn to peer's list and nuke any dangling conns from a different
+	 * peer instance... */
+	kiblnd_conn_addref(conn);	       /* +1 ref for ibc_list */
+	list_add(&conn->ibc_list, &peer->ibp_conns);
+	if (active)
+		peer->ibp_connecting--;
+	else
+		peer->ibp_accepting--;
+
+	if (peer->ibp_version == 0) {
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+
+	if (peer->ibp_version     != conn->ibc_version ||
+	    peer->ibp_incarnation != conn->ibc_incarnation) {
+		kiblnd_close_stale_conns_locked(peer, conn->ibc_version,
+						conn->ibc_incarnation);
+		peer->ibp_version     = conn->ibc_version;
+		peer->ibp_incarnation = conn->ibc_incarnation;
+	}
+
+	/* grab pending txs while I have the lock */
+	list_add(&txs, &peer->ibp_tx_queue);
+	list_del_init(&peer->ibp_tx_queue);
+
+	if (!kiblnd_peer_active(peer) ||	/* peer has been deleted */
+	    conn->ibc_comms_error != 0) {       /* error has happened already */
+		lnet_ni_t *ni = peer->ibp_ni;
+
+		/* start to shut down connection */
+		kiblnd_close_conn_locked(conn, -ECONNABORTED);
+		write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+		kiblnd_txlist_done(ni, &txs, -ECONNABORTED);
+
+		return;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* Schedule blocked txs */
+	spin_lock(&conn->ibc_lock);
+	while (!list_empty(&txs)) {
+		tx = list_entry(txs.next, kib_tx_t, tx_list);
+		list_del(&tx->tx_list);
+
+		kiblnd_queue_tx_locked(tx, conn);
+	}
+	spin_unlock(&conn->ibc_lock);
+
+	kiblnd_check_sends(conn);
+
+	/* schedule blocked rxs */
+	kiblnd_handle_early_rxs(conn);
+}
+
+void
+kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej)
+{
+	int	  rc;
+
+	rc = rdma_reject(cmid, rej, sizeof(*rej));
+
+	if (rc != 0)
+		CWARN("Error %d sending reject\n", rc);
+}
+
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+	rwlock_t		*g_lock = &kiblnd_data.kib_global_lock;
+	kib_msg_t	     *reqmsg = priv;
+	kib_msg_t	     *ackmsg;
+	kib_dev_t	     *ibdev;
+	kib_peer_t	    *peer;
+	kib_peer_t	    *peer2;
+	kib_conn_t	    *conn;
+	lnet_ni_t	     *ni  = NULL;
+	kib_net_t	     *net = NULL;
+	lnet_nid_t	     nid;
+	struct rdma_conn_param cp;
+	kib_rej_t	      rej;
+	int		    version = IBLND_MSG_VERSION;
+	unsigned long	  flags;
+	int		    rc;
+	struct sockaddr_in    *peer_addr;
+	LASSERT (!in_interrupt());
+
+	/* cmid inherits 'context' from the corresponding listener id */
+	ibdev = (kib_dev_t *)cmid->context;
+	LASSERT (ibdev != NULL);
+
+	memset(&rej, 0, sizeof(rej));
+	rej.ibr_magic		= IBLND_MSG_MAGIC;
+	rej.ibr_why		  = IBLND_REJECT_FATAL;
+	rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+	peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+	if (*kiblnd_tunables.kib_require_priv_port &&
+	    ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+		__u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+		CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+		       HIPQUAD(ip), ntohs(peer_addr->sin_port));
+		goto failed;
+	}
+
+	if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+		CERROR("Short connection request\n");
+		goto failed;
+	}
+
+	/* Future protocol version compatibility support!  If the
+	 * o2iblnd-specific protocol changes, or when LNET unifies
+	 * protocols over all LNDs, the initial connection will
+	 * negotiate a protocol version.  I trap this here to avoid
+	 * console errors; the reject tells the peer which protocol I
+	 * speak. */
+	if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+	    reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+		goto failed;
+	if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION &&
+	    reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+		goto failed;
+	if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+	    reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+		goto failed;
+
+	rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+	if (rc != 0) {
+		CERROR("Can't parse connection request: %d\n", rc);
+		goto failed;
+	}
+
+	nid = reqmsg->ibm_srcnid;
+	ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+
+	if (ni != NULL) {
+		net = (kib_net_t *)ni->ni_data;
+		rej.ibr_incarnation = net->ibn_incarnation;
+	}
+
+	if (ni == NULL ||			 /* no matching net */
+	    ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+	    net->ibn_dev != ibdev) {	      /* wrong device */
+		CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): "
+		       "bad dst nid %s\n", libcfs_nid2str(nid),
+		       ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+		       ibdev->ibd_ifname, ibdev->ibd_nnets,
+		       HIPQUAD(ibdev->ibd_ifip),
+		       libcfs_nid2str(reqmsg->ibm_dstnid));
+
+		goto failed;
+	}
+
+       /* check time stamp as soon as possible */
+	if (reqmsg->ibm_dststamp != 0 &&
+	    reqmsg->ibm_dststamp != net->ibn_incarnation) {
+		CWARN("Stale connection request\n");
+		rej.ibr_why = IBLND_REJECT_CONN_STALE;
+		goto failed;
+	}
+
+	/* I can accept peer's version */
+	version = reqmsg->ibm_version;
+
+	if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+		CERROR("Unexpected connreq msg type: %x from %s\n",
+		       reqmsg->ibm_type, libcfs_nid2str(nid));
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_queue_depth !=
+	    IBLND_MSG_QUEUE_SIZE(version)) {
+		CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+		       libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
+
+		goto failed;
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_frags !=
+	    IBLND_RDMA_FRAGS(version)) {
+		CERROR("Can't accept %s(version %x): "
+		       "incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(nid), version,
+		       reqmsg->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(version));
+
+		if (version == IBLND_MSG_VERSION)
+			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
+
+		goto failed;
+
+	}
+
+	if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+		CERROR("Can't accept %s: message size %d too big (%d max)\n",
+		       libcfs_nid2str(nid),
+		       reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		goto failed;
+	}
+
+	/* assume 'nid' is a new peer; create  */
+	rc = kiblnd_create_peer(ni, &peer, nid);
+	if (rc != 0) {
+		CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	write_lock_irqsave(g_lock, flags);
+
+	peer2 = kiblnd_find_peer_locked(nid);
+	if (peer2 != NULL) {
+		if (peer2->ibp_version == 0) {
+			peer2->ibp_version     = version;
+			peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
+		}
+
+		/* not the guy I've talked with */
+		if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
+		    peer2->ibp_version     != version) {
+			kiblnd_close_peer_conns_locked(peer2, -ESTALE);
+			write_unlock_irqrestore(g_lock, flags);
+
+			CWARN("Conn stale %s [old ver: %x, new ver: %x]\n",
+			      libcfs_nid2str(nid), peer2->ibp_version, version);
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_STALE;
+			goto failed;
+		}
+
+		/* tie-break connection race in favour of the higher NID */
+		if (peer2->ibp_connecting != 0 &&
+		    nid < ni->ni_nid) {
+			write_unlock_irqrestore(g_lock, flags);
+
+			CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid));
+
+			kiblnd_peer_decref(peer);
+			rej.ibr_why = IBLND_REJECT_CONN_RACE;
+			goto failed;
+		}
+
+		peer2->ibp_accepting++;
+		kiblnd_peer_addref(peer2);
+
+		write_unlock_irqrestore(g_lock, flags);
+		kiblnd_peer_decref(peer);
+		peer = peer2;
+	} else {
+		/* Brand new peer */
+		LASSERT (peer->ibp_accepting == 0);
+		LASSERT (peer->ibp_version == 0 &&
+			 peer->ibp_incarnation == 0);
+
+		peer->ibp_accepting   = 1;
+		peer->ibp_version     = version;
+		peer->ibp_incarnation = reqmsg->ibm_srcstamp;
+
+		/* I have a ref on ni that prevents it being shutdown */
+		LASSERT (net->ibn_shutdown == 0);
+
+		kiblnd_peer_addref(peer);
+		list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+		write_unlock_irqrestore(g_lock, flags);
+	}
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+		kiblnd_peer_decref(peer);
+		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
+		goto failed;
+	}
+
+	/* conn now "owns" cmid, so I return success from here on to ensure the
+	 * CM callback doesn't destroy cmid. */
+
+	conn->ibc_incarnation      = reqmsg->ibm_srcstamp;
+	conn->ibc_credits	  = IBLND_MSG_QUEUE_SIZE(version);
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version);
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version)
+		 <= IBLND_RX_MSGS(version));
+
+	ackmsg = &conn->ibc_connvars->cv_msg;
+	memset(ackmsg, 0, sizeof(*ackmsg));
+
+	kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+			sizeof(ackmsg->ibm_u.connparams));
+	ackmsg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+	ackmsg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+
+	kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.private_data	= ackmsg;
+	cp.private_data_len    = ackmsg->ibm_nob;
+	cp.responder_resources = 0;	     /* No atomic ops or RDMA reads */
+	cp.initiator_depth     = 0;
+	cp.flow_control	= 1;
+	cp.retry_count	 = *kiblnd_tunables.kib_retry_count;
+	cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+	rc = rdma_accept(cmid, &cp);
+	if (rc != 0) {
+		CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+		rej.ibr_version = version;
+		rej.ibr_why     = IBLND_REJECT_FATAL;
+
+		kiblnd_reject(cmid, &rej);
+		kiblnd_connreq_done(conn, rc);
+		kiblnd_conn_decref(conn);
+	}
+
+	lnet_ni_decref(ni);
+	return 0;
+
+ failed:
+	if (ni != NULL)
+		lnet_ni_decref(ni);
+
+	rej.ibr_version = version;
+	rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
+	rej.ibr_cp.ibcp_max_frags   = IBLND_RDMA_FRAGS(version);
+	kiblnd_reject(cmid, &rej);
+
+	return -ECONNREFUSED;
+}
+
+void
+kiblnd_reconnect (kib_conn_t *conn, int version,
+		  __u64 incarnation, int why, kib_connparams_t *cp)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+	char	  *reason;
+	int	    retry = 0;
+	unsigned long  flags;
+
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+	LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	/* retry connection if it's still needed and no other connection
+	 * attempts (active or passive) are in progress
+	 * NB: reconnect is still needed even when ibp_tx_queue is
+	 * empty if ibp_version != version because reconnect may be
+	 * initiated by kiblnd_query() */
+	if ((!list_empty(&peer->ibp_tx_queue) ||
+	     peer->ibp_version != version) &&
+	    peer->ibp_connecting == 1 &&
+	    peer->ibp_accepting == 0) {
+		retry = 1;
+		peer->ibp_connecting++;
+
+		peer->ibp_version     = version;
+		peer->ibp_incarnation = incarnation;
+	}
+
+	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (!retry)
+		return;
+
+	switch (why) {
+	default:
+		reason = "Unknown";
+		break;
+
+	case IBLND_REJECT_CONN_STALE:
+		reason = "stale";
+		break;
+
+	case IBLND_REJECT_CONN_RACE:
+		reason = "conn race";
+		break;
+
+	case IBLND_REJECT_CONN_UNCOMPAT:
+		reason = "version negotiation";
+		break;
+	}
+
+	CNETERR("%s: retrying (%s), %x, %x, "
+		"queue_dep: %d, max_frag: %d, msg_size: %d\n",
+		libcfs_nid2str(peer->ibp_nid),
+		reason, IBLND_MSG_VERSION, version,
+		cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version),
+		cp != NULL? cp->ibcp_max_frags   : IBLND_RDMA_FRAGS(version),
+		cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE);
+
+	kiblnd_connect_peer(peer);
+}
+
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+
+	LASSERT (!in_interrupt());
+	LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+	switch (reason) {
+	case IB_CM_REJ_STALE_CONN:
+		kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0,
+				 IBLND_REJECT_CONN_STALE, NULL);
+		break;
+
+	case IB_CM_REJ_INVALID_SERVICE_ID:
+		CNETERR("%s rejected: no listener at %d\n",
+			libcfs_nid2str(peer->ibp_nid),
+			*kiblnd_tunables.kib_service);
+		break;
+
+	case IB_CM_REJ_CONSUMER_DEFINED:
+		if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) {
+			kib_rej_t	*rej	 = priv;
+			kib_connparams_t *cp	  = NULL;
+			int	       flip	= 0;
+			__u64	     incarnation = -1;
+
+			/* NB. default incarnation is -1 because:
+			 * a) V1 will ignore dst incarnation in connreq.
+			 * b) V2 will provide incarnation while rejecting me,
+			 *    -1 will be overwrote.
+			 *
+			 * if I try to connect to a V1 peer with V2 protocol,
+			 * it rejected me then upgrade to V2, I have no idea
+			 * about the upgrading and try to reconnect with V1,
+			 * in this case upgraded V2 can find out I'm trying to
+			 * talk to the old guy and reject me(incarnation is -1).
+			 */
+
+			if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+			    rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+				__swab32s(&rej->ibr_magic);
+				__swab16s(&rej->ibr_version);
+				flip = 1;
+			}
+
+			if (priv_nob >= sizeof(kib_rej_t) &&
+			    rej->ibr_version > IBLND_MSG_VERSION_1) {
+				/* priv_nob is always 148 in current version
+				 * of OFED, so we still need to check version.
+				 * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */
+				cp = &rej->ibr_cp;
+
+				if (flip) {
+					__swab64s(&rej->ibr_incarnation);
+					__swab16s(&cp->ibcp_queue_depth);
+					__swab16s(&cp->ibcp_max_frags);
+					__swab32s(&cp->ibcp_max_msg_size);
+				}
+
+				incarnation = rej->ibr_incarnation;
+			}
+
+			if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+			    rej->ibr_magic != LNET_PROTO_MAGIC) {
+				CERROR("%s rejected: consumer defined fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+			}
+
+			if (rej->ibr_version != IBLND_MSG_VERSION &&
+			    rej->ibr_version != IBLND_MSG_VERSION_1) {
+				CERROR("%s rejected: o2iblnd version %x error\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_version);
+				break;
+			}
+
+			if (rej->ibr_why     == IBLND_REJECT_FATAL &&
+			    rej->ibr_version == IBLND_MSG_VERSION_1) {
+				CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
+				       libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
+
+				if (conn->ibc_version != IBLND_MSG_VERSION_1)
+					rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
+			}
+
+			switch (rej->ibr_why) {
+			case IBLND_REJECT_CONN_RACE:
+			case IBLND_REJECT_CONN_STALE:
+			case IBLND_REJECT_CONN_UNCOMPAT:
+				kiblnd_reconnect(conn, rej->ibr_version,
+						 incarnation, rej->ibr_why, cp);
+				break;
+
+			case IBLND_REJECT_MSG_QUEUE_SIZE:
+				CERROR("%s rejected: incompatible message queue depth %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth,
+				       IBLND_MSG_QUEUE_SIZE(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_RDMA_FRAGS:
+				CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n",
+				       libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags,
+				       IBLND_RDMA_FRAGS(conn->ibc_version));
+				break;
+
+			case IBLND_REJECT_NO_RESOURCES:
+				CERROR("%s rejected: o2iblnd no resources\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			case IBLND_REJECT_FATAL:
+				CERROR("%s rejected: o2iblnd fatal error\n",
+				       libcfs_nid2str(peer->ibp_nid));
+				break;
+
+			default:
+				CERROR("%s rejected: o2iblnd reason %d\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       rej->ibr_why);
+				break;
+			}
+			break;
+		}
+		/* fall through */
+	default:
+		CNETERR("%s rejected: reason %d, size %d\n",
+			libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+		break;
+	}
+
+	kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+	kib_peer_t    *peer = conn->ibc_peer;
+	lnet_ni_t     *ni   = peer->ibp_ni;
+	kib_net_t     *net  = ni->ni_data;
+	kib_msg_t     *msg  = priv;
+	int	    ver  = conn->ibc_version;
+	int	    rc   = kiblnd_unpack_msg(msg, priv_nob);
+	unsigned long  flags;
+
+	LASSERT (net != NULL);
+
+	if (rc != 0) {
+		CERROR("Can't unpack connack from %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+		goto failed;
+	}
+
+	if (msg->ibm_type != IBLND_MSG_CONNACK) {
+		CERROR("Unexpected message %d from %s\n",
+		       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+		rc = -EPROTO;
+		goto failed;
+	}
+
+	if (ver != msg->ibm_version) {
+		CERROR("%s replied version %x is different with "
+		       "requested version %x\n",
+		       libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
+		rc = -EPROTO;
+		goto failed;
+	}
+
+	if (msg->ibm_u.connparams.ibcp_queue_depth !=
+	    IBLND_MSG_QUEUE_SIZE(ver)) {
+		CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_queue_depth,
+		       IBLND_MSG_QUEUE_SIZE(ver));
+		rc = -EPROTO;
+		goto failed;
+	}
+
+	if (msg->ibm_u.connparams.ibcp_max_frags !=
+	    IBLND_RDMA_FRAGS(ver)) {
+		CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_max_frags,
+		       IBLND_RDMA_FRAGS(ver));
+		rc = -EPROTO;
+		goto failed;
+	}
+
+	if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+		CERROR("%s max message size %d too big (%d max)\n",
+		       libcfs_nid2str(peer->ibp_nid),
+		       msg->ibm_u.connparams.ibcp_max_msg_size,
+		       IBLND_MSG_SIZE);
+		rc = -EPROTO;
+		goto failed;
+	}
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+	if (msg->ibm_dstnid == ni->ni_nid &&
+	    msg->ibm_dststamp == net->ibn_incarnation)
+		rc = 0;
+	else
+		rc = -ESTALE;
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	if (rc != 0) {
+		CERROR("Bad connection reply from %s, rc = %d, "
+		       "version: %x max_frags: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc,
+		       msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags);
+		goto failed;
+	}
+
+	conn->ibc_incarnation      = msg->ibm_srcstamp;
+	conn->ibc_credits	  =
+	conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver);
+	LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver)
+		 <= IBLND_RX_MSGS(ver));
+
+	kiblnd_connreq_done(conn, 0);
+	return;
+
+ failed:
+	/* NB My QP has already established itself, so I handle anything going
+	 * wrong here by setting ibc_comms_error.
+	 * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+	 * immediately tears it down. */
+
+	LASSERT (rc != 0);
+	conn->ibc_comms_error = rc;
+	kiblnd_connreq_done(conn, 0);
+}
+
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+	kib_peer_t	      *peer = (kib_peer_t *)cmid->context;
+	kib_conn_t	      *conn;
+	kib_msg_t	       *msg;
+	struct rdma_conn_param   cp;
+	int		      version;
+	__u64		    incarnation;
+	unsigned long	    flags;
+	int		      rc;
+
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	incarnation = peer->ibp_incarnation;
+	version     = (peer->ibp_version == 0) ? IBLND_MSG_VERSION :
+						 peer->ibp_version;
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version);
+	if (conn == NULL) {
+		kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+		kiblnd_peer_decref(peer); /* lose cmid's ref */
+		return -ENOMEM;
+	}
+
+	/* conn "owns" cmid now, so I return success from here on to ensure the
+	 * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+	 * on peer */
+
+	msg = &conn->ibc_connvars->cv_msg;
+
+	memset(msg, 0, sizeof(*msg));
+	kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+	msg->ibm_u.connparams.ibcp_queue_depth  = IBLND_MSG_QUEUE_SIZE(version);
+	msg->ibm_u.connparams.ibcp_max_frags    = IBLND_RDMA_FRAGS(version);
+	msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+
+	kiblnd_pack_msg(peer->ibp_ni, msg, version,
+			0, peer->ibp_nid, incarnation);
+
+	memset(&cp, 0, sizeof(cp));
+	cp.private_data	= msg;
+	cp.private_data_len    = msg->ibm_nob;
+	cp.responder_resources = 0;	     /* No atomic ops or RDMA reads */
+	cp.initiator_depth     = 0;
+	cp.flow_control	= 1;
+	cp.retry_count	 = *kiblnd_tunables.kib_retry_count;
+	cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+	LASSERT(cmid->context == (void *)conn);
+	LASSERT(conn->ibc_cmid == cmid);
+
+	rc = rdma_connect(cmid, &cp);
+	if (rc != 0) {
+		CERROR("Can't connect to %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), rc);
+		kiblnd_connreq_done(conn, rc);
+		kiblnd_conn_decref(conn);
+	}
+
+	return 0;
+}
+
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+	kib_peer_t  *peer;
+	kib_conn_t  *conn;
+	int	  rc;
+
+	switch (event->event) {
+	default:
+		CERROR("Unexpected event: %d, status: %d\n",
+		       event->event, event->status);
+		LBUG();
+
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		/* destroy cmid on failure */
+		rc = kiblnd_passive_connect(cmid,
+					    (void *)KIBLND_CONN_PARAM(event),
+					    KIBLND_CONN_PARAM_LEN(event));
+		CDEBUG(D_NET, "connreq: %d\n", rc);
+		return rc;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		peer = (kib_peer_t *)cmid->context;
+		CNETERR("%s: ADDR ERROR %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+
+		CDEBUG(D_NET,"%s Addr resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		if (event->status != 0) {
+			CNETERR("Can't resolve address for %s: %d\n",
+				libcfs_nid2str(peer->ibp_nid), event->status);
+			rc = event->status;
+		} else {
+			rc = rdma_resolve_route(
+				cmid, *kiblnd_tunables.kib_timeout * 1000);
+			if (rc == 0)
+				return 0;
+			/* Can't initiate route resolution */
+			CERROR("Can't resolve route for %s: %d\n",
+			       libcfs_nid2str(peer->ibp_nid), rc);
+		}
+		kiblnd_peer_connect_failed(peer, 1, rc);
+		kiblnd_peer_decref(peer);
+		return rc;		      /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		peer = (kib_peer_t *)cmid->context;
+		CNETERR("%s: ROUTE ERROR %d\n",
+			libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+		kiblnd_peer_decref(peer);
+		return -EHOSTUNREACH;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		peer = (kib_peer_t *)cmid->context;
+		CDEBUG(D_NET,"%s Route resolved: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+
+		if (event->status == 0)
+			return kiblnd_active_connect(cmid);
+
+		CNETERR("Can't resolve route for %s: %d\n",
+		       libcfs_nid2str(peer->ibp_nid), event->status);
+		kiblnd_peer_connect_failed(peer, 1, event->status);
+		kiblnd_peer_decref(peer);
+		return event->status;	   /* rc != 0 destroys cmid */
+
+	case RDMA_CM_EVENT_UNREACHABLE:
+		conn = (kib_conn_t *)cmid->context;
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: UNREACHABLE %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENETDOWN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		conn = (kib_conn_t *)cmid->context;
+		LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+			conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+		CNETERR("%s: CONNECT ERROR %d\n",
+			libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+		kiblnd_connreq_done(conn, -ENOTCONN);
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_REJECTED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CERROR ("%s: REJECTED %d\n",
+				libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				event->status);
+			kiblnd_connreq_done(conn, -ECONNRESET);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:
+			kiblnd_rejected(conn, event->status,
+					(void *)KIBLND_CONN_PARAM(event),
+					KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		kiblnd_conn_decref(conn);
+		return 0;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		conn = (kib_conn_t *)cmid->context;
+		switch (conn->ibc_state) {
+		default:
+			LBUG();
+
+		case IBLND_CONN_PASSIVE_WAIT:
+			CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, 0);
+			break;
+
+		case IBLND_CONN_ACTIVE_CONNECT:
+			CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_check_connreply(conn,
+					       (void *)KIBLND_CONN_PARAM(event),
+					       KIBLND_CONN_PARAM_LEN(event));
+			break;
+		}
+		/* net keeps its ref on conn! */
+		return 0;
+
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n");
+		return 0;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		conn = (kib_conn_t *)cmid->context;
+		if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+			CERROR("%s DISCONNECTED\n",
+			       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+			kiblnd_connreq_done(conn, -ECONNRESET);
+		} else {
+			kiblnd_close_conn(conn, 0);
+		}
+		kiblnd_conn_decref(conn);
+		cmid->context = NULL;
+		return 0;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		LCONSOLE_ERROR_MSG(0x131,
+				   "Received notification of device removal\n"
+				   "Please shutdown LNET to allow this to proceed\n");
+		/* Can't remove network from underneath LNET for now, so I have
+		 * to ignore this */
+		return 0;
+
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		LCONSOLE_INFO("Physical link changed (eg hca/port)\n");
+		return 0;
+	}
+}
+
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs)
+{
+	kib_tx_t	  *tx;
+	struct list_head	*ttmp;
+
+	list_for_each (ttmp, txs) {
+		tx = list_entry (ttmp, kib_tx_t, tx_list);
+
+		if (txs != &conn->ibc_active_txs) {
+			LASSERT (tx->tx_queued);
+		} else {
+			LASSERT (!tx->tx_queued);
+			LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+		}
+
+		if (cfs_time_aftereq (jiffies, tx->tx_deadline)) {
+			CERROR("Timed out tx: %s, %lu seconds\n",
+			       kiblnd_queue2str(conn, txs),
+			       cfs_duration_sec(jiffies - tx->tx_deadline));
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
+{
+	return  kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+		kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
+}
+
+void
+kiblnd_check_conns (int idx)
+{
+	LIST_HEAD (closes);
+	LIST_HEAD (checksends);
+	struct list_head    *peers = &kiblnd_data.kib_peers[idx];
+	struct list_head    *ptmp;
+	kib_peer_t    *peer;
+	kib_conn_t    *conn;
+	struct list_head    *ctmp;
+	unsigned long  flags;
+
+	/* NB. We expect to have a look at all the peers and not find any
+	 * RDMAs to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+	list_for_each (ptmp, peers) {
+		peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+		list_for_each (ctmp, &peer->ibp_conns) {
+			int timedout;
+			int sendnoop;
+
+			conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+			LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+			spin_lock(&conn->ibc_lock);
+
+			sendnoop = kiblnd_need_noop(conn);
+			timedout = kiblnd_conn_timed_out_locked(conn);
+			if (!sendnoop && !timedout) {
+				spin_unlock(&conn->ibc_lock);
+				continue;
+			}
+
+			if (timedout) {
+				CERROR("Timed out RDMA with %s (%lu): "
+				       "c: %u, oc: %u, rc: %u\n",
+				       libcfs_nid2str(peer->ibp_nid),
+				       cfs_duration_sec(cfs_time_current() -
+							peer->ibp_last_alive),
+				       conn->ibc_credits,
+				       conn->ibc_outstanding_credits,
+				       conn->ibc_reserved_credits);
+				list_add(&conn->ibc_connd_list, &closes);
+			} else {
+				list_add(&conn->ibc_connd_list,
+					     &checksends);
+			}
+			/* +ref for 'closes' or 'checksends' */
+			kiblnd_conn_addref(conn);
+
+			spin_unlock(&conn->ibc_lock);
+		}
+	}
+
+	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+	/* Handle timeout by closing the whole
+	 * connection. We can only be sure RDMA activity
+	 * has ceased once the QP has been modified. */
+	while (!list_empty(&closes)) {
+		conn = list_entry(closes.next,
+				      kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+		kiblnd_close_conn(conn, -ETIMEDOUT);
+		kiblnd_conn_decref(conn);
+	}
+
+	/* In case we have enough credits to return via a
+	 * NOOP, but there were no non-blocking tx descs
+	 * free to do it last time... */
+	while (!list_empty(&checksends)) {
+		conn = list_entry(checksends.next,
+				      kib_conn_t, ibc_connd_list);
+		list_del(&conn->ibc_connd_list);
+		kiblnd_check_sends(conn);
+		kiblnd_conn_decref(conn);
+	}
+}
+
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+	LASSERT (!in_interrupt());
+	LASSERT (current == kiblnd_data.kib_connd);
+	LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+	rdma_disconnect(conn->ibc_cmid);
+	kiblnd_finalise_conn(conn);
+
+	kiblnd_peer_notify(conn->ibc_peer);
+}
+
+int
+kiblnd_connd (void *arg)
+{
+	wait_queue_t     wait;
+	unsigned long      flags;
+	kib_conn_t	*conn;
+	int		timeout;
+	int		i;
+	int		dropped_lock;
+	int		peer_index = 0;
+	unsigned long      deadline = jiffies;
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+	kiblnd_data.kib_connd = current;
+
+	spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+
+		dropped_lock = 0;
+
+		if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+			conn = list_entry(kiblnd_data. \
+					      kib_connd_zombies.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_destroy_conn(conn);
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (!list_empty(&kiblnd_data.kib_connd_conns)) {
+			conn = list_entry(kiblnd_data.kib_connd_conns.next,
+					      kib_conn_t, ibc_list);
+			list_del(&conn->ibc_list);
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock,
+					       flags);
+			dropped_lock = 1;
+
+			kiblnd_disconnect_conn(conn);
+			kiblnd_conn_decref(conn);
+
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		/* careful with the jiffy wrap... */
+		timeout = (int)(deadline - jiffies);
+		if (timeout <= 0) {
+			const int n = 4;
+			const int p = 1;
+			int       chunk = kiblnd_data.kib_peer_hash_size;
+
+			spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+			dropped_lock = 1;
+
+			/* Time to check for RDMA timeouts on a few more
+			 * peers: I do checks every 'p' seconds on a
+			 * proportion of the peer table and I need to check
+			 * every connection 'n' times within a timeout
+			 * interval, to ensure I detect a timeout on any
+			 * connection within (n+1)/n times the timeout
+			 * interval. */
+
+			if (*kiblnd_tunables.kib_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*kiblnd_tunables.kib_timeout;
+			if (chunk == 0)
+				chunk = 1;
+
+			for (i = 0; i < chunk; i++) {
+				kiblnd_check_conns(peer_index);
+				peer_index = (peer_index + 1) %
+					     kiblnd_data.kib_peer_hash_size;
+			}
+
+			deadline += p * HZ;
+			spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+		}
+
+		if (dropped_lock)
+			continue;
+
+		/* Nothing to do for 'timeout'  */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
+		spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;
+
+	switch (event->event) {
+	case IB_EVENT_COMM_EST:
+		CDEBUG(D_NET, "%s established\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		return;
+
+	default:
+		CERROR("%s: Async QP event type %d\n",
+		       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+		return;
+	}
+}
+
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+	switch (kiblnd_wreqid2type(wc->wr_id)) {
+	default:
+		LBUG();
+
+	case IBLND_WID_RDMA:
+		/* We only get RDMA completion notification if it fails.  All
+		 * subsequent work items, including the final SEND will fail
+		 * too.  However we can't print out any more info about the
+		 * failing RDMA because 'tx' might be back on the idle list or
+		 * even reused already if we didn't manage to post all our work
+		 * items */
+		CNETERR("RDMA (tx: %p) failed: %d\n",
+			kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		return;
+
+	case IBLND_WID_TX:
+		kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+		return;
+
+	case IBLND_WID_RX:
+		kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+				   wc->byte_len);
+		return;
+	}
+}
+
+void
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
+{
+	/* NB I'm not allowed to schedule this conn once its refcount has
+	 * reached 0.  Since fundamentally I'm racing with scheduler threads
+	 * consuming my CQ I could be called after all completions have
+	 * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+	 * and this CQ is about to be destroyed so I NOOP. */
+	kib_conn_t		*conn = (kib_conn_t *)arg;
+	struct kib_sched_info	*sched = conn->ibc_sched;
+	unsigned long		flags;
+
+	LASSERT(cq == conn->ibc_cq);
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	conn->ibc_ready = 1;
+
+	if (!conn->ibc_scheduled &&
+	    (conn->ibc_nrx > 0 ||
+	     conn->ibc_nsends_posted > 0)) {
+		kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+		conn->ibc_scheduled = 1;
+		list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+		if (waitqueue_active(&sched->ibs_waitq))
+			wake_up(&sched->ibs_waitq);
+	}
+
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+}
+
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+	kib_conn_t *conn = arg;
+
+	CERROR("%s: async CQ event type %d\n",
+	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+int
+kiblnd_scheduler(void *arg)
+{
+	long			id = (long)arg;
+	struct kib_sched_info	*sched;
+	kib_conn_t		*conn;
+	wait_queue_t		wait;
+	unsigned long		flags;
+	struct ib_wc		wc;
+	int			did_something;
+	int			busy_loops = 0;
+	int			rc;
+
+	cfs_block_allsigs();
+
+	init_waitqueue_entry_current(&wait);
+
+	sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
+
+	rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+	if (rc != 0) {
+		CWARN("Failed to bind on CPT %d, please verify whether "
+		      "all CPUs are healthy and reload modules if necessary, "
+		      "otherwise your system might under risk of low "
+		      "performance\n", sched->ibs_cpt);
+	}
+
+	spin_lock_irqsave(&sched->ibs_lock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		if (busy_loops++ >= IBLND_RESCHED) {
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			cond_resched();
+			busy_loops = 0;
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+		}
+
+		did_something = 0;
+
+		if (!list_empty(&sched->ibs_conns)) {
+			conn = list_entry(sched->ibs_conns.next,
+					      kib_conn_t, ibc_sched_list);
+			/* take over kib_sched_conns' ref on conn... */
+			LASSERT(conn->ibc_scheduled);
+			list_del(&conn->ibc_sched_list);
+			conn->ibc_ready = 0;
+
+			spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+			rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			if (rc == 0) {
+				rc = ib_req_notify_cq(conn->ibc_cq,
+						      IB_CQ_NEXT_COMP);
+				if (rc < 0) {
+					CWARN("%s: ib_req_notify_cq failed: %d, "
+					      "closing connection\n",
+					      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+					kiblnd_close_conn(conn, -EIO);
+					kiblnd_conn_decref(conn);
+					spin_lock_irqsave(&sched->ibs_lock,
+							      flags);
+					continue;
+				}
+
+				rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+			}
+
+			if (rc < 0) {
+				CWARN("%s: ib_poll_cq failed: %d, "
+				      "closing connection\n",
+				      libcfs_nid2str(conn->ibc_peer->ibp_nid),
+				      rc);
+				kiblnd_close_conn(conn, -EIO);
+				kiblnd_conn_decref(conn);
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+				continue;
+			}
+
+			spin_lock_irqsave(&sched->ibs_lock, flags);
+
+			if (rc != 0 || conn->ibc_ready) {
+				/* There may be another completion waiting; get
+				 * another scheduler to check while I handle
+				 * this one... */
+				/* +1 ref for sched_conns */
+				kiblnd_conn_addref(conn);
+				list_add_tail(&conn->ibc_sched_list,
+						  &sched->ibs_conns);
+				if (waitqueue_active(&sched->ibs_waitq))
+					wake_up(&sched->ibs_waitq);
+			} else {
+				conn->ibc_scheduled = 0;
+			}
+
+			if (rc != 0) {
+				spin_unlock_irqrestore(&sched->ibs_lock, flags);
+				kiblnd_complete(&wc);
+
+				spin_lock_irqsave(&sched->ibs_lock, flags);
+			}
+
+			kiblnd_conn_decref(conn); /* ...drop my ref from above */
+			did_something = 1;
+		}
+
+		if (did_something)
+			continue;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
+		spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+		waitq_wait(&wait, TASK_INTERRUPTIBLE);
+		busy_loops = 0;
+
+		remove_wait_queue(&sched->ibs_waitq, &wait);
+		set_current_state(TASK_RUNNING);
+		spin_lock_irqsave(&sched->ibs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
+
+int
+kiblnd_failover_thread(void *arg)
+{
+	rwlock_t		*glock = &kiblnd_data.kib_global_lock;
+	kib_dev_t	 *dev;
+	wait_queue_t     wait;
+	unsigned long      flags;
+	int		rc;
+
+	LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current(&wait);
+	write_lock_irqsave(glock, flags);
+
+	while (!kiblnd_data.kib_shutdown) {
+		int     do_failover = 0;
+		int     long_sleep;
+
+		list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+				    ibd_fail_list) {
+			if (cfs_time_before(cfs_time_current(),
+					    dev->ibd_next_failover))
+				continue;
+			do_failover = 1;
+			break;
+		}
+
+		if (do_failover) {
+			list_del_init(&dev->ibd_fail_list);
+			dev->ibd_failover = 1;
+			write_unlock_irqrestore(glock, flags);
+
+			rc = kiblnd_dev_failover(dev);
+
+			write_lock_irqsave(glock, flags);
+
+			LASSERT (dev->ibd_failover);
+			dev->ibd_failover = 0;
+			if (rc >= 0) { /* Device is OK or failover succeed */
+				dev->ibd_next_failover = cfs_time_shift(3);
+				continue;
+			}
+
+			/* failed to failover, retry later */
+			dev->ibd_next_failover =
+				cfs_time_shift(min(dev->ibd_failed_failover, 10));
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+
+			continue;
+		}
+
+		/* long sleep if no more pending failover */
+		long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_unlock_irqrestore(glock, flags);
+
+		rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+						   cfs_time_seconds(1));
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
+		write_lock_irqsave(glock, flags);
+
+		if (!long_sleep || rc != 0)
+			continue;
+
+		/* have a long sleep, routine check all active devices,
+		 * we need checking like this because if there is not active
+		 * connection on the dev and no SEND from local, we may listen
+		 * on wrong HCA for ever while there is a bonding failover */
+		list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+			if (kiblnd_dev_can_failover(dev)) {
+				list_add_tail(&dev->ibd_fail_list,
+					      &kiblnd_data.kib_failed_devs);
+			}
+		}
+	}
+
+	write_unlock_irqrestore(glock, flags);
+
+	kiblnd_thread_fini();
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644
index 000000000000..e21028b72302
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -0,0 +1,493 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
+ *
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+		"service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+		"set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+		"timeout (seconds)");
+
+/* Number of threads in each scheduler pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's set to zero. */
+static int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+		"number of threads in each scheduler pool");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+		"# of message descriptors allocated for each pool");
+
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444,
+		"when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+		"IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+		"Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+		"RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+		"IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends = 0;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+		"send work-queue sizing");
+
+static int map_on_demand = 0;
+CFS_MODULE_PARM(map_on_demand, "i", int, 0444,
+		"map on demand");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+		"size of fmr pool on each CPT (>= ntx / 4)");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+		"# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+		"non-zero to enable FMR caching");
+
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int pmr_pool_size = 512;
+CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444,
+		"size of MR cache pmr pool on each CPT");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover = 0;
+CFS_MODULE_PARM(dev_failover, "i", int, 0444,
+	       "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+
+static int require_privileged_port = 0;
+CFS_MODULE_PARM(require_privileged_port, "i", int, 0644,
+		"require privileged port when accepting connection");
+
+static int use_privileged_port = 1;
+CFS_MODULE_PARM(use_privileged_port, "i", int, 0644,
+		"use privileged port when initiating connection");
+
+kib_tunables_t kiblnd_tunables = {
+	.kib_dev_failover	   = &dev_failover,
+	.kib_service		= &service,
+	.kib_cksum		  = &cksum,
+	.kib_timeout		= &timeout,
+	.kib_keepalive	      = &keepalive,
+	.kib_ntx		    = &ntx,
+	.kib_credits		= &credits,
+	.kib_peertxcredits	  = &peer_credits,
+	.kib_peercredits_hiw	= &peer_credits_hiw,
+	.kib_peerrtrcredits	 = &peer_buffer_credits,
+	.kib_peertimeout	    = &peer_timeout,
+	.kib_default_ipif	   = &ipif_name,
+	.kib_retry_count	    = &retry_count,
+	.kib_rnr_retry_count	= &rnr_retry_count,
+	.kib_concurrent_sends       = &concurrent_sends,
+	.kib_ib_mtu		 = &ib_mtu,
+	.kib_map_on_demand	  = &map_on_demand,
+	.kib_fmr_pool_size	  = &fmr_pool_size,
+	.kib_fmr_flush_trigger      = &fmr_flush_trigger,
+	.kib_fmr_cache	      = &fmr_cache,
+	.kib_pmr_pool_size	  = &pmr_pool_size,
+	.kib_require_priv_port      = &require_privileged_port,
+	.kib_use_priv_port	    = &use_privileged_port,
+	.kib_nscheds		    = &nscheds
+};
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];
+
+
+enum {
+	O2IBLND_SERVICE  = 1,
+	O2IBLND_CKSUM,
+	O2IBLND_TIMEOUT,
+	O2IBLND_NTX,
+	O2IBLND_CREDITS,
+	O2IBLND_PEER_TXCREDITS,
+	O2IBLND_PEER_CREDITS_HIW,
+	O2IBLND_PEER_RTRCREDITS,
+	O2IBLND_PEER_TIMEOUT,
+	O2IBLND_IPIF_BASENAME,
+	O2IBLND_RETRY_COUNT,
+	O2IBLND_RNR_RETRY_COUNT,
+	O2IBLND_KEEPALIVE,
+	O2IBLND_CONCURRENT_SENDS,
+	O2IBLND_IB_MTU,
+	O2IBLND_MAP_ON_DEMAND,
+	O2IBLND_FMR_POOL_SIZE,
+	O2IBLND_FMR_FLUSH_TRIGGER,
+	O2IBLND_FMR_CACHE,
+	O2IBLND_PMR_POOL_SIZE,
+	O2IBLND_DEV_FAILOVER
+};
+
+static ctl_table_t kiblnd_ctl_table[] = {
+	{
+		.ctl_name = O2IBLND_SERVICE,
+		.procname = "service",
+		.data     = &service,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CKSUM,
+		.procname = "cksum",
+		.data     = &cksum,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_NTX,
+		.procname = "ntx",
+		.data     = &ntx,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CREDITS,
+		.procname = "credits",
+		.data     = &credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &peer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_CREDITS_HIW,
+		.procname = "peer_credits_hiw",
+		.data     = &peer_credits_hiw,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &peer_buffer_credits,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &peer_timeout,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_IPIF_BASENAME,
+		.procname = "ipif_name",
+		.data     = ipif_basename_space,
+		.maxlen   = sizeof(ipif_basename_space),
+		.mode     = 0444,
+		.proc_handler = &proc_dostring
+	},
+	{
+		.ctl_name = O2IBLND_RETRY_COUNT,
+		.procname = "retry_count",
+		.data     = &retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_RNR_RETRY_COUNT,
+		.procname = "rnr_retry_count",
+		.data     = &rnr_retry_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_CONCURRENT_SENDS,
+		.procname = "concurrent_sends",
+		.data     = &concurrent_sends,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_IB_MTU,
+		.procname = "ib_mtu",
+		.data     = &ib_mtu,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_MAP_ON_DEMAND,
+		.procname = "map_on_demand",
+		.data     = &map_on_demand,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+
+	{
+		.ctl_name = O2IBLND_FMR_POOL_SIZE,
+		.procname = "fmr_pool_size",
+		.data     = &fmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_FLUSH_TRIGGER,
+		.procname = "fmr_flush_trigger",
+		.data     = &fmr_flush_trigger,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_FMR_CACHE,
+		.procname = "fmr_cache",
+		.data     = &fmr_cache,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_PMR_POOL_SIZE,
+		.procname = "pmr_pool_size",
+		.data     = &pmr_pool_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{
+		.ctl_name = O2IBLND_DEV_FAILOVER,
+		.procname = "dev_failover",
+		.data     = &dev_failover,
+		.maxlen   = sizeof(int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+	},
+	{0}
+};
+
+static ctl_table_t kiblnd_top_ctl_table[] = {
+	{
+		.ctl_name = CTL_O2IBLND,
+		.procname = "o2iblnd",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = kiblnd_ctl_table
+	},
+	{0}
+};
+
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
+	strncpy(space, str, size);
+	space[size-1] = 0;
+}
+
+void
+kiblnd_sysctl_init (void)
+{
+	kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+			      sizeof(ipif_basename_space));
+
+	kiblnd_tunables.kib_sysctl =
+		cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+	if (kiblnd_tunables.kib_sysctl == NULL)
+		CWARN("Can't setup /proc tunables\n");
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+	if (kiblnd_tunables.kib_sysctl != NULL)
+		unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+int
+kiblnd_tunables_init (void)
+{
+	if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+		CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+		       *kiblnd_tunables.kib_ib_mtu);
+		return -EINVAL;
+	}
+
+	if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
+		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+
+	if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
+		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+
+	if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
+		*kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+
+	if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
+		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+
+	if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
+		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+
+	if (*kiblnd_tunables.kib_map_on_demand < 0 ||
+	    *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
+		*kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+
+	if (*kiblnd_tunables.kib_map_on_demand == 1)
+		*kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
+
+	if (*kiblnd_tunables.kib_concurrent_sends == 0) {
+		if (*kiblnd_tunables.kib_map_on_demand > 0 &&
+		    *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
+			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
+		else
+			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+	}
+
+	if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
+		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+
+	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
+		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+
+	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+		CWARN("Concurrent sends %d is lower than message queue size: %d, "
+		      "performance may drop slightly.\n",
+		      *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+	}
+
+	kiblnd_sysctl_init();
+	return 0;
+}
+
+void
+kiblnd_tunables_fini (void)
+{
+	kiblnd_sysctl_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
new file mode 100644
index 000000000000..6494b2bada05
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_LNET) += ksocklnd.o
+
+ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o
+
+
+
+ccflags-y := -I$(src)/../../include
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
new file mode 100644
index 000000000000..c826bf9d49ac
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -0,0 +1,2902 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd.c
+ *
+ * Author: Zach Brown <zab@zabbo.net>
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ */
+
+#include "socklnd.h"
+
+lnd_t		   the_ksocklnd;
+ksock_nal_data_t	ksocknal_data;
+
+ksock_interface_t *
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
+{
+	ksock_net_t       *net = ni->ni_data;
+	int		i;
+	ksock_interface_t *iface;
+
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		LASSERT(i < LNET_MAX_INTERFACES);
+		iface = &net->ksnn_interfaces[i];
+
+		if (iface->ksni_ipaddr == ip)
+			return (iface);
+	}
+
+	return (NULL);
+}
+
+ksock_route_t *
+ksocknal_create_route (__u32 ipaddr, int port)
+{
+	ksock_route_t *route;
+
+	LIBCFS_ALLOC (route, sizeof (*route));
+	if (route == NULL)
+		return (NULL);
+
+	atomic_set (&route->ksnr_refcount, 1);
+	route->ksnr_peer = NULL;
+	route->ksnr_retry_interval = 0;	 /* OK to connect at any time */
+	route->ksnr_ipaddr = ipaddr;
+	route->ksnr_port = port;
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+	route->ksnr_connected = 0;
+	route->ksnr_deleted = 0;
+	route->ksnr_conn_count = 0;
+	route->ksnr_share_count = 0;
+
+	return (route);
+}
+
+void
+ksocknal_destroy_route (ksock_route_t *route)
+{
+	LASSERT (atomic_read(&route->ksnr_refcount) == 0);
+
+	if (route->ksnr_peer != NULL)
+		ksocknal_peer_decref(route->ksnr_peer);
+
+	LIBCFS_FREE (route, sizeof (*route));
+}
+
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_net_t   *net = ni->ni_data;
+	ksock_peer_t  *peer;
+
+	LASSERT (id.nid != LNET_NID_ANY);
+	LASSERT (id.pid != LNET_PID_ANY);
+	LASSERT (!in_interrupt());
+
+	LIBCFS_ALLOC (peer, sizeof (*peer));
+	if (peer == NULL)
+		return -ENOMEM;
+
+	memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
+
+	peer->ksnp_ni = ni;
+	peer->ksnp_id = id;
+	atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
+	peer->ksnp_closing = 0;
+	peer->ksnp_accepting = 0;
+	peer->ksnp_proto = NULL;
+	peer->ksnp_last_alive = 0;
+	peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	INIT_LIST_HEAD (&peer->ksnp_conns);
+	INIT_LIST_HEAD (&peer->ksnp_routes);
+	INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+	INIT_LIST_HEAD (&peer->ksnp_zc_req_list);
+	spin_lock_init(&peer->ksnp_lock);
+
+	spin_lock_bh(&net->ksnn_lock);
+
+	if (net->ksnn_shutdown) {
+		spin_unlock_bh(&net->ksnn_lock);
+
+		LIBCFS_FREE(peer, sizeof(*peer));
+		CERROR("Can't create peer: network shutdown\n");
+		return -ESHUTDOWN;
+	}
+
+	net->ksnn_npeers++;
+
+	spin_unlock_bh(&net->ksnn_lock);
+
+	*peerp = peer;
+	return 0;
+}
+
+void
+ksocknal_destroy_peer (ksock_peer_t *peer)
+{
+	ksock_net_t    *net = peer->ksnp_ni->ni_data;
+
+	CDEBUG (D_NET, "peer %s %p deleted\n",
+		libcfs_id2str(peer->ksnp_id), peer);
+
+	LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+	LASSERT (peer->ksnp_accepting == 0);
+	LASSERT (list_empty (&peer->ksnp_conns));
+	LASSERT (list_empty (&peer->ksnp_routes));
+	LASSERT (list_empty (&peer->ksnp_tx_queue));
+	LASSERT (list_empty (&peer->ksnp_zc_req_list));
+
+	LIBCFS_FREE (peer, sizeof (*peer));
+
+	/* NB a peer's connections and routes keep a reference on their peer
+	 * until they are destroyed, so we can be assured that _all_ state to
+	 * do with this peer has been cleaned up when its refcount drops to
+	 * zero. */
+	spin_lock_bh(&net->ksnn_lock);
+	net->ksnn_npeers--;
+	spin_unlock_bh(&net->ksnn_lock);
+}
+
+ksock_peer_t *
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
+{
+	struct list_head       *peer_list = ksocknal_nid2peerlist(id.nid);
+	struct list_head       *tmp;
+	ksock_peer_t     *peer;
+
+	list_for_each (tmp, peer_list) {
+
+		peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+		LASSERT (!peer->ksnp_closing);
+
+		if (peer->ksnp_ni != ni)
+			continue;
+
+		if (peer->ksnp_id.nid != id.nid ||
+		    peer->ksnp_id.pid != id.pid)
+			continue;
+
+		CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+		       peer, libcfs_id2str(id),
+		       atomic_read(&peer->ksnp_refcount));
+		return (peer);
+	}
+	return (NULL);
+}
+
+ksock_peer_t *
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_peer_t     *peer;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL)			/* +1 ref for caller? */
+		ksocknal_peer_addref(peer);
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return (peer);
+}
+
+void
+ksocknal_unlink_peer_locked (ksock_peer_t *peer)
+{
+	int		i;
+	__u32	      ip;
+	ksock_interface_t *iface;
+
+	for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
+		LASSERT (i < LNET_MAX_INTERFACES);
+		ip = peer->ksnp_passive_ips[i];
+
+		iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+		/* All IPs in peer->ksnp_passive_ips[] come from the
+		 * interface list, therefore the call must succeed. */
+		LASSERT (iface != NULL);
+
+		CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n",
+		       peer, iface, iface->ksni_nroutes);
+		iface->ksni_npeers--;
+	}
+
+	LASSERT (list_empty(&peer->ksnp_conns));
+	LASSERT (list_empty(&peer->ksnp_routes));
+	LASSERT (!peer->ksnp_closing);
+	peer->ksnp_closing = 1;
+	list_del (&peer->ksnp_list);
+	/* lose peerlist's ref */
+	ksocknal_peer_decref(peer);
+}
+
+int
+ksocknal_get_peer_info (lnet_ni_t *ni, int index,
+			lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip,
+			int *port, int *conn_count, int *share_count)
+{
+	ksock_peer_t      *peer;
+	struct list_head	*ptmp;
+	ksock_route_t     *route;
+	struct list_head	*rtmp;
+	int		i;
+	int		j;
+	int		rc = -ENOENT;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+
+		list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			if (peer->ksnp_n_passive_ips == 0 &&
+			    list_empty(&peer->ksnp_routes)) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = 0;
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			for (j = 0; j < peer->ksnp_n_passive_ips; j++) {
+				if (index-- > 0)
+					continue;
+
+				*id = peer->ksnp_id;
+				*myip = peer->ksnp_passive_ips[j];
+				*peer_ip = 0;
+				*port = 0;
+				*conn_count = 0;
+				*share_count = 0;
+				rc = 0;
+				goto out;
+			}
+
+			list_for_each (rtmp, &peer->ksnp_routes) {
+				if (index-- > 0)
+					continue;
+
+				route = list_entry(rtmp, ksock_route_t,
+						       ksnr_list);
+
+				*id = peer->ksnp_id;
+				*myip = route->ksnr_myipaddr;
+				*peer_ip = route->ksnr_ipaddr;
+				*port = route->ksnr_port;
+				*conn_count = route->ksnr_conn_count;
+				*share_count = route->ksnr_share_count;
+				rc = 0;
+				goto out;
+			}
+		}
+	}
+ out:
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return (rc);
+}
+
+void
+ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
+{
+	ksock_peer_t      *peer = route->ksnr_peer;
+	int		type = conn->ksnc_type;
+	ksock_interface_t *iface;
+
+	conn->ksnc_route = route;
+	ksocknal_route_addref(route);
+
+	if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
+		if (route->ksnr_myipaddr == 0) {
+			/* route wasn't bound locally yet (the initial route) */
+			CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+		} else {
+			CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+			       "%u.%u.%u.%u to %u.%u.%u.%u\n",
+			       libcfs_id2str(peer->ksnp_id),
+			       HIPQUAD(route->ksnr_ipaddr),
+			       HIPQUAD(route->ksnr_myipaddr),
+			       HIPQUAD(conn->ksnc_myipaddr));
+
+			iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+						  route->ksnr_myipaddr);
+			if (iface != NULL)
+				iface->ksni_nroutes--;
+		}
+		route->ksnr_myipaddr = conn->ksnc_myipaddr;
+		iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+					  route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes++;
+	}
+
+	route->ksnr_connected |= (1<<type);
+	route->ksnr_conn_count++;
+
+	/* Successful connection => further attempts can
+	 * proceed immediately */
+	route->ksnr_retry_interval = 0;
+}
+
+void
+ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
+{
+	struct list_head	*tmp;
+	ksock_conn_t      *conn;
+	ksock_route_t     *route2;
+
+	LASSERT (!peer->ksnp_closing);
+	LASSERT (route->ksnr_peer == NULL);
+	LASSERT (!route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+	LASSERT (route->ksnr_connected == 0);
+
+	/* LASSERT(unique) */
+	list_for_each(tmp, &peer->ksnp_routes) {
+		route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
+			CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(route->ksnr_ipaddr));
+			LBUG();
+		}
+	}
+
+	route->ksnr_peer = peer;
+	ksocknal_peer_addref(peer);
+	/* peer's routelist takes over my ref on 'route' */
+	list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+
+	list_for_each(tmp, &peer->ksnp_conns) {
+		conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+		if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
+			continue;
+
+		ksocknal_associate_route_conn_locked(route, conn);
+		/* keep going (typed routes) */
+	}
+}
+
+void
+ksocknal_del_route_locked (ksock_route_t *route)
+{
+	ksock_peer_t      *peer = route->ksnr_peer;
+	ksock_interface_t *iface;
+	ksock_conn_t      *conn;
+	struct list_head	*ctmp;
+	struct list_head	*cnxt;
+
+	LASSERT (!route->ksnr_deleted);
+
+	/* Close associated conns */
+	list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+		conn = list_entry(ctmp, ksock_conn_t, ksnc_list);
+
+		if (conn->ksnc_route != route)
+			continue;
+
+		ksocknal_close_conn_locked (conn, 0);
+	}
+
+	if (route->ksnr_myipaddr != 0) {
+		iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+					  route->ksnr_myipaddr);
+		if (iface != NULL)
+			iface->ksni_nroutes--;
+	}
+
+	route->ksnr_deleted = 1;
+	list_del (&route->ksnr_list);
+	ksocknal_route_decref(route);	     /* drop peer's ref */
+
+	if (list_empty (&peer->ksnp_routes) &&
+	    list_empty (&peer->ksnp_conns)) {
+		/* I've just removed the last route to a peer with no active
+		 * connections */
+		ksocknal_unlink_peer_locked (peer);
+	}
+}
+
+int
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
+{
+	struct list_head	*tmp;
+	ksock_peer_t      *peer;
+	ksock_peer_t      *peer2;
+	ksock_route_t     *route;
+	ksock_route_t     *route2;
+	int		rc;
+
+	if (id.nid == LNET_NID_ANY ||
+	    id.pid == LNET_PID_ANY)
+		return (-EINVAL);
+
+	/* Have a brand new peer ready... */
+	rc = ksocknal_create_peer(&peer, ni, id);
+	if (rc != 0)
+		return rc;
+
+	route = ksocknal_create_route (ipaddr, port);
+	if (route == NULL) {
+		ksocknal_peer_decref(peer);
+		return (-ENOMEM);
+	}
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* always called with a ref on ni, so shutdown can't have started */
+	LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+	peer2 = ksocknal_find_peer_locked (ni, id);
+	if (peer2 != NULL) {
+		ksocknal_peer_decref(peer);
+		peer = peer2;
+	} else {
+		/* peer table takes my ref on peer */
+		list_add_tail (&peer->ksnp_list,
+				   ksocknal_nid2peerlist (id.nid));
+	}
+
+	route2 = NULL;
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route2 = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		if (route2->ksnr_ipaddr == ipaddr)
+			break;
+
+		route2 = NULL;
+	}
+	if (route2 == NULL) {
+		ksocknal_add_route_locked(peer, route);
+		route->ksnr_share_count++;
+	} else {
+		ksocknal_route_decref(route);
+		route2->ksnr_share_count++;
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (0);
+}
+
+void
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
+{
+	ksock_conn_t     *conn;
+	ksock_route_t    *route;
+	struct list_head       *tmp;
+	struct list_head       *nxt;
+	int	       nshared;
+
+	LASSERT (!peer->ksnp_closing);
+
+	/* Extra ref prevents peer disappearing until I'm done with it */
+	ksocknal_peer_addref(peer);
+
+	list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+		route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		/* no match */
+		if (!(ip == 0 || route->ksnr_ipaddr == ip))
+			continue;
+
+		route->ksnr_share_count = 0;
+		/* This deletes associated conns too */
+		ksocknal_del_route_locked (route);
+	}
+
+	nshared = 0;
+	list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+		route = list_entry(tmp, ksock_route_t, ksnr_list);
+		nshared += route->ksnr_share_count;
+	}
+
+	if (nshared == 0) {
+		/* remove everything else if there are no explicit entries
+		 * left */
+
+		list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
+			route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+			/* we should only be removing auto-entries */
+			LASSERT(route->ksnr_share_count == 0);
+			ksocknal_del_route_locked (route);
+		}
+
+		list_for_each_safe (tmp, nxt, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			ksocknal_close_conn_locked(conn, 0);
+		}
+	}
+
+	ksocknal_peer_decref(peer);
+	/* NB peer unlinks itself when last conn/route is removed */
+}
+
+int
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
+{
+	LIST_HEAD     (zombies);
+	struct list_head	*ptmp;
+	struct list_head	*pnxt;
+	ksock_peer_t      *peer;
+	int		lo;
+	int		hi;
+	int		i;
+	int		rc = -ENOENT;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (id.nid != LNET_NID_ANY)
+		lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+	else {
+		lo = 0;
+		hi = ksocknal_data.ksnd_peer_hash_size - 1;
+	}
+
+	for (i = lo; i <= hi; i++) {
+		list_for_each_safe (ptmp, pnxt,
+					&ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+			      (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+				continue;
+
+			ksocknal_peer_addref(peer);     /* a ref for me... */
+
+			ksocknal_del_peer_locked (peer, ip);
+
+			if (peer->ksnp_closing &&
+			    !list_empty(&peer->ksnp_tx_queue)) {
+				LASSERT (list_empty(&peer->ksnp_conns));
+				LASSERT (list_empty(&peer->ksnp_routes));
+
+				list_splice_init(&peer->ksnp_tx_queue,
+						     &zombies);
+			}
+
+			ksocknal_peer_decref(peer);     /* ...till here */
+
+			rc = 0;		 /* matched! */
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+
+	return (rc);
+}
+
+ksock_conn_t *
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+	ksock_peer_t      *peer;
+	struct list_head	*ptmp;
+	ksock_conn_t      *conn;
+	struct list_head	*ctmp;
+	int		i;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+			LASSERT (!peer->ksnp_closing);
+
+			if (peer->ksnp_ni != ni)
+				continue;
+
+			list_for_each (ctmp, &peer->ksnp_conns) {
+				if (index-- > 0)
+					continue;
+
+				conn = list_entry (ctmp, ksock_conn_t,
+						       ksnc_list);
+				ksocknal_conn_addref(conn);
+				read_unlock(&ksocknal_data. \
+						 ksnd_global_lock);
+				return (conn);
+			}
+		}
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return (NULL);
+}
+
+ksock_sched_t *
+ksocknal_choose_scheduler_locked(unsigned int cpt)
+{
+	struct ksock_sched_info	*info = ksocknal_data.ksnd_sched_info[cpt];
+	ksock_sched_t		*sched;
+	int			i;
+
+	LASSERT(info->ksi_nthreads > 0);
+
+	sched = &info->ksi_scheds[0];
+	/*
+	 * NB: it's safe so far, but info->ksi_nthreads could be changed
+	 * at runtime when we have dynamic LNet configuration, then we
+	 * need to take care of this.
+	 */
+	for (i = 1; i < info->ksi_nthreads; i++) {
+		if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns)
+			sched = &info->ksi_scheds[i];
+	}
+
+	return sched;
+}
+
+int
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
+{
+	ksock_net_t       *net = ni->ni_data;
+	int		i;
+	int		nip;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	nip = net->ksnn_ninterfaces;
+	LASSERT (nip <= LNET_MAX_INTERFACES);
+
+	/* Only offer interfaces for additional connections if I have
+	 * more than one. */
+	if (nip < 2) {
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return 0;
+	}
+
+	for (i = 0; i < nip; i++) {
+		ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
+		LASSERT (ipaddrs[i] != 0);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return (nip);
+}
+
+int
+ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips)
+{
+	int   best_netmatch = 0;
+	int   best_xor      = 0;
+	int   best	  = -1;
+	int   this_xor;
+	int   this_netmatch;
+	int   i;
+
+	for (i = 0; i < nips; i++) {
+		if (ips[i] == 0)
+			continue;
+
+		this_xor = (ips[i] ^ iface->ksni_ipaddr);
+		this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+		if (!(best < 0 ||
+		      best_netmatch < this_netmatch ||
+		      (best_netmatch == this_netmatch &&
+		       best_xor > this_xor)))
+			continue;
+
+		best = i;
+		best_netmatch = this_netmatch;
+		best_xor = this_xor;
+	}
+
+	LASSERT (best >= 0);
+	return (best);
+}
+
+int
+ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	ksock_net_t	*net = peer->ksnp_ni->ni_data;
+	ksock_interface_t  *iface;
+	ksock_interface_t  *best_iface;
+	int		 n_ips;
+	int		 i;
+	int		 j;
+	int		 k;
+	__u32	       ip;
+	__u32	       xor;
+	int		 this_netmatch;
+	int		 best_netmatch;
+	int		 best_npeers;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock at IRQ priority.  We're only
+	 * expecting to be dealing with small numbers of interfaces, so the
+	 * O(n**3)-ness shouldn't matter */
+
+	/* Also note that I'm not going to return more than n_peerips
+	 * interfaces, even if I have more myself */
+
+	write_lock_bh(global_lock);
+
+	LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+	LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+	/* Only match interfaces for additional connections
+	 * if I have > 1 interface */
+	n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+		MIN(n_peerips, net->ksnn_ninterfaces);
+
+	for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
+		/*	      ^ yes really... */
+
+		/* If we have any new interfaces, first tick off all the
+		 * peer IPs that match old interfaces, then choose new
+		 * interfaces to match the remaining peer IPS.
+		 * We don't forget interfaces we've stopped using; we might
+		 * start using them again... */
+
+		if (i < peer->ksnp_n_passive_ips) {
+			/* Old interface. */
+			ip = peer->ksnp_passive_ips[i];
+			best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
+
+			/* peer passive ips are kept up to date */
+			LASSERT(best_iface != NULL);
+		} else {
+			/* choose a new interface */
+			LASSERT (i == peer->ksnp_n_passive_ips);
+
+			best_iface = NULL;
+			best_netmatch = 0;
+			best_npeers = 0;
+
+			for (j = 0; j < net->ksnn_ninterfaces; j++) {
+				iface = &net->ksnn_interfaces[j];
+				ip = iface->ksni_ipaddr;
+
+				for (k = 0; k < peer->ksnp_n_passive_ips; k++)
+					if (peer->ksnp_passive_ips[k] == ip)
+						break;
+
+				if (k < peer->ksnp_n_passive_ips) /* using it already */
+					continue;
+
+				k = ksocknal_match_peerip(iface, peerips, n_peerips);
+				xor = (ip ^ peerips[k]);
+				this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0;
+
+				if (!(best_iface == NULL ||
+				      best_netmatch < this_netmatch ||
+				      (best_netmatch == this_netmatch &&
+				       best_npeers > iface->ksni_npeers)))
+					continue;
+
+				best_iface = iface;
+				best_netmatch = this_netmatch;
+				best_npeers = iface->ksni_npeers;
+			}
+
+			best_iface->ksni_npeers++;
+			ip = best_iface->ksni_ipaddr;
+			peer->ksnp_passive_ips[i] = ip;
+			peer->ksnp_n_passive_ips = i+1;
+		}
+
+		LASSERT (best_iface != NULL);
+
+		/* mark the best matching peer IP used */
+		j = ksocknal_match_peerip(best_iface, peerips, n_peerips);
+		peerips[j] = 0;
+	}
+
+	/* Overwrite input peer IP addresses */
+	memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
+
+	write_unlock_bh(global_lock);
+
+	return (n_ips);
+}
+
+void
+ksocknal_create_routes(ksock_peer_t *peer, int port,
+		       __u32 *peer_ipaddrs, int npeer_ipaddrs)
+{
+	ksock_route_t       *newroute = NULL;
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	lnet_ni_t	   *ni = peer->ksnp_ni;
+	ksock_net_t	 *net = ni->ni_data;
+	struct list_head	  *rtmp;
+	ksock_route_t       *route;
+	ksock_interface_t   *iface;
+	ksock_interface_t   *best_iface;
+	int		  best_netmatch;
+	int		  this_netmatch;
+	int		  best_nroutes;
+	int		  i;
+	int		  j;
+
+	/* CAVEAT EMPTOR: We do all our interface matching with an
+	 * exclusive hold of global lock at IRQ priority.  We're only
+	 * expecting to be dealing with small numbers of interfaces, so the
+	 * O(n**3)-ness here shouldn't matter */
+
+	write_lock_bh(global_lock);
+
+	if (net->ksnn_ninterfaces < 2) {
+		/* Only create additional connections
+		 * if I have > 1 interface */
+		write_unlock_bh(global_lock);
+		return;
+	}
+
+	LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
+
+	for (i = 0; i < npeer_ipaddrs; i++) {
+		if (newroute != NULL) {
+			newroute->ksnr_ipaddr = peer_ipaddrs[i];
+		} else {
+			write_unlock_bh(global_lock);
+
+			newroute = ksocknal_create_route(peer_ipaddrs[i], port);
+			if (newroute == NULL)
+				return;
+
+			write_lock_bh(global_lock);
+		}
+
+		if (peer->ksnp_closing) {
+			/* peer got closed under me */
+			break;
+		}
+
+		/* Already got a route? */
+		route = NULL;
+		list_for_each(rtmp, &peer->ksnp_routes) {
+			route = list_entry(rtmp, ksock_route_t, ksnr_list);
+
+			if (route->ksnr_ipaddr == newroute->ksnr_ipaddr)
+				break;
+
+			route = NULL;
+		}
+		if (route != NULL)
+			continue;
+
+		best_iface = NULL;
+		best_nroutes = 0;
+		best_netmatch = 0;
+
+		LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
+
+		/* Select interface to connect from */
+		for (j = 0; j < net->ksnn_ninterfaces; j++) {
+			iface = &net->ksnn_interfaces[j];
+
+			/* Using this interface already? */
+			list_for_each(rtmp, &peer->ksnp_routes) {
+				route = list_entry(rtmp, ksock_route_t,
+						       ksnr_list);
+
+				if (route->ksnr_myipaddr == iface->ksni_ipaddr)
+					break;
+
+				route = NULL;
+			}
+			if (route != NULL)
+				continue;
+
+			this_netmatch = (((iface->ksni_ipaddr ^
+					   newroute->ksnr_ipaddr) &
+					   iface->ksni_netmask) == 0) ? 1 : 0;
+
+			if (!(best_iface == NULL ||
+			      best_netmatch < this_netmatch ||
+			      (best_netmatch == this_netmatch &&
+			       best_nroutes > iface->ksni_nroutes)))
+				continue;
+
+			best_iface = iface;
+			best_netmatch = this_netmatch;
+			best_nroutes = iface->ksni_nroutes;
+		}
+
+		if (best_iface == NULL)
+			continue;
+
+		newroute->ksnr_myipaddr = best_iface->ksni_ipaddr;
+		best_iface->ksni_nroutes++;
+
+		ksocknal_add_route_locked(peer, newroute);
+		newroute = NULL;
+	}
+
+	write_unlock_bh(global_lock);
+	if (newroute != NULL)
+		ksocknal_route_decref(newroute);
+}
+
+int
+ksocknal_accept (lnet_ni_t *ni, socket_t *sock)
+{
+	ksock_connreq_t    *cr;
+	int		 rc;
+	__u32	       peer_ip;
+	int		 peer_port;
+
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT (rc == 0);		      /* we succeeded before */
+
+	LIBCFS_ALLOC(cr, sizeof(*cr));
+	if (cr == NULL) {
+		LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from "
+				   "%u.%u.%u.%u: memory exhausted\n",
+				   HIPQUAD(peer_ip));
+		return -ENOMEM;
+	}
+
+	lnet_ni_addref(ni);
+	cr->ksncr_ni   = ni;
+	cr->ksncr_sock = sock;
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+	wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+	return 0;
+}
+
+int
+ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr)
+{
+	ksock_route_t   *route;
+
+	list_for_each_entry (route, &peer->ksnp_routes, ksnr_list) {
+
+		if (route->ksnr_ipaddr == ipaddr)
+			return route->ksnr_connecting;
+	}
+	return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+		      socket_t *sock, int type)
+{
+	rwlock_t		*global_lock = &ksocknal_data.ksnd_global_lock;
+	LIST_HEAD     (zombies);
+	lnet_process_id_t  peerid;
+	struct list_head	*tmp;
+	__u64	      incarnation;
+	ksock_conn_t      *conn;
+	ksock_conn_t      *conn2;
+	ksock_peer_t      *peer = NULL;
+	ksock_peer_t      *peer2;
+	ksock_sched_t     *sched;
+	ksock_hello_msg_t *hello;
+	int		   cpt;
+	ksock_tx_t	*tx;
+	ksock_tx_t	*txtmp;
+	int		rc;
+	int		active;
+	char	      *warn = NULL;
+
+	active = (route != NULL);
+
+	LASSERT (active == (type != SOCKLND_CONN_NONE));
+
+	LIBCFS_ALLOC(conn, sizeof(*conn));
+	if (conn == NULL) {
+		rc = -ENOMEM;
+		goto failed_0;
+	}
+
+	memset (conn, 0, sizeof (*conn));
+
+	conn->ksnc_peer = NULL;
+	conn->ksnc_route = NULL;
+	conn->ksnc_sock = sock;
+	/* 2 ref, 1 for conn, another extra ref prevents socket
+	 * being closed before establishment of connection */
+	atomic_set (&conn->ksnc_sock_refcount, 2);
+	conn->ksnc_type = type;
+	ksocknal_lib_save_callback(sock, conn);
+	atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+	conn->ksnc_rx_ready = 0;
+	conn->ksnc_rx_scheduled = 0;
+
+	INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+	conn->ksnc_tx_ready = 0;
+	conn->ksnc_tx_scheduled = 0;
+	conn->ksnc_tx_carrier = NULL;
+	atomic_set (&conn->ksnc_tx_nob, 0);
+
+	LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+				     kshm_ips[LNET_MAX_INTERFACES]));
+	if (hello == NULL) {
+		rc = -ENOMEM;
+		goto failed_1;
+	}
+
+	/* stash conn's local and remote addrs */
+	rc = ksocknal_lib_get_conn_addrs (conn);
+	if (rc != 0)
+		goto failed_1;
+
+	/* Find out/confirm peer's NID and connection type and get the
+	 * vector of interfaces she's willing to let me connect to.
+	 * Passive connections use the listener timeout since the peer sends
+	 * eagerly */
+
+	if (active) {
+		peer = route->ksnr_peer;
+		LASSERT(ni == peer->ksnp_ni);
+
+		/* Active connection sends HELLO eagerly */
+		hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+		peerid = peer->ksnp_id;
+
+		write_lock_bh(global_lock);
+		conn->ksnc_proto = peer->ksnp_proto;
+		write_unlock_bh(global_lock);
+
+		if (conn->ksnc_proto == NULL) {
+			 conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			 if (*ksocknal_tunables.ksnd_protocol == 2)
+				 conn->ksnc_proto = &ksocknal_protocol_v2x;
+			 else if (*ksocknal_tunables.ksnd_protocol == 1)
+				 conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+		}
+
+		rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
+		if (rc != 0)
+			goto failed_1;
+	} else {
+		peerid.nid = LNET_NID_ANY;
+		peerid.pid = LNET_PID_ANY;
+
+		/* Passive, get protocol from peer */
+		conn->ksnc_proto = NULL;
+	}
+
+	rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+	if (rc < 0)
+		goto failed_1;
+
+	LASSERT (rc == 0 || active);
+	LASSERT (conn->ksnc_proto != NULL);
+	LASSERT (peerid.nid != LNET_NID_ANY);
+
+	cpt = lnet_cpt_of_nid(peerid.nid);
+
+	if (active) {
+		ksocknal_peer_addref(peer);
+		write_lock_bh(global_lock);
+	} else {
+		rc = ksocknal_create_peer(&peer, ni, peerid);
+		if (rc != 0)
+			goto failed_1;
+
+		write_lock_bh(global_lock);
+
+		/* called with a ref on ni, so shutdown can't have started */
+		LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
+		peer2 = ksocknal_find_peer_locked(ni, peerid);
+		if (peer2 == NULL) {
+			/* NB this puts an "empty" peer in the peer
+			 * table (which takes my ref) */
+			list_add_tail(&peer->ksnp_list,
+					  ksocknal_nid2peerlist(peerid.nid));
+		} else {
+			ksocknal_peer_decref(peer);
+			peer = peer2;
+		}
+
+		/* +1 ref for me */
+		ksocknal_peer_addref(peer);
+		peer->ksnp_accepting++;
+
+		/* Am I already connecting to this guy?  Resolve in
+		 * favour of higher NID... */
+		if (peerid.nid < ni->ni_nid &&
+		    ksocknal_connecting(peer, conn->ksnc_ipaddr)) {
+			rc = EALREADY;
+			warn = "connection race resolution";
+			goto failed_2;
+		}
+	}
+
+	if (peer->ksnp_closing ||
+	    (active && route->ksnr_deleted)) {
+		/* peer/route got closed under me */
+		rc = -ESTALE;
+		warn = "peer/route removed";
+		goto failed_2;
+	}
+
+	if (peer->ksnp_proto == NULL) {
+		/* Never connected before.
+		 * NB recv_hello may have returned EPROTO to signal my peer
+		 * wants a different protocol than the one I asked for.
+		 */
+		LASSERT (list_empty(&peer->ksnp_conns));
+
+		peer->ksnp_proto = conn->ksnc_proto;
+		peer->ksnp_incarnation = incarnation;
+	}
+
+	if (peer->ksnp_proto != conn->ksnc_proto ||
+	    peer->ksnp_incarnation != incarnation) {
+		/* Peer rebooted or I've got the wrong protocol version */
+		ksocknal_close_peer_conns_locked(peer, 0, 0);
+
+		peer->ksnp_proto = NULL;
+		rc = ESTALE;
+		warn = peer->ksnp_incarnation != incarnation ?
+		       "peer rebooted" :
+		       "wrong proto version";
+		goto failed_2;
+	}
+
+	switch (rc) {
+	default:
+		LBUG();
+	case 0:
+		break;
+	case EALREADY:
+		warn = "lost conn race";
+		goto failed_2;
+	case EPROTO:
+		warn = "retry with different protocol version";
+		goto failed_2;
+	}
+
+	/* Refuse to duplicate an existing connection, unless this is a
+	 * loopback connection */
+	if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr ||
+			    conn2->ksnc_myipaddr != conn->ksnc_myipaddr ||
+			    conn2->ksnc_type != conn->ksnc_type)
+				continue;
+
+			/* Reply on a passive connection attempt so the peer
+			 * realises we're connected. */
+			LASSERT (rc == 0);
+			if (!active)
+				rc = EALREADY;
+
+			warn = "duplicate";
+			goto failed_2;
+		}
+	}
+
+	/* If the connection created by this route didn't bind to the IP
+	 * address the route connected to, the connection/route matching
+	 * code below probably isn't going to work. */
+	if (active &&
+	    route->ksnr_ipaddr != conn->ksnc_ipaddr) {
+		CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+		       libcfs_id2str(peer->ksnp_id),
+		       HIPQUAD(route->ksnr_ipaddr),
+		       HIPQUAD(conn->ksnc_ipaddr));
+	}
+
+	/* Search for a route corresponding to the new connection and
+	 * create an association.  This allows incoming connections created
+	 * by routes in my peer to match my own route entries so I don't
+	 * continually create duplicate routes. */
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry(tmp, ksock_route_t, ksnr_list);
+
+		if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+			continue;
+
+		ksocknal_associate_route_conn_locked(route, conn);
+		break;
+	}
+
+	conn->ksnc_peer = peer;		 /* conn takes my ref on peer */
+	peer->ksnp_last_alive = cfs_time_current();
+	peer->ksnp_send_keepalive = 0;
+	peer->ksnp_error = 0;
+
+	sched = ksocknal_choose_scheduler_locked(cpt);
+	sched->kss_nconns++;
+	conn->ksnc_scheduler = sched;
+
+	conn->ksnc_tx_last_post = cfs_time_current();
+	/* Set the deadline for the outgoing HELLO to drain */
+	conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock);
+	conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();   /* order with adding to peer's conn list */
+
+	list_add (&conn->ksnc_list, &peer->ksnp_conns);
+	ksocknal_conn_addref(conn);
+
+	ksocknal_new_packet(conn, 0);
+
+	conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn);
+
+	/* Take packets blocking for this connection. */
+	list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) {
+		if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO)
+				continue;
+
+		list_del (&tx->tx_list);
+		ksocknal_queue_tx_locked (tx, conn);
+	}
+
+	write_unlock_bh(global_lock);
+
+	/* We've now got a new connection.  Any errors from here on are just
+	 * like "normal" comms errors and we close the connection normally.
+	 * NB (a) we still have to send the reply HELLO for passive
+	 *	connections,
+	 *    (b) normal I/O on the conn is blocked until I setup and call the
+	 *	socket callbacks.
+	 */
+
+	CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+	       " incarnation:"LPD64" sched[%d:%d]\n",
+	       libcfs_id2str(peerid), conn->ksnc_proto->pro_version,
+	       HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr),
+	       conn->ksnc_port, incarnation, cpt,
+	       (int)(sched - &sched->kss_info->ksi_scheds[0]));
+
+	if (active) {
+		/* additional routes after interface exchange? */
+		ksocknal_create_routes(peer, conn->ksnc_port,
+				       hello->kshm_ips, hello->kshm_nips);
+	} else {
+		hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+						       hello->kshm_nips);
+		rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+	}
+
+	LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+				    kshm_ips[LNET_MAX_INTERFACES]));
+
+	/* setup the socket AFTER I've received hello (it disables
+	 * SO_LINGER).  I might call back to the acceptor who may want
+	 * to send a protocol version response and then close the
+	 * socket; this ensures the socket only tears down after the
+	 * response has been sent. */
+	if (rc == 0)
+		rc = ksocknal_lib_setup_sock(sock);
+
+	write_lock_bh(global_lock);
+
+	/* NB my callbacks block while I hold ksnd_global_lock */
+	ksocknal_lib_set_callback(sock, conn);
+
+	if (!active)
+		peer->ksnp_accepting--;
+
+	write_unlock_bh(global_lock);
+
+	if (rc != 0) {
+		write_lock_bh(global_lock);
+		if (!conn->ksnc_closing) {
+			/* could be closed by another thread */
+			ksocknal_close_conn_locked(conn, rc);
+		}
+		write_unlock_bh(global_lock);
+	} else if (ksocknal_connsock_addref(conn) == 0) {
+		/* Allow I/O to proceed. */
+		ksocknal_read_callback(conn);
+		ksocknal_write_callback(conn);
+		ksocknal_connsock_decref(conn);
+	}
+
+	ksocknal_connsock_decref(conn);
+	ksocknal_conn_decref(conn);
+	return rc;
+
+ failed_2:
+	if (!peer->ksnp_closing &&
+	    list_empty (&peer->ksnp_conns) &&
+	    list_empty (&peer->ksnp_routes)) {
+		list_add(&zombies, &peer->ksnp_tx_queue);
+		list_del_init(&peer->ksnp_tx_queue);
+		ksocknal_unlink_peer_locked(peer);
+	}
+
+	write_unlock_bh(global_lock);
+
+	if (warn != NULL) {
+		if (rc < 0)
+			CERROR("Not creating conn %s type %d: %s\n",
+			       libcfs_id2str(peerid), conn->ksnc_type, warn);
+		else
+			CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+			      libcfs_id2str(peerid), conn->ksnc_type, warn);
+	}
+
+	if (!active) {
+		if (rc > 0) {
+			/* Request retry by replying with CONN_NONE
+			 * ksnc_proto has been set already */
+			conn->ksnc_type = SOCKLND_CONN_NONE;
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, peerid.nid, hello);
+		}
+
+		write_lock_bh(global_lock);
+		peer->ksnp_accepting--;
+		write_unlock_bh(global_lock);
+	}
+
+	ksocknal_txlist_done(ni, &zombies, 1);
+	ksocknal_peer_decref(peer);
+
+ failed_1:
+	if (hello != NULL)
+		LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+					    kshm_ips[LNET_MAX_INTERFACES]));
+
+	LIBCFS_FREE (conn, sizeof(*conn));
+
+ failed_0:
+	libcfs_sock_release(sock);
+	return rc;
+}
+
+void
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
+{
+	/* This just does the immmediate housekeeping, and queues the
+	 * connection for the reaper to terminate.
+	 * Caller holds ksnd_global_lock exclusively in irq context */
+	ksock_peer_t      *peer = conn->ksnc_peer;
+	ksock_route_t     *route;
+	ksock_conn_t      *conn2;
+	struct list_head	*tmp;
+
+	LASSERT (peer->ksnp_error == 0);
+	LASSERT (!conn->ksnc_closing);
+	conn->ksnc_closing = 1;
+
+	/* ksnd_deathrow_conns takes over peer's ref */
+	list_del (&conn->ksnc_list);
+
+	route = conn->ksnc_route;
+	if (route != NULL) {
+		/* dissociate conn from route... */
+		LASSERT (!route->ksnr_deleted);
+		LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0);
+
+		conn2 = NULL;
+		list_for_each(tmp, &peer->ksnp_conns) {
+			conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+			if (conn2->ksnc_route == route &&
+			    conn2->ksnc_type == conn->ksnc_type)
+				break;
+
+			conn2 = NULL;
+		}
+		if (conn2 == NULL)
+			route->ksnr_connected &= ~(1 << conn->ksnc_type);
+
+		conn->ksnc_route = NULL;
+
+#if 0	   /* irrelevent with only eager routes */
+		/* make route least favourite */
+		list_del (&route->ksnr_list);
+		list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
+#endif
+		ksocknal_route_decref(route);     /* drop conn's ref on route */
+	}
+
+	if (list_empty (&peer->ksnp_conns)) {
+		/* No more connections to this peer */
+
+		if (!list_empty(&peer->ksnp_tx_queue)) {
+			ksock_tx_t *tx;
+
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+
+			/* throw them to the last connection...,
+			 * these TXs will be send to /dev/null by scheduler */
+			list_for_each_entry(tx, &peer->ksnp_tx_queue,
+						tx_list)
+				ksocknal_tx_prep(conn, tx);
+
+			spin_lock_bh(&conn->ksnc_scheduler->kss_lock);
+			list_splice_init(&peer->ksnp_tx_queue,
+					     &conn->ksnc_tx_queue);
+			spin_unlock_bh(&conn->ksnc_scheduler->kss_lock);
+		}
+
+		peer->ksnp_proto = NULL;	/* renegotiate protocol version */
+		peer->ksnp_error = error;       /* stash last conn close reason */
+
+		if (list_empty (&peer->ksnp_routes)) {
+			/* I've just closed last conn belonging to a
+			 * peer with no routes to it */
+			ksocknal_unlink_peer_locked (peer);
+		}
+	}
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list,
+			  &ksocknal_data.ksnd_deathrow_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+	int	notify = 0;
+	cfs_time_t last_alive = 0;
+
+	/* There has been a connection failure or comms error; but I'll only
+	 * tell LNET I think the peer is dead if it's to another kernel and
+	 * there are no connections or connection attempts in existance. */
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+	    list_empty(&peer->ksnp_conns) &&
+	    peer->ksnp_accepting == 0 &&
+	    ksocknal_find_connecting_route_locked(peer) == NULL) {
+		notify = 1;
+		last_alive = peer->ksnp_last_alive;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	if (notify)
+		lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+			     last_alive);
+}
+
+void
+ksocknal_finalize_zcreq(ksock_conn_t *conn)
+{
+	ksock_peer_t     *peer = conn->ksnc_peer;
+	ksock_tx_t       *tx;
+	ksock_tx_t       *tmp;
+	LIST_HEAD    (zlist);
+
+	/* NB safe to finalize TXs because closing of socket will
+	 * abort all buffered data */
+	LASSERT (conn->ksnc_sock == NULL);
+
+	spin_lock(&peer->ksnp_lock);
+
+	list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) {
+		if (tx->tx_conn != conn)
+			continue;
+
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0);
+
+		tx->tx_msg.ksm_zc_cookies[0] = 0;
+		tx->tx_zc_aborted = 1; /* mark it as not-acked */
+		list_del(&tx->tx_zc_list);
+		list_add(&tx->tx_zc_list, &zlist);
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	while (!list_empty(&zlist)) {
+		tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+
+		list_del(&tx->tx_zc_list);
+		ksocknal_tx_decref(tx);
+	}
+}
+
+void
+ksocknal_terminate_conn (ksock_conn_t *conn)
+{
+	/* This gets called by the reaper (guaranteed thread context) to
+	 * disengage the socket from its callbacks and close it.
+	 * ksnc_refcount will eventually hit zero, and then the reaper will
+	 * destroy it. */
+	ksock_peer_t     *peer = conn->ksnc_peer;
+	ksock_sched_t    *sched = conn->ksnc_scheduler;
+	int	       failed = 0;
+
+	LASSERT(conn->ksnc_closing);
+
+	/* wake up the scheduler to "send" all remaining packets to /dev/null */
+	spin_lock_bh(&sched->kss_lock);
+
+	/* a closing conn is always ready to tx */
+	conn->ksnc_tx_ready = 1;
+
+	if (!conn->ksnc_tx_scheduled &&
+	    !list_empty(&conn->ksnc_tx_queue)){
+		list_add_tail (&conn->ksnc_tx_list,
+			       &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		/* extra ref for scheduler */
+		ksocknal_conn_addref(conn);
+
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	/* serialise with callbacks */
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
+
+	/* OK, so this conn may not be completely disengaged from its
+	 * scheduler yet, but it _has_ committed to terminate... */
+	conn->ksnc_scheduler->kss_nconns--;
+
+	if (peer->ksnp_error != 0) {
+		/* peer's last conn closed in error */
+		LASSERT (list_empty (&peer->ksnp_conns));
+		failed = 1;
+		peer->ksnp_error = 0;     /* avoid multiple notifications */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (failed)
+		ksocknal_peer_failed(peer);
+
+	/* The socket is closed on the final put; either here, or in
+	 * ksocknal_{send,recv}msg().  Since we set up the linger2 option
+	 * when the connection was established, this will close the socket
+	 * immediately, aborting anything buffered in it. Any hung
+	 * zero-copy transmits will therefore complete in finite time. */
+	ksocknal_connsock_decref(conn);
+}
+
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+	/* Queue the conn for the reaper to destroy */
+
+	LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0);
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+	wake_up(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_destroy_conn (ksock_conn_t *conn)
+{
+	cfs_time_t      last_rcv;
+
+	/* Final coup-de-grace of the reaper */
+	CDEBUG (D_NET, "connection %p\n", conn);
+
+	LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+	LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+	LASSERT (conn->ksnc_sock == NULL);
+	LASSERT (conn->ksnc_route == NULL);
+	LASSERT (!conn->ksnc_tx_scheduled);
+	LASSERT (!conn->ksnc_rx_scheduled);
+	LASSERT (list_empty(&conn->ksnc_tx_queue));
+
+	/* complete current receive if any */
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_LNET_PAYLOAD:
+		last_rcv = conn->ksnc_rx_deadline -
+			   cfs_time_seconds(*ksocknal_tunables.ksnd_timeout);
+		CERROR("Completing partial receive from %s[%d]"
+		       ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, "
+		       "last alive is %ld secs ago\n",
+		       libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type,
+		       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+		       conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left,
+		       cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+					last_rcv)));
+		lnet_finalize (conn->ksnc_peer->ksnp_ni,
+			       conn->ksnc_cookie, -EIO);
+		break;
+	case SOCKNAL_RX_LNET_HEADER:
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of lnet header from %s"
+			       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+			       conn->ksnc_proto->pro_version);
+		break;
+	case SOCKNAL_RX_KSM_HEADER:
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of ksock message from %s"
+			       ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+			       conn->ksnc_proto->pro_version);
+		break;
+	case SOCKNAL_RX_SLOP:
+		if (conn->ksnc_rx_started)
+			CERROR("Incomplete receive of slops from %s"
+			       ", ip %d.%d.%d.%d:%d, with error\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	       break;
+	default:
+		LBUG ();
+		break;
+	}
+
+	ksocknal_peer_decref(conn->ksnc_peer);
+
+	LIBCFS_FREE (conn, sizeof (*conn));
+}
+
+int
+ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why)
+{
+	ksock_conn_t       *conn;
+	struct list_head	 *ctmp;
+	struct list_head	 *cnxt;
+	int		 count = 0;
+
+	list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) {
+		conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+		if (ipaddr == 0 ||
+		    conn->ksnc_ipaddr == ipaddr) {
+			count++;
+			ksocknal_close_conn_locked (conn, why);
+		}
+	}
+
+	return (count);
+}
+
+int
+ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
+{
+	ksock_peer_t     *peer = conn->ksnc_peer;
+	__u32	     ipaddr = conn->ksnc_ipaddr;
+	int	       count;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (count);
+}
+
+int
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
+{
+	ksock_peer_t       *peer;
+	struct list_head	 *ptmp;
+	struct list_head	 *pnxt;
+	int		 lo;
+	int		 hi;
+	int		 i;
+	int		 count = 0;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	if (id.nid != LNET_NID_ANY)
+		lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers);
+	else {
+		lo = 0;
+		hi = ksocknal_data.ksnd_peer_hash_size - 1;
+	}
+
+	for (i = lo; i <= hi; i++) {
+		list_for_each_safe (ptmp, pnxt,
+					&ksocknal_data.ksnd_peers[i]) {
+
+			peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
+
+			if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+			      (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
+				continue;
+
+			count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	/* wildcards always succeed */
+	if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
+		return (0);
+
+	return (count == 0 ? -ENOENT : 0);
+}
+
+void
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
+{
+	/* The router is telling me she's been notified of a change in
+	 * gateway state.... */
+	lnet_process_id_t  id = {0};
+
+	id.nid = gw_nid;
+	id.pid = LNET_PID_ANY;
+
+	CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid),
+		alive ? "up" : "down");
+
+	if (!alive) {
+		/* If the gateway crashed, close all open connections... */
+		ksocknal_close_matching_conns (id, 0);
+		return;
+	}
+
+	/* ...otherwise do nothing.  We can only establish new connections
+	 * if we have autroutes, and these connect on demand. */
+}
+
+void
+ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
+{
+	int		connect = 1;
+	cfs_time_t	 last_alive = 0;
+	cfs_time_t	 now = cfs_time_current();
+	ksock_peer_t      *peer = NULL;
+	rwlock_t		*glock = &ksocknal_data.ksnd_global_lock;
+	lnet_process_id_t  id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID};
+
+	read_lock(glock);
+
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL) {
+		struct list_head       *tmp;
+		ksock_conn_t     *conn;
+		int	       bufnob;
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+
+			if (bufnob < conn->ksnc_tx_bufnob) {
+				/* something got ACKed */
+				conn->ksnc_tx_deadline =
+					cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+				peer->ksnp_last_alive = now;
+				conn->ksnc_tx_bufnob = bufnob;
+			}
+		}
+
+		last_alive = peer->ksnp_last_alive;
+		if (ksocknal_find_connectable_route_locked(peer) == NULL)
+			connect = 0;
+	}
+
+	read_unlock(glock);
+
+	if (last_alive != 0)
+		*when = last_alive;
+
+	CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n",
+	       libcfs_nid2str(nid), peer,
+	       last_alive ? cfs_duration_sec(now - last_alive) : -1,
+	       connect);
+
+	if (!connect)
+		return;
+
+	ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port());
+
+	write_lock_bh(glock);
+
+	peer = ksocknal_find_peer_locked(ni, id);
+	if (peer != NULL)
+		ksocknal_launch_all_connections_locked(peer);
+
+	write_unlock_bh(glock);
+	return;
+}
+
+void
+ksocknal_push_peer (ksock_peer_t *peer)
+{
+	int	       index;
+	int	       i;
+	struct list_head       *tmp;
+	ksock_conn_t     *conn;
+
+	for (index = 0; ; index++) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		i = 0;
+		conn = NULL;
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			if (i++ == index) {
+				conn = list_entry (tmp, ksock_conn_t,
+						       ksnc_list);
+				ksocknal_conn_addref(conn);
+				break;
+			}
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		if (conn == NULL)
+			break;
+
+		ksocknal_lib_push_conn (conn);
+		ksocknal_conn_decref(conn);
+	}
+}
+
+int
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
+{
+	ksock_peer_t      *peer;
+	struct list_head	*tmp;
+	int		index;
+	int		i;
+	int		j;
+	int		rc = -ENOENT;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		for (j = 0; ; j++) {
+			read_lock(&ksocknal_data.ksnd_global_lock);
+
+			index = 0;
+			peer = NULL;
+
+			list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+				peer = list_entry(tmp, ksock_peer_t,
+						      ksnp_list);
+
+				if (!((id.nid == LNET_NID_ANY ||
+				       id.nid == peer->ksnp_id.nid) &&
+				      (id.pid == LNET_PID_ANY ||
+				       id.pid == peer->ksnp_id.pid))) {
+					peer = NULL;
+					continue;
+				}
+
+				if (index++ == j) {
+					ksocknal_peer_addref(peer);
+					break;
+				}
+			}
+
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			if (peer != NULL) {
+				rc = 0;
+				ksocknal_push_peer (peer);
+				ksocknal_peer_decref(peer);
+			}
+		}
+
+	}
+
+	return (rc);
+}
+
+int
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
+{
+	ksock_net_t       *net = ni->ni_data;
+	ksock_interface_t *iface;
+	int		rc;
+	int		i;
+	int		j;
+	struct list_head	*ptmp;
+	ksock_peer_t      *peer;
+	struct list_head	*rtmp;
+	ksock_route_t     *route;
+
+	if (ipaddress == 0 ||
+	    netmask == 0)
+		return (-EINVAL);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	iface = ksocknal_ip2iface(ni, ipaddress);
+	if (iface != NULL) {
+		/* silently ignore dups */
+		rc = 0;
+	} else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
+		rc = -ENOSPC;
+	} else {
+		iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
+
+		iface->ksni_ipaddr = ipaddress;
+		iface->ksni_netmask = netmask;
+		iface->ksni_nroutes = 0;
+		iface->ksni_npeers = 0;
+
+		for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+			list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) {
+				peer = list_entry(ptmp, ksock_peer_t,
+						      ksnp_list);
+
+				for (j = 0; j < peer->ksnp_n_passive_ips; j++)
+					if (peer->ksnp_passive_ips[j] == ipaddress)
+						iface->ksni_npeers++;
+
+				list_for_each(rtmp, &peer->ksnp_routes) {
+					route = list_entry(rtmp,
+							       ksock_route_t,
+							       ksnr_list);
+
+					if (route->ksnr_myipaddr == ipaddress)
+						iface->ksni_nroutes++;
+				}
+			}
+		}
+
+		rc = 0;
+		/* NB only new connections will pay attention to the new interface! */
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (rc);
+}
+
+void
+ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
+{
+	struct list_head	 *tmp;
+	struct list_head	 *nxt;
+	ksock_route_t      *route;
+	ksock_conn_t       *conn;
+	int		 i;
+	int		 j;
+
+	for (i = 0; i < peer->ksnp_n_passive_ips; i++)
+		if (peer->ksnp_passive_ips[i] == ipaddr) {
+			for (j = i+1; j < peer->ksnp_n_passive_ips; j++)
+				peer->ksnp_passive_ips[j-1] =
+					peer->ksnp_passive_ips[j];
+			peer->ksnp_n_passive_ips--;
+			break;
+		}
+
+	list_for_each_safe(tmp, nxt, &peer->ksnp_routes) {
+		route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+		if (route->ksnr_myipaddr != ipaddr)
+			continue;
+
+		if (route->ksnr_share_count != 0) {
+			/* Manually created; keep, but unbind */
+			route->ksnr_myipaddr = 0;
+		} else {
+			ksocknal_del_route_locked(route);
+		}
+	}
+
+	list_for_each_safe(tmp, nxt, &peer->ksnp_conns) {
+		conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+		if (conn->ksnc_myipaddr == ipaddr)
+			ksocknal_close_conn_locked (conn, 0);
+	}
+}
+
+int
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
+{
+	ksock_net_t       *net = ni->ni_data;
+	int		rc = -ENOENT;
+	struct list_head	*tmp;
+	struct list_head	*nxt;
+	ksock_peer_t      *peer;
+	__u32	      this_ip;
+	int		i;
+	int		j;
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
+
+		if (!(ipaddress == 0 ||
+		      ipaddress == this_ip))
+			continue;
+
+		rc = 0;
+
+		for (j = i+1; j < net->ksnn_ninterfaces; j++)
+			net->ksnn_interfaces[j-1] =
+				net->ksnn_interfaces[j];
+
+		net->ksnn_ninterfaces--;
+
+		for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
+			list_for_each_safe(tmp, nxt,
+					       &ksocknal_data.ksnd_peers[j]) {
+				peer = list_entry(tmp, ksock_peer_t,
+						      ksnp_list);
+
+				if (peer->ksnp_ni != ni)
+					continue;
+
+				ksocknal_peer_del_interface_locked(peer, this_ip);
+			}
+		}
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	return (rc);
+}
+
+int
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+	lnet_process_id_t id = {0};
+	struct libcfs_ioctl_data *data = arg;
+	int rc;
+
+	switch(cmd) {
+	case IOC_LIBCFS_GET_INTERFACE: {
+		ksock_net_t       *net = ni->ni_data;
+		ksock_interface_t *iface;
+
+		read_lock(&ksocknal_data.ksnd_global_lock);
+
+		if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) {
+			rc = -ENOENT;
+		} else {
+			rc = 0;
+			iface = &net->ksnn_interfaces[data->ioc_count];
+
+			data->ioc_u32[0] = iface->ksni_ipaddr;
+			data->ioc_u32[1] = iface->ksni_netmask;
+			data->ioc_u32[2] = iface->ksni_npeers;
+			data->ioc_u32[3] = iface->ksni_nroutes;
+		}
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return rc;
+	}
+
+	case IOC_LIBCFS_ADD_INTERFACE:
+		return ksocknal_add_interface(ni,
+					      data->ioc_u32[0], /* IP address */
+					      data->ioc_u32[1]); /* net mask */
+
+	case IOC_LIBCFS_DEL_INTERFACE:
+		return ksocknal_del_interface(ni,
+					      data->ioc_u32[0]); /* IP address */
+
+	case IOC_LIBCFS_GET_PEER: {
+		__u32	    myip = 0;
+		__u32	    ip = 0;
+		int	      port = 0;
+		int	      conn_count = 0;
+		int	      share_count = 0;
+
+		rc = ksocknal_get_peer_info(ni, data->ioc_count,
+					    &id, &myip, &ip, &port,
+					    &conn_count,  &share_count);
+		if (rc != 0)
+			return rc;
+
+		data->ioc_nid    = id.nid;
+		data->ioc_count  = share_count;
+		data->ioc_u32[0] = ip;
+		data->ioc_u32[1] = port;
+		data->ioc_u32[2] = myip;
+		data->ioc_u32[3] = conn_count;
+		data->ioc_u32[4] = id.pid;
+		return 0;
+	}
+
+	case IOC_LIBCFS_ADD_PEER:
+		id.nid = data->ioc_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		return ksocknal_add_peer (ni, id,
+					  data->ioc_u32[0], /* IP */
+					  data->ioc_u32[1]); /* port */
+
+	case IOC_LIBCFS_DEL_PEER:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_del_peer (ni, id,
+					  data->ioc_u32[0]); /* IP */
+
+	case IOC_LIBCFS_GET_CONN: {
+		int	   txmem;
+		int	   rxmem;
+		int	   nagle;
+		ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
+
+		if (conn == NULL)
+			return -ENOENT;
+
+		ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+		data->ioc_count  = txmem;
+		data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+		data->ioc_flags  = nagle;
+		data->ioc_u32[0] = conn->ksnc_ipaddr;
+		data->ioc_u32[1] = conn->ksnc_port;
+		data->ioc_u32[2] = conn->ksnc_myipaddr;
+		data->ioc_u32[3] = conn->ksnc_type;
+		data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt;
+		data->ioc_u32[5] = rxmem;
+		data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+		ksocknal_conn_decref(conn);
+		return 0;
+	}
+
+	case IOC_LIBCFS_CLOSE_CONNECTION:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_close_matching_conns (id,
+						      data->ioc_u32[0]);
+
+	case IOC_LIBCFS_REGISTER_MYNID:
+		/* Ignore if this is a noop */
+		if (data->ioc_nid == ni->ni_nid)
+			return 0;
+
+		CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+		       libcfs_nid2str(data->ioc_nid),
+		       libcfs_nid2str(ni->ni_nid));
+		return -EINVAL;
+
+	case IOC_LIBCFS_PUSH_CONNECTION:
+		id.nid = data->ioc_nid;
+		id.pid = LNET_PID_ANY;
+		return ksocknal_push(ni, id);
+
+	default:
+		return -EINVAL;
+	}
+	/* not reached */
+}
+
+void
+ksocknal_free_buffers (void)
+{
+	LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
+
+	if (ksocknal_data.ksnd_sched_info != NULL) {
+		struct ksock_sched_info	*info;
+		int			i;
+
+		cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+			if (info->ksi_scheds != NULL) {
+				LIBCFS_FREE(info->ksi_scheds,
+					    info->ksi_nthreads_max *
+					    sizeof(info->ksi_scheds[0]));
+			}
+		}
+		cfs_percpt_free(ksocknal_data.ksnd_sched_info);
+	}
+
+	LIBCFS_FREE (ksocknal_data.ksnd_peers,
+		     sizeof (struct list_head) *
+		     ksocknal_data.ksnd_peer_hash_size);
+
+	spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+	if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+		struct list_head	zlist;
+		ksock_tx_t	*tx;
+
+		list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+		list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+		while (!list_empty(&zlist)) {
+			tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+			list_del(&tx->tx_list);
+			LIBCFS_FREE(tx, tx->tx_desc_size);
+		}
+	} else {
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	}
+}
+
+void
+ksocknal_base_shutdown(void)
+{
+	struct ksock_sched_info *info;
+	ksock_sched_t		*sched;
+	int			i;
+	int			j;
+
+	CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	switch (ksocknal_data.ksnd_init) {
+	default:
+		LASSERT (0);
+
+	case SOCKNAL_INIT_ALL:
+	case SOCKNAL_INIT_DATA:
+		LASSERT (ksocknal_data.ksnd_peers != NULL);
+		for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+			LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
+		}
+
+		LASSERT(list_empty(&ksocknal_data.ksnd_nets));
+		LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
+		LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
+		LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+		LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
+
+		if (ksocknal_data.ksnd_sched_info != NULL) {
+			cfs_percpt_for_each(info, i,
+					    ksocknal_data.ksnd_sched_info) {
+				if (info->ksi_scheds == NULL)
+					continue;
+
+				for (j = 0; j < info->ksi_nthreads_max; j++) {
+
+					sched = &info->ksi_scheds[j];
+					LASSERT(list_empty(&sched->\
+							       kss_tx_conns));
+					LASSERT(list_empty(&sched->\
+							       kss_rx_conns));
+					LASSERT(list_empty(&sched-> \
+						  kss_zombie_noop_txs));
+					LASSERT(sched->kss_nconns == 0);
+				}
+			}
+		}
+
+		/* flag threads to terminate; wake and wait for them to die */
+		ksocknal_data.ksnd_shuttingdown = 1;
+		wake_up_all(&ksocknal_data.ksnd_connd_waitq);
+		wake_up_all(&ksocknal_data.ksnd_reaper_waitq);
+
+		if (ksocknal_data.ksnd_sched_info != NULL) {
+			cfs_percpt_for_each(info, i,
+					    ksocknal_data.ksnd_sched_info) {
+				if (info->ksi_scheds == NULL)
+					continue;
+
+				for (j = 0; j < info->ksi_nthreads_max; j++) {
+					sched = &info->ksi_scheds[j];
+					wake_up_all(&sched->kss_waitq);
+				}
+			}
+		}
+
+		i = 4;
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		while (ksocknal_data.ksnd_nthreads != 0) {
+			i++;
+			CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+			       "waiting for %d threads to terminate\n",
+				ksocknal_data.ksnd_nthreads);
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			cfs_pause(cfs_time_seconds(1));
+			read_lock(&ksocknal_data.ksnd_global_lock);
+		}
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		ksocknal_free_buffers();
+
+		ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
+		break;
+	}
+
+	CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+	       atomic_read (&libcfs_kmemory));
+
+	module_put(THIS_MODULE);
+}
+
+__u64
+ksocknal_new_incarnation (void)
+{
+	struct timeval tv;
+
+	/* The incarnation number is the time this module loaded and it
+	 * identifies this particular instance of the socknal.  Hopefully
+	 * we won't be able to reboot more frequently than 1MHz for the
+	 * forseeable future :) */
+
+	do_gettimeofday(&tv);
+
+	return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+}
+
+int
+ksocknal_base_startup(void)
+{
+	struct ksock_sched_info	*info;
+	int			rc;
+	int			i;
+
+	LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+	LASSERT (ksocknal_data.ksnd_nnets == 0);
+
+	memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+	ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
+	LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
+		      sizeof (struct list_head) *
+		      ksocknal_data.ksnd_peer_hash_size);
+	if (ksocknal_data.ksnd_peers == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
+		INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
+
+	rwlock_init(&ksocknal_data.ksnd_global_lock);
+	INIT_LIST_HEAD(&ksocknal_data.ksnd_nets);
+
+	spin_lock_init(&ksocknal_data.ksnd_reaper_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
+	init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_connd_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+	init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_lock_init(&ksocknal_data.ksnd_tx_lock);
+	INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
+
+	/* NB memset above zeros whole of ksocknal_data */
+
+	/* flag lists/ptrs/locks initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+	try_module_get(THIS_MODULE);
+
+	ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(),
+							 sizeof(*info));
+	if (ksocknal_data.ksnd_sched_info == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) {
+		ksock_sched_t	*sched;
+		int		nthrs;
+
+		nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds);
+		} else {
+			/* max to half of CPUs, assume another half should be
+			 * reserved for upper layer modules */
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+		}
+
+		info->ksi_nthreads_max = nthrs;
+		info->ksi_cpt = i;
+
+		LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i,
+				 info->ksi_nthreads_max * sizeof(*sched));
+		if (info->ksi_scheds == NULL)
+			goto failed;
+
+		for (; nthrs > 0; nthrs--) {
+			sched = &info->ksi_scheds[nthrs - 1];
+
+			sched->kss_info = info;
+			spin_lock_init(&sched->kss_lock);
+			INIT_LIST_HEAD(&sched->kss_rx_conns);
+			INIT_LIST_HEAD(&sched->kss_tx_conns);
+			INIT_LIST_HEAD(&sched->kss_zombie_noop_txs);
+			init_waitqueue_head(&sched->kss_waitq);
+		}
+	}
+
+	ksocknal_data.ksnd_connd_starting	 = 0;
+	ksocknal_data.ksnd_connd_failed_stamp     = 0;
+	ksocknal_data.ksnd_connd_starting_stamp   = cfs_time_current_sec();
+	/* must have at least 2 connds to remain responsive to accepts while
+	 * connecting */
+	if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1)
+		*ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1;
+
+	if (*ksocknal_tunables.ksnd_nconnds_max <
+	    *ksocknal_tunables.ksnd_nconnds) {
+		ksocknal_tunables.ksnd_nconnds_max =
+			ksocknal_tunables.ksnd_nconnds;
+	}
+
+	for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+		char name[16];
+		spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+		ksocknal_data.ksnd_connd_starting++;
+		spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+
+		snprintf(name, sizeof(name), "socknal_cd%02d", i);
+		rc = ksocknal_thread_start(ksocknal_connd,
+					   (void *)((ulong_ptr_t)i), name);
+		if (rc != 0) {
+			spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+			ksocknal_data.ksnd_connd_starting--;
+			spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+			CERROR("Can't spawn socknal connd: %d\n", rc);
+			goto failed;
+		}
+	}
+
+	rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper");
+	if (rc != 0) {
+		CERROR ("Can't spawn socknal reaper: %d\n", rc);
+		goto failed;
+	}
+
+	/* flag everything initialised */
+	ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+	return 0;
+
+ failed:
+	ksocknal_base_shutdown();
+	return -ENETDOWN;
+}
+
+void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+	ksock_peer_t	*peer = NULL;
+	struct list_head	*tmp;
+	int		i;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+		list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+			peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+			if (peer->ksnp_ni == ni) break;
+
+			peer = NULL;
+		}
+	}
+
+	if (peer != NULL) {
+		ksock_route_t *route;
+		ksock_conn_t  *conn;
+
+		CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+		       "closing %d, accepting %d, err %d, zcookie "LPU64", "
+		       "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+		       atomic_read(&peer->ksnp_refcount),
+		       peer->ksnp_sharecount, peer->ksnp_closing,
+		       peer->ksnp_accepting, peer->ksnp_error,
+		       peer->ksnp_zc_next_cookie,
+		       !list_empty(&peer->ksnp_tx_queue),
+		       !list_empty(&peer->ksnp_zc_req_list));
+
+		list_for_each (tmp, &peer->ksnp_routes) {
+			route = list_entry(tmp, ksock_route_t, ksnr_list);
+			CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+			       "del %d\n", atomic_read(&route->ksnr_refcount),
+			       route->ksnr_scheduled, route->ksnr_connecting,
+			       route->ksnr_connected, route->ksnr_deleted);
+		}
+
+		list_for_each (tmp, &peer->ksnp_conns) {
+			conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+			CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+			       atomic_read(&conn->ksnc_conn_refcount),
+			       atomic_read(&conn->ksnc_sock_refcount),
+			       conn->ksnc_type, conn->ksnc_closing);
+		}
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+	return;
+}
+
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+	ksock_net_t      *net = ni->ni_data;
+	int	       i;
+	lnet_process_id_t anyid = {0};
+
+	anyid.nid =  LNET_NID_ANY;
+	anyid.pid =  LNET_PID_ANY;
+
+	LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+	LASSERT(ksocknal_data.ksnd_nnets > 0);
+
+	spin_lock_bh(&net->ksnn_lock);
+	net->ksnn_shutdown = 1;		 /* prevent new peers */
+	spin_unlock_bh(&net->ksnn_lock);
+
+	/* Delete all peers */
+	ksocknal_del_peer(ni, anyid, 0);
+
+	/* Wait for all peer state to clean up */
+	i = 2;
+	spin_lock_bh(&net->ksnn_lock);
+	while (net->ksnn_npeers != 0) {
+		spin_unlock_bh(&net->ksnn_lock);
+
+		i++;
+		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+		       "waiting for %d peers to disconnect\n",
+		       net->ksnn_npeers);
+		cfs_pause(cfs_time_seconds(1));
+
+		ksocknal_debug_peerhash(ni);
+
+		spin_lock_bh(&net->ksnn_lock);
+	}
+	spin_unlock_bh(&net->ksnn_lock);
+
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+		LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+	}
+
+	list_del(&net->ksnn_list);
+	LIBCFS_FREE(net, sizeof(*net));
+
+	ksocknal_data.ksnd_nnets--;
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+}
+
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+	char      **names;
+	int	 i;
+	int	 j;
+	int	 rc;
+	int	 n;
+
+	n = libcfs_ipif_enumerate(&names);
+	if (n <= 0) {
+		CERROR("Can't enumerate interfaces: %d\n", n);
+		return n;
+	}
+
+	for (i = j = 0; i < n; i++) {
+		int	up;
+		__u32      ip;
+		__u32      mask;
+
+		if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+			continue;
+
+		rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+		if (rc != 0) {
+			CWARN("Can't get interface %s info: %d\n",
+			      names[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Ignoring interface %s (down)\n",
+			      names[i]);
+			continue;
+		}
+
+		if (j == LNET_MAX_INTERFACES) {
+			CWARN("Ignoring interface %s (too many interfaces)\n",
+			      names[i]);
+			continue;
+		}
+
+		net->ksnn_interfaces[j].ksni_ipaddr = ip;
+		net->ksnn_interfaces[j].ksni_netmask = mask;
+		strncpy(&net->ksnn_interfaces[j].ksni_name[0],
+			names[i], IFNAMSIZ);
+		j++;
+	}
+
+	libcfs_ipif_free_enumeration(names, n);
+
+	if (j == 0)
+		CERROR("Can't find any usable interfaces\n");
+
+	return j;
+}
+
+int
+ksocknal_search_new_ipif(ksock_net_t *net)
+{
+	int	new_ipif = 0;
+	int	i;
+
+	for (i = 0; i < net->ksnn_ninterfaces; i++) {
+		char		*ifnam = &net->ksnn_interfaces[i].ksni_name[0];
+		char		*colon = strchr(ifnam, ':');
+		int		found  = 0;
+		ksock_net_t	*tmp;
+		int		j;
+
+		if (colon != NULL) /* ignore alias device */
+			*colon = 0;
+
+		list_for_each_entry(tmp, &ksocknal_data.ksnd_nets,
+					ksnn_list) {
+			for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) {
+				char *ifnam2 = &tmp->ksnn_interfaces[j].\
+					     ksni_name[0];
+				char *colon2 = strchr(ifnam2, ':');
+
+				if (colon2 != NULL)
+					*colon2 = 0;
+
+				found = strcmp(ifnam, ifnam2) == 0;
+				if (colon2 != NULL)
+					*colon2 = ':';
+			}
+			if (found)
+				break;
+		}
+
+		new_ipif += !found;
+		if (colon != NULL)
+			*colon = ':';
+	}
+
+	return new_ipif;
+}
+
+int
+ksocknal_start_schedulers(struct ksock_sched_info *info)
+{
+	int	nthrs;
+	int	rc = 0;
+	int	i;
+
+	if (info->ksi_nthreads == 0) {
+		if (*ksocknal_tunables.ksnd_nscheds > 0) {
+			nthrs = info->ksi_nthreads_max;
+		} else {
+			nthrs = cfs_cpt_weight(lnet_cpt_table(),
+					       info->ksi_cpt);
+			nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs);
+			nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs);
+		}
+		nthrs = min(nthrs, info->ksi_nthreads_max);
+	} else {
+		LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max);
+		/* increase two threads if there is new interface */
+		nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads);
+	}
+
+	for (i = 0; i < nthrs; i++) {
+		long		id;
+		char		name[20];
+		ksock_sched_t	*sched;
+		id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i);
+		sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+		snprintf(name, sizeof(name), "socknal_sd%02d_%02d",
+			 info->ksi_cpt, (int)(sched - &info->ksi_scheds[0]));
+
+		rc = ksocknal_thread_start(ksocknal_scheduler,
+					   (void *)id, name);
+		if (rc == 0)
+			continue;
+
+		CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
+		       info->ksi_cpt, info->ksi_nthreads + i, rc);
+		break;
+	}
+
+	info->ksi_nthreads += i;
+	return rc;
+}
+
+int
+ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts)
+{
+	int	newif = ksocknal_search_new_ipif(net);
+	int	rc;
+	int	i;
+
+	LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table()));
+
+	for (i = 0; i < ncpts; i++) {
+		struct ksock_sched_info	*info;
+		int cpt = (cpts == NULL) ? i : cpts[i];
+
+		LASSERT(cpt < cfs_cpt_number(lnet_cpt_table()));
+		info = ksocknal_data.ksnd_sched_info[cpt];
+
+		if (!newif && info->ksi_nthreads > 0)
+			continue;
+
+		rc = ksocknal_start_schedulers(info);
+		if (rc != 0)
+			return rc;
+	}
+	return 0;
+}
+
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+	ksock_net_t  *net;
+	int	   rc;
+	int	   i;
+
+	LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+	if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+		rc = ksocknal_base_startup();
+		if (rc != 0)
+			return rc;
+	}
+
+	LIBCFS_ALLOC(net, sizeof(*net));
+	if (net == NULL)
+		goto fail_0;
+
+	spin_lock_init(&net->ksnn_lock);
+	net->ksnn_incarnation = ksocknal_new_incarnation();
+	ni->ni_data = net;
+	ni->ni_peertimeout    = *ksocknal_tunables.ksnd_peertimeout;
+	ni->ni_maxtxcredits   = *ksocknal_tunables.ksnd_credits;
+	ni->ni_peertxcredits  = *ksocknal_tunables.ksnd_peertxcredits;
+	ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits;
+
+	if (ni->ni_interfaces[0] == NULL) {
+		rc = ksocknal_enumerate_interfaces(net);
+		if (rc <= 0)
+			goto fail_1;
+
+		net->ksnn_ninterfaces = 1;
+	} else {
+		for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+			int    up;
+
+			if (ni->ni_interfaces[i] == NULL)
+				break;
+
+			rc = libcfs_ipif_query(
+				ni->ni_interfaces[i], &up,
+				&net->ksnn_interfaces[i].ksni_ipaddr,
+				&net->ksnn_interfaces[i].ksni_netmask);
+
+			if (rc != 0) {
+				CERROR("Can't get interface %s info: %d\n",
+				       ni->ni_interfaces[i], rc);
+				goto fail_1;
+			}
+
+			if (!up) {
+				CERROR("Interface %s is down\n",
+				       ni->ni_interfaces[i]);
+				goto fail_1;
+			}
+
+			strncpy(&net->ksnn_interfaces[i].ksni_name[0],
+				ni->ni_interfaces[i], IFNAMSIZ);
+		}
+		net->ksnn_ninterfaces = i;
+	}
+
+	/* call it before add it to ksocknal_data.ksnd_nets */
+	rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts);
+	if (rc != 0)
+		goto fail_1;
+
+	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+				net->ksnn_interfaces[0].ksni_ipaddr);
+	list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets);
+
+	ksocknal_data.ksnd_nnets++;
+
+	return 0;
+
+ fail_1:
+	LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+	if (ksocknal_data.ksnd_nnets == 0)
+		ksocknal_base_shutdown();
+
+	return -ENETDOWN;
+}
+
+
+void __exit
+ksocknal_module_fini (void)
+{
+	lnet_unregister_lnd(&the_ksocklnd);
+	ksocknal_tunables_fini();
+}
+
+int __init
+ksocknal_module_init (void)
+{
+	int    rc;
+
+	/* check ksnr_connected/connecting field large enough */
+	CLASSERT (SOCKLND_CONN_NTYPES <= 4);
+	CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN);
+
+	/* initialize the_ksocklnd */
+	the_ksocklnd.lnd_type     = SOCKLND;
+	the_ksocklnd.lnd_startup  = ksocknal_startup;
+	the_ksocklnd.lnd_shutdown = ksocknal_shutdown;
+	the_ksocklnd.lnd_ctl      = ksocknal_ctl;
+	the_ksocklnd.lnd_send     = ksocknal_send;
+	the_ksocklnd.lnd_recv     = ksocknal_recv;
+	the_ksocklnd.lnd_notify   = ksocknal_notify;
+	the_ksocklnd.lnd_query    = ksocknal_query;
+	the_ksocklnd.lnd_accept   = ksocknal_accept;
+
+	rc = ksocknal_tunables_init();
+	if (rc != 0)
+		return rc;
+
+	lnet_register_lnd(&the_ksocklnd);
+
+	return 0;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0");
+MODULE_LICENSE("GPL");
+
+cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
new file mode 100644
index 000000000000..b483e0c3a69a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "socklnd_lib-linux.h"
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/socklnd.h>
+#include <linux/lnet/lnet-sysctl.h>
+
+#define SOCKNAL_PEER_HASH_SIZE  101	     /* # peer lists */
+#define SOCKNAL_RESCHED	 100	     /* # scheduler loops before reschedule */
+#define SOCKNAL_INSANITY_RECONN 5000	    /* connd is trying on reconn infinitely */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK	/* jiffies between retries */
+
+#define SOCKNAL_SINGLE_FRAG_TX      0	   /* disable multi-fragment sends */
+#define SOCKNAL_SINGLE_FRAG_RX      0	   /* disable multi-fragment receives */
+
+#define SOCKNAL_VERSION_DEBUG       0	   /* enable protocol version debugging */
+
+/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled).
+ * no risk if we're not running on a CONFIG_HIGHMEM platform. */
+#ifdef CONFIG_HIGHMEM
+# define SOCKNAL_RISK_KMAP_DEADLOCK  0
+#else
+# define SOCKNAL_RISK_KMAP_DEADLOCK  1
+#endif
+
+struct ksock_sched_info;
+
+typedef struct				  /* per scheduler state */
+{
+	spinlock_t		kss_lock;	/* serialise */
+	struct list_head		kss_rx_conns;	/* conn waiting to be read */
+	/* conn waiting to be written */
+	struct list_head		kss_tx_conns;
+	/* zombie noop tx list */
+	struct list_head		kss_zombie_noop_txs;
+	wait_queue_head_t		kss_waitq;	/* where scheduler sleeps */
+	/* # connections assigned to this scheduler */
+	int			kss_nconns;
+	struct ksock_sched_info	*kss_info;	/* owner of it */
+	struct page		*kss_rx_scratch_pgs[LNET_MAX_IOV];
+	struct iovec		kss_scratch_iov[LNET_MAX_IOV];
+} ksock_sched_t;
+
+struct ksock_sched_info {
+	int			ksi_nthreads_max; /* max allowed threads */
+	int			ksi_nthreads;	/* number of threads */
+	int			ksi_cpt;	/* CPT id */
+	ksock_sched_t		*ksi_scheds;	/* array of schedulers */
+};
+
+#define KSOCK_CPT_SHIFT			16
+#define KSOCK_THREAD_ID(cpt, sid)	(((cpt) << KSOCK_CPT_SHIFT) | (sid))
+#define KSOCK_THREAD_CPT(id)		((id) >> KSOCK_CPT_SHIFT)
+#define KSOCK_THREAD_SID(id)		((id) & ((1UL << KSOCK_CPT_SHIFT) - 1))
+
+typedef struct				  /* in-use interface */
+{
+	__u32		ksni_ipaddr;		/* interface's IP address */
+	__u32		ksni_netmask;		/* interface's network mask */
+	int		ksni_nroutes;		/* # routes using (active) */
+	int		ksni_npeers;		/* # peers using (passive) */
+	char		ksni_name[IFNAMSIZ];	/* interface name */
+} ksock_interface_t;
+
+typedef struct
+{
+	/* "stuck" socket timeout (seconds) */
+	int	      *ksnd_timeout;
+	/* # scheduler threads in each pool while starting */
+	int		 *ksnd_nscheds;
+	int	      *ksnd_nconnds;	 /* # connection daemons */
+	int	      *ksnd_nconnds_max;     /* max # connection daemons */
+	int	      *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+	int	      *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+	int	      *ksnd_eager_ack;       /* make TCP ack eagerly? */
+	int	      *ksnd_typed_conns;     /* drive sockets by type? */
+	int	      *ksnd_min_bulk;	/* smallest "large" message */
+	int	      *ksnd_tx_buffer_size;  /* socket tx buffer size */
+	int	      *ksnd_rx_buffer_size;  /* socket rx buffer size */
+	int	      *ksnd_nagle;	   /* enable NAGLE? */
+	int	      *ksnd_round_robin;     /* round robin for multiple interfaces */
+	int	      *ksnd_keepalive;       /* # secs for sending keepalive NOOP */
+	int	      *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+	int	      *ksnd_keepalive_count; /* # probes */
+	int	      *ksnd_keepalive_intvl; /* time between probes */
+	int	      *ksnd_credits;	 /* # concurrent sends */
+	int	      *ksnd_peertxcredits;   /* # concurrent sends to 1 peer */
+	int	      *ksnd_peerrtrcredits;  /* # per-peer router buffer credits */
+	int	      *ksnd_peertimeout;     /* seconds to consider peer dead */
+	int	      *ksnd_enable_csum;     /* enable check sum */
+	int	      *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+	int	      *ksnd_nonblk_zcack;    /* always send zc-ack on non-blocking connection */
+	unsigned int     *ksnd_zc_min_payload;  /* minimum zero copy payload size */
+	int	      *ksnd_zc_recv;	 /* enable ZC receive (for Chelsio TOE) */
+	int	      *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
+} ksock_tunables_t;
+
+typedef struct
+{
+	__u64		  ksnn_incarnation;	/* my epoch */
+	spinlock_t	  ksnn_lock;		/* serialise */
+	struct list_head	  ksnn_list;		/* chain on global list */
+	int		  ksnn_npeers;		/* # peers */
+	int		  ksnn_shutdown;	/* shutting down? */
+	int		  ksnn_ninterfaces;	/* IP interfaces */
+	ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+/** connd timeout */
+#define SOCKNAL_CONND_TIMEOUT  120
+/** reserved thread for accepting & creating new connd */
+#define SOCKNAL_CONND_RESV     1
+
+typedef struct
+{
+	int			ksnd_init;	/* initialisation state */
+	int			ksnd_nnets;	/* # networks set up */
+	struct list_head		ksnd_nets;	/* list of nets */
+	/* stabilize peer/conn ops */
+	rwlock_t		ksnd_global_lock;
+	/* hash table of all my known peers */
+	struct list_head		*ksnd_peers;
+	int			ksnd_peer_hash_size; /* size of ksnd_peers */
+
+	int			ksnd_nthreads;	/* # live threads */
+	int			ksnd_shuttingdown; /* tell threads to exit */
+	/* schedulers information */
+	struct ksock_sched_info	**ksnd_sched_info;
+
+	atomic_t      ksnd_nactive_txs;    /* #active txs */
+
+	struct list_head	ksnd_deathrow_conns; /* conns to close: reaper_lock*/
+	struct list_head	ksnd_zombie_conns;   /* conns to free: reaper_lock */
+	struct list_head	ksnd_enomem_conns;   /* conns to retry: reaper_lock*/
+	wait_queue_head_t       ksnd_reaper_waitq;   /* reaper sleeps here */
+	cfs_time_t	ksnd_reaper_waketime;/* when reaper will wake */
+	spinlock_t	  ksnd_reaper_lock;	/* serialise */
+
+	int	       ksnd_enomem_tx;      /* test ENOMEM sender */
+	int	       ksnd_stall_tx;       /* test sluggish sender */
+	int	       ksnd_stall_rx;       /* test sluggish receiver */
+
+	struct list_head	ksnd_connd_connreqs; /* incoming connection requests */
+	struct list_head	ksnd_connd_routes;   /* routes waiting to be connected */
+	wait_queue_head_t       ksnd_connd_waitq;    /* connds sleep here */
+	int	       ksnd_connd_connecting;/* # connds connecting */
+	/** time stamp of the last failed connecting attempt */
+	long	      ksnd_connd_failed_stamp;
+	/** # starting connd */
+	unsigned	  ksnd_connd_starting;
+	/** time stamp of the last starting connd */
+	long	      ksnd_connd_starting_stamp;
+	/** # running connd */
+	unsigned	  ksnd_connd_running;
+	spinlock_t	  ksnd_connd_lock;	/* serialise */
+
+	struct list_head	  ksnd_idle_noop_txs;	/* list head for freed noop tx */
+	spinlock_t	  ksnd_tx_lock;		/* serialise, g_lock unsafe */
+
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_ALL	2
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments (the first frag contains the portals header),
+ * followed by 0 or more lnet_kiov_t fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, the payload is
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
+ * what the header matched or whether the message needs forwarding. */
+
+struct ksock_conn;			      /* forward ref */
+struct ksock_peer;			      /* forward ref */
+struct ksock_route;			     /* forward ref */
+struct ksock_proto;			     /* forward ref */
+
+typedef struct				  /* transmit packet */
+{
+	struct list_head     tx_list;	/* queue on conn for transmission etc */
+	struct list_head     tx_zc_list;     /* queue on peer for ZC request */
+	atomic_t   tx_refcount;    /* tx reference count */
+	int	    tx_nob;	 /* # packet bytes */
+	int	    tx_resid;       /* residual bytes */
+	int	    tx_niov;	/* # packet iovec frags */
+	struct iovec  *tx_iov;	 /* packet iovec frags */
+	int	    tx_nkiov;       /* # packet page frags */
+	unsigned short tx_zc_aborted;  /* aborted ZC request */
+	unsigned short tx_zc_capable:1; /* payload is large enough for ZC */
+	unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */
+	unsigned short tx_nonblk:1;    /* it's a non-blocking ACK */
+	lnet_kiov_t   *tx_kiov;	/* packet page frags */
+	struct ksock_conn  *tx_conn;	/* owning conn */
+	lnet_msg_t    *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+	cfs_time_t     tx_deadline;    /* when (in jiffies) tx times out */
+	ksock_msg_t    tx_msg;	 /* socklnd message buffer */
+	int	    tx_desc_size;   /* size of this descriptor */
+	union {
+		struct {
+			struct iovec iov;       /* virt hdr */
+			lnet_kiov_t  kiov[0];   /* paged payload */
+		}		  paged;
+		struct {
+			struct iovec iov[1];    /* virt hdr + payload */
+		}		  virt;
+	}		       tx_frags;
+} ksock_tx_t;
+
+#define KSOCK_NOOP_TX_SIZE  ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0]))
+
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
+typedef union {
+	struct iovec     iov[LNET_MAX_IOV];
+	lnet_kiov_t      kiov[LNET_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_KSM_HEADER   1	       /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2	       /* reading lnet message header */
+#define SOCKNAL_RX_PARSE	3	       /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4	       /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5	       /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP	 6	       /* skipping body */
+
+typedef struct ksock_conn
+{
+	struct ksock_peer  *ksnc_peer;	 /* owning peer */
+	struct ksock_route *ksnc_route;	/* owning route */
+	struct list_head	  ksnc_list;	 /* stash on peer's conn list */
+	socket_t       *ksnc_sock;	 /* actual socket */
+	void	       *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+	void	       *ksnc_saved_write_space; /* socket's original write_space() callback */
+	atomic_t	ksnc_conn_refcount; /* conn refcount */
+	atomic_t	ksnc_sock_refcount; /* sock refcount */
+	ksock_sched_t      *ksnc_scheduler;  /* who schedules this connection */
+	__u32	       ksnc_myipaddr;   /* my IP */
+	__u32	       ksnc_ipaddr;     /* peer's IP */
+	int		 ksnc_port;       /* peer's port */
+	signed int	  ksnc_type:3;     /* type of connection,
+					      * should be signed value */
+	unsigned int	    ksnc_closing:1;  /* being shut down */
+	unsigned int	    ksnc_flip:1;     /* flip or not, only for V2.x */
+	unsigned int	    ksnc_zc_capable:1; /* enable to ZC */
+	struct ksock_proto *ksnc_proto;      /* protocol for the connection */
+
+	/* reader */
+	struct list_head  ksnc_rx_list;     /* where I enq waiting input or a forwarding descriptor */
+	cfs_time_t	    ksnc_rx_deadline; /* when (in jiffies) receive times out */
+	__u8		  ksnc_rx_started;  /* started receiving a message */
+	__u8		  ksnc_rx_ready;    /* data ready to read */
+	__u8		  ksnc_rx_scheduled;/* being progressed */
+	__u8		  ksnc_rx_state;    /* what is being read */
+	int		   ksnc_rx_nob_left; /* # bytes to next hdr/body */
+	int		   ksnc_rx_nob_wanted; /* bytes actually wanted */
+	int		   ksnc_rx_niov;     /* # iovec frags */
+	struct iovec	 *ksnc_rx_iov;      /* the iovec frags */
+	int		   ksnc_rx_nkiov;    /* # page frags */
+	lnet_kiov_t	  *ksnc_rx_kiov;     /* the page frags */
+	ksock_rxiovspace_t    ksnc_rx_iov_space;/* space for frag descriptors */
+	__u32		 ksnc_rx_csum;     /* partial checksum for incoming data */
+	void		 *ksnc_cookie;      /* rx lnet_finalize passthru arg */
+	ksock_msg_t	   ksnc_msg;	 /* incoming message buffer:
+						 * V2.x message takes the
+						 * whole struct
+						 * V1.x message is a bare
+						 * lnet_hdr_t, it's stored in
+						 * ksnc_msg.ksm_u.lnetmsg */
+
+	/* WRITER */
+	struct list_head	    ksnc_tx_list;     /* where I enq waiting for output space */
+	struct list_head	    ksnc_tx_queue;    /* packets waiting to be sent */
+	ksock_tx_t	   *ksnc_tx_carrier;  /* next TX that can carry a LNet message or ZC-ACK */
+	cfs_time_t	    ksnc_tx_deadline; /* when (in jiffies) tx times out */
+	int		   ksnc_tx_bufnob;     /* send buffer marker */
+	atomic_t	  ksnc_tx_nob;	/* # bytes queued */
+	int		   ksnc_tx_ready;      /* write space */
+	int		   ksnc_tx_scheduled;  /* being progressed */
+	cfs_time_t	    ksnc_tx_last_post;  /* time stamp of the last posted TX */
+} ksock_conn_t;
+
+typedef struct ksock_route
+{
+	struct list_head	    ksnr_list;	/* chain on peer route list */
+	struct list_head	    ksnr_connd_list;  /* chain on ksnr_connd_routes */
+	struct ksock_peer    *ksnr_peer;	/* owning peer */
+	atomic_t	  ksnr_refcount;    /* # users */
+	cfs_time_t	    ksnr_timeout;     /* when (in jiffies) reconnection can happen next */
+	cfs_duration_t	ksnr_retry_interval; /* how long between retries */
+	__u32		 ksnr_myipaddr;    /* my IP */
+	__u32		 ksnr_ipaddr;      /* IP address to connect to */
+	int		   ksnr_port;	/* port to connect to */
+	unsigned int	  ksnr_scheduled:1; /* scheduled for attention */
+	unsigned int	  ksnr_connecting:1;/* connection establishment in progress */
+	unsigned int	  ksnr_connected:4; /* connections established by type */
+	unsigned int	  ksnr_deleted:1;   /* been removed from peer? */
+	unsigned int	  ksnr_share_count; /* created explicitly? */
+	int		   ksnr_conn_count;  /* # conns established by this route */
+} ksock_route_t;
+
+#define SOCKNAL_KEEPALIVE_PING	  1       /* cookie for keepalive ping */
+
+typedef struct ksock_peer
+{
+	struct list_head	    ksnp_list;	/* stash on global peer list */
+	cfs_time_t	    ksnp_last_alive;  /* when (in jiffies) I was last alive */
+	lnet_process_id_t     ksnp_id;       /* who's on the other end(s) */
+	atomic_t	  ksnp_refcount; /* # users */
+	int		   ksnp_sharecount;  /* lconf usage counter */
+	int		   ksnp_closing;  /* being closed */
+	int		   ksnp_accepting;/* # passive connections pending */
+	int		   ksnp_error;    /* errno on closing last conn */
+	__u64		 ksnp_zc_next_cookie;/* ZC completion cookie */
+	__u64		 ksnp_incarnation;   /* latest known peer incarnation */
+	struct ksock_proto   *ksnp_proto;    /* latest known peer protocol */
+	struct list_head	    ksnp_conns;    /* all active connections */
+	struct list_head	    ksnp_routes;   /* routes */
+	struct list_head	    ksnp_tx_queue; /* waiting packets */
+	spinlock_t	      ksnp_lock;	/* serialize, g_lock unsafe */
+	struct list_head	    ksnp_zc_req_list;   /* zero copy requests wait for ACK  */
+	cfs_time_t	    ksnp_send_keepalive; /* time to send keepalive */
+	lnet_ni_t	    *ksnp_ni;       /* which network */
+	int		   ksnp_n_passive_ips; /* # of... */
+	__u32		 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
+} ksock_peer_t;
+
+typedef struct ksock_connreq
+{
+	struct list_head	    ksncr_list;     /* stash on ksnd_connd_connreqs */
+	lnet_ni_t	    *ksncr_ni;       /* chosen NI */
+	socket_t	 *ksncr_sock;     /* accepted socket */
+} ksock_connreq_t;
+
+extern ksock_nal_data_t ksocknal_data;
+extern ksock_tunables_t ksocknal_tunables;
+
+#define SOCKNAL_MATCH_NO	0	/* TX can't match type of connection */
+#define SOCKNAL_MATCH_YES       1	/* TX matches type of connection */
+#define SOCKNAL_MATCH_MAY       2	/* TX can be sent on the connection, but not preferred */
+
+typedef struct ksock_proto
+{
+	int	   pro_version;					      /* version number of protocol */
+	int	 (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake function */
+	int	 (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+	void	(*pro_pack)(ksock_tx_t *);				  /* message pack */
+	void	(*pro_unpack)(ksock_msg_t *);			       /* message unpack */
+	ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *);	  /* queue tx on the connection */
+	int	 (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */
+	int	 (*pro_handle_zcreq)(ksock_conn_t *, __u64, int);	    /* handle ZC request */
+	int	 (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64);	  /* handle ZC ACK */
+	int	 (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int);	 /* msg type matches the connection type:
+										 * return value:
+										 *   return MATCH_NO  : no
+										 *   return MATCH_YES : matching type
+										 *   return MATCH_MAY : can be backup */
+} ksock_proto_t;
+
+extern ksock_proto_t ksocknal_protocol_v1x;
+extern ksock_proto_t ksocknal_protocol_v2x;
+extern ksock_proto_t ksocknal_protocol_v3x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1	  KSOCK_PROTO_V1_MAJOR
+
+#ifndef CPU_MASK_NONE
+#define CPU_MASK_NONE   0UL
+#endif
+
+static inline int
+ksocknal_route_mask(void)
+{
+	if (!*ksocknal_tunables.ksnd_typed_conns)
+		return (1 << SOCKLND_CONN_ANY);
+
+	return ((1 << SOCKLND_CONN_CONTROL) |
+		(1 << SOCKLND_CONN_BULK_IN) |
+		(1 << SOCKLND_CONN_BULK_OUT));
+}
+
+static inline struct list_head *
+ksocknal_nid2peerlist (lnet_nid_t nid)
+{
+	unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
+
+	return (&ksocknal_data.ksnd_peers [hash]);
+}
+
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+	atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+extern void ksocknal_finalize_zcreq(ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+	if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+		ksocknal_queue_zombie_conn(conn);
+}
+
+static inline int
+ksocknal_connsock_addref (ksock_conn_t *conn)
+{
+	int   rc = -ESHUTDOWN;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+	if (!conn->ksnc_closing) {
+		LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0);
+		atomic_inc(&conn->ksnc_sock_refcount);
+		rc = 0;
+	}
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	return (rc);
+}
+
+static inline void
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+	LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+	if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+		LASSERT (conn->ksnc_closing);
+		libcfs_sock_release(conn->ksnc_sock);
+		conn->ksnc_sock = NULL;
+		ksocknal_finalize_zcreq(conn);
+	}
+}
+
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+	LASSERT (atomic_read(&tx->tx_refcount) > 0);
+	atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
+{
+	LASSERT (atomic_read(&tx->tx_refcount) > 0);
+	if (atomic_dec_and_test(&tx->tx_refcount))
+		ksocknal_tx_done(NULL, tx);
+}
+
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+	LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+	atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+	LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+	if (atomic_dec_and_test(&route->ksnr_refcount))
+		ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+	LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+	atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+	LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+	if (atomic_dec_and_test(&peer->ksnp_refcount))
+		ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+		  int delayed, unsigned int niov,
+		  struct iovec *iov, lnet_kiov_t *kiov,
+		  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+				 socket_t *sock, int type);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern void ksocknal_terminate_conn (ksock_conn_t *conn);
+extern void ksocknal_destroy_conn (ksock_conn_t *conn);
+extern int  ksocknal_close_peer_conns_locked (ksock_peer_t *peer,
+					      __u32 ipaddr, int why);
+extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
+extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer,
+					       ksock_tx_t *tx, int nonblk);
+
+extern int  ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx,
+				   lnet_process_id_t id);
+extern ksock_tx_t *ksocknal_alloc_tx(int type, int size);
+extern void ksocknal_free_tx (ksock_tx_t *tx);
+extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk);
+extern void ksocknal_next_tx_carrier(ksock_conn_t *conn);
+extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist,
+				  int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
+extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
+extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name);
+extern void ksocknal_thread_fini (void);
+extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_connd (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+				ksock_hello_msg_t *hello, lnet_process_id_t *id,
+				__u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(ksock_conn_t *conn);
+extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
+extern int ksocknal_lib_setup_sock (socket_t *so);
+extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
+extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
+extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
+extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem,
+					   int *rxmem, int *nagle);
+
+extern int ksocknal_tunables_init(void);
+extern void ksocknal_tunables_fini(void);
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
+
+extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn);
+extern int ksocknal_lib_bind_thread_to_cpu(int id);
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644
index 000000000000..ad5e24104238
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -0,0 +1,2664 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+ksock_tx_t *
+ksocknal_alloc_tx(int type, int size)
+{
+	ksock_tx_t *tx = NULL;
+
+	if (type == KSOCK_MSG_NOOP) {
+		LASSERT(size == KSOCK_NOOP_TX_SIZE);
+
+		/* searching for a noop tx in free list */
+		spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+		if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+			tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \
+					    next, ksock_tx_t, tx_list);
+			LASSERT(tx->tx_desc_size == size);
+			list_del(&tx->tx_list);
+		}
+
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	}
+
+	if (tx == NULL)
+		LIBCFS_ALLOC(tx, size);
+
+	if (tx == NULL)
+		return NULL;
+
+	atomic_set(&tx->tx_refcount, 1);
+	tx->tx_zc_aborted = 0;
+	tx->tx_zc_capable = 0;
+	tx->tx_zc_checked = 0;
+	tx->tx_desc_size  = size;
+
+	atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+	return tx;
+}
+
+ksock_tx_t *
+ksocknal_alloc_tx_noop(__u64 cookie, int nonblk)
+{
+	ksock_tx_t *tx;
+
+	tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE);
+	if (tx == NULL) {
+		CERROR("Can't allocate noop tx desc\n");
+		return NULL;
+	}
+
+	tx->tx_conn     = NULL;
+	tx->tx_lnetmsg  = NULL;
+	tx->tx_kiov     = NULL;
+	tx->tx_nkiov    = 0;
+	tx->tx_iov      = tx->tx_frags.virt.iov;
+	tx->tx_niov     = 1;
+	tx->tx_nonblk   = nonblk;
+
+	socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+	tx->tx_msg.ksm_zc_cookies[1] = cookie;
+
+	return tx;
+}
+
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+	atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+	if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+		/* it's a noop tx */
+		spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+		list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+		spin_unlock(&ksocknal_data.ksnd_tx_lock);
+	} else {
+		LIBCFS_FREE(tx, tx->tx_desc_size);
+	}
+}
+
+int
+ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct iovec  *iov = tx->tx_iov;
+	int    nob;
+	int    rc;
+
+	LASSERT (tx->tx_niov > 0);
+
+	/* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */
+	rc = ksocknal_lib_send_iov(conn, tx);
+
+	if (rc <= 0)			    /* sent nothing? */
+		return (rc);
+
+	nob = rc;
+	LASSERT (nob <= tx->tx_resid);
+	tx->tx_resid -= nob;
+
+	/* "consume" iov */
+	do {
+		LASSERT (tx->tx_niov > 0);
+
+		if (nob < (int) iov->iov_len) {
+			iov->iov_base = (void *)((char *)iov->iov_base + nob);
+			iov->iov_len -= nob;
+			return (rc);
+		}
+
+		nob -= iov->iov_len;
+		tx->tx_iov = ++iov;
+		tx->tx_niov--;
+	} while (nob != 0);
+
+	return (rc);
+}
+
+int
+ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	lnet_kiov_t    *kiov = tx->tx_kiov;
+	int     nob;
+	int     rc;
+
+	LASSERT (tx->tx_niov == 0);
+	LASSERT (tx->tx_nkiov > 0);
+
+	/* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */
+	rc = ksocknal_lib_send_kiov(conn, tx);
+
+	if (rc <= 0)			    /* sent nothing? */
+		return (rc);
+
+	nob = rc;
+	LASSERT (nob <= tx->tx_resid);
+	tx->tx_resid -= nob;
+
+	/* "consume" kiov */
+	do {
+		LASSERT(tx->tx_nkiov > 0);
+
+		if (nob < (int)kiov->kiov_len) {
+			kiov->kiov_offset += nob;
+			kiov->kiov_len -= nob;
+			return rc;
+		}
+
+		nob -= (int)kiov->kiov_len;
+		tx->tx_kiov = ++kiov;
+		tx->tx_nkiov--;
+	} while (nob != 0);
+
+	return (rc);
+}
+
+int
+ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int      rc;
+	int      bufnob;
+
+	if (ksocknal_data.ksnd_stall_tx != 0) {
+		cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+	}
+
+	LASSERT (tx->tx_resid != 0);
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		return (-ESHUTDOWN);
+	}
+
+	do {
+		if (ksocknal_data.ksnd_enomem_tx > 0) {
+			/* testing... */
+			ksocknal_data.ksnd_enomem_tx--;
+			rc = -EAGAIN;
+		} else if (tx->tx_niov != 0) {
+			rc = ksocknal_send_iov (conn, tx);
+		} else {
+			rc = ksocknal_send_kiov (conn, tx);
+		}
+
+		bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+		if (rc > 0)		     /* sent something? */
+			conn->ksnc_tx_bufnob += rc; /* account it */
+
+		if (bufnob < conn->ksnc_tx_bufnob) {
+			/* allocated send buffer bytes < computed; infer
+			 * something got ACKed */
+			conn->ksnc_tx_deadline =
+				cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+			conn->ksnc_tx_bufnob = bufnob;
+			mb();
+		}
+
+		if (rc <= 0) { /* Didn't write anything? */
+
+			if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
+				rc = -EAGAIN;
+
+			/* Check if EAGAIN is due to memory pressure */
+			if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn))
+				rc = -ENOMEM;
+
+			break;
+		}
+
+		/* socket's wmem_queued now includes 'rc' bytes */
+		atomic_sub (rc, &conn->ksnc_tx_nob);
+		rc = 0;
+
+	} while (tx->tx_resid != 0);
+
+	ksocknal_connsock_decref(conn);
+	return (rc);
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+	struct iovec *iov = conn->ksnc_rx_iov;
+	int     nob;
+	int     rc;
+
+	LASSERT (conn->ksnc_rx_niov > 0);
+
+	/* Never touch conn->ksnc_rx_iov or change connection
+	 * status inside ksocknal_lib_recv_iov */
+	rc = ksocknal_lib_recv_iov(conn);
+
+	if (rc <= 0)
+		return (rc);
+
+	/* received something... */
+	nob = rc;
+
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();		       /* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= nob;
+	conn->ksnc_rx_nob_left -= nob;
+
+	do {
+		LASSERT (conn->ksnc_rx_niov > 0);
+
+		if (nob < (int)iov->iov_len) {
+			iov->iov_len -= nob;
+			iov->iov_base = (void *)((char *)iov->iov_base + nob);
+			return (-EAGAIN);
+		}
+
+		nob -= iov->iov_len;
+		conn->ksnc_rx_iov = ++iov;
+		conn->ksnc_rx_niov--;
+	} while (nob != 0);
+
+	return (rc);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+	lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+	int     nob;
+	int     rc;
+	LASSERT (conn->ksnc_rx_nkiov > 0);
+
+	/* Never touch conn->ksnc_rx_kiov or change connection
+	 * status inside ksocknal_lib_recv_iov */
+	rc = ksocknal_lib_recv_kiov(conn);
+
+	if (rc <= 0)
+		return (rc);
+
+	/* received something... */
+	nob = rc;
+
+	conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+	conn->ksnc_rx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+	mb();		       /* order with setting rx_started */
+	conn->ksnc_rx_started = 1;
+
+	conn->ksnc_rx_nob_wanted -= nob;
+	conn->ksnc_rx_nob_left -= nob;
+
+	do {
+		LASSERT (conn->ksnc_rx_nkiov > 0);
+
+		if (nob < (int) kiov->kiov_len) {
+			kiov->kiov_offset += nob;
+			kiov->kiov_len -= nob;
+			return -EAGAIN;
+		}
+
+		nob -= kiov->kiov_len;
+		conn->ksnc_rx_kiov = ++kiov;
+		conn->ksnc_rx_nkiov--;
+	} while (nob != 0);
+
+	return 1;
+}
+
+int
+ksocknal_receive (ksock_conn_t *conn)
+{
+	/* Return 1 on success, 0 on EOF, < 0 on error.
+	 * Caller checks ksnc_rx_nob_wanted to determine
+	 * progress/completion. */
+	int     rc;
+	ENTRY;
+
+	if (ksocknal_data.ksnd_stall_rx != 0) {
+		cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+	}
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		return (-ESHUTDOWN);
+	}
+
+	for (;;) {
+		if (conn->ksnc_rx_niov != 0)
+			rc = ksocknal_recv_iov (conn);
+		else
+			rc = ksocknal_recv_kiov (conn);
+
+		if (rc <= 0) {
+			/* error/EOF or partial receive */
+			if (rc == -EAGAIN) {
+				rc = 1;
+			} else if (rc == 0 && conn->ksnc_rx_started) {
+				/* EOF in the middle of a message */
+				rc = -EPROTO;
+			}
+			break;
+		}
+
+		/* Completed a fragment */
+
+		if (conn->ksnc_rx_nob_wanted == 0) {
+			rc = 1;
+			break;
+		}
+	}
+
+	ksocknal_connsock_decref(conn);
+	RETURN (rc);
+}
+
+void
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
+{
+	lnet_msg_t  *lnetmsg = tx->tx_lnetmsg;
+	int	  rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
+	ENTRY;
+
+	LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+	if (tx->tx_conn != NULL)
+		ksocknal_conn_decref(tx->tx_conn);
+
+	if (ni == NULL && tx->tx_conn != NULL)
+		ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+
+	ksocknal_free_tx (tx);
+	if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+		lnet_finalize (ni, lnetmsg, rc);
+
+	EXIT;
+}
+
+void
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
+{
+	ksock_tx_t *tx;
+
+	while (!list_empty (txlist)) {
+		tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+		if (error && tx->tx_lnetmsg != NULL) {
+			CNETERR("Deleting packet type %d len %d %s->%s\n",
+				le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+				le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+				libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+				libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+		} else if (error) {
+			CNETERR("Deleting noop packet\n");
+		}
+
+		list_del (&tx->tx_list);
+
+		LASSERT (atomic_read(&tx->tx_refcount) == 1);
+		ksocknal_tx_done (ni, tx);
+	}
+}
+
+static void
+ksocknal_check_zc_req(ksock_tx_t *tx)
+{
+	ksock_conn_t   *conn = tx->tx_conn;
+	ksock_peer_t   *peer = conn->ksnc_peer;
+
+	/* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx
+	 * to ksnp_zc_req_list if some fragment of this message should be sent
+	 * zero-copy.  Our peer will send an ACK containing this cookie when
+	 * she has received this message to tell us we can signal completion.
+	 * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on
+	 * ksnp_zc_req_list. */
+	LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT (tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 1;
+
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x ||
+	    !conn->ksnc_zc_capable)
+		return;
+
+	/* assign cookie and queue tx to pending list, it will be released when
+	 * a matching ack is received. See ksocknal_handle_zcack() */
+
+	ksocknal_tx_addref(tx);
+
+	spin_lock(&peer->ksnp_lock);
+
+	/* ZC_REQ is going to be pinned to the peer */
+	tx->tx_deadline =
+		cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+	LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+
+	tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+
+	if (peer->ksnp_zc_next_cookie == 0)
+		peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1;
+
+	list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+	spin_unlock(&peer->ksnp_lock);
+}
+
+static void
+ksocknal_uncheck_zc_req(ksock_tx_t *tx)
+{
+	ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_zc_capable);
+
+	tx->tx_zc_checked = 0;
+
+	spin_lock(&peer->ksnp_lock);
+
+	if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+		/* Not waiting for an ACK */
+		spin_unlock(&peer->ksnp_lock);
+		return;
+	}
+
+	tx->tx_msg.ksm_zc_cookies[0] = 0;
+	list_del(&tx->tx_zc_list);
+
+	spin_unlock(&peer->ksnp_lock);
+
+	ksocknal_tx_decref(tx);
+}
+
+int
+ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	int	    rc;
+
+	if (tx->tx_zc_capable && !tx->tx_zc_checked)
+		ksocknal_check_zc_req(tx);
+
+	rc = ksocknal_transmit (conn, tx);
+
+	CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
+
+	if (tx->tx_resid == 0) {
+		/* Sent everything OK */
+		LASSERT (rc == 0);
+
+		return (0);
+	}
+
+	if (rc == -EAGAIN)
+		return (rc);
+
+	if (rc == -ENOMEM) {
+		static int counter;
+
+		counter++;   /* exponential backoff warnings */
+		if ((counter & (-counter)) == counter)
+			CWARN("%u ENOMEM tx %p (%u allocated)\n",
+			      counter, conn, atomic_read(&libcfs_kmemory));
+
+		/* Queue on ksnd_enomem_conns for retry after a timeout */
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* enomem list takes over scheduler's ref... */
+		LASSERT (conn->ksnc_tx_scheduled);
+		list_add_tail(&conn->ksnc_tx_list,
+				  &ksocknal_data.ksnd_enomem_conns);
+		if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(),
+						   SOCKNAL_ENOMEM_RETRY),
+				   ksocknal_data.ksnd_reaper_waketime))
+			wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+		return (rc);
+	}
+
+	/* Actual error */
+	LASSERT (rc < 0);
+
+	if (!conn->ksnc_closing) {
+		switch (rc) {
+		case -ECONNRESET:
+			LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection "
+				      "while we were sending data; it may have "
+				      "rebooted.\n",
+				      HIPQUAD(conn->ksnc_ipaddr));
+			break;
+		default:
+			LCONSOLE_WARN("There was an unexpected network error "
+				      "while writing to %u.%u.%u.%u: %d.\n",
+				      HIPQUAD(conn->ksnc_ipaddr), rc);
+			break;
+		}
+		CDEBUG(D_NET, "[%p] Error %d on write to %s"
+		       " ip %d.%d.%d.%d:%d\n", conn, rc,
+		       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+		       HIPQUAD(conn->ksnc_ipaddr),
+		       conn->ksnc_port);
+	}
+
+	if (tx->tx_zc_checked)
+		ksocknal_uncheck_zc_req(tx);
+
+	/* it's not an error if conn is being closed */
+	ksocknal_close_conn_and_siblings (conn,
+					  (conn->ksnc_closing) ? 0 : rc);
+
+	return (rc);
+}
+
+void
+ksocknal_launch_connection_locked (ksock_route_t *route)
+{
+
+	/* called holding write lock on ksnd_global_lock */
+
+	LASSERT (!route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+	LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+
+	route->ksnr_scheduled = 1;	      /* scheduling conn for connd */
+	ksocknal_route_addref(route);	   /* extra ref for connd */
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	list_add_tail(&route->ksnr_connd_list,
+			  &ksocknal_data.ksnd_connd_routes);
+	wake_up(&ksocknal_data.ksnd_connd_waitq);
+
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+}
+
+void
+ksocknal_launch_all_connections_locked (ksock_peer_t *peer)
+{
+	ksock_route_t *route;
+
+	/* called holding write lock on ksnd_global_lock */
+	for (;;) {
+		/* launch any/all connections that need it */
+		route = ksocknal_find_connectable_route_locked(peer);
+		if (route == NULL)
+			return;
+
+		ksocknal_launch_connection_locked(route);
+	}
+}
+
+ksock_conn_t *
+ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk)
+{
+	struct list_head       *tmp;
+	ksock_conn_t     *conn;
+	ksock_conn_t     *typed = NULL;
+	ksock_conn_t     *fallback = NULL;
+	int	       tnob     = 0;
+	int	       fnob     = 0;
+
+	list_for_each (tmp, &peer->ksnp_conns) {
+		ksock_conn_t *c  = list_entry(tmp, ksock_conn_t, ksnc_list);
+		int	   nob = atomic_read(&c->ksnc_tx_nob) +
+				    cfs_sock_wmem_queued(c->ksnc_sock);
+		int	   rc;
+
+		LASSERT (!c->ksnc_closing);
+		LASSERT (c->ksnc_proto != NULL &&
+			 c->ksnc_proto->pro_match_tx != NULL);
+
+		rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk);
+
+		switch (rc) {
+		default:
+			LBUG();
+		case SOCKNAL_MATCH_NO: /* protocol rejected the tx */
+			continue;
+
+		case SOCKNAL_MATCH_YES: /* typed connection */
+			if (typed == NULL || tnob > nob ||
+			    (tnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+				typed = c;
+				tnob  = nob;
+			}
+			break;
+
+		case SOCKNAL_MATCH_MAY: /* fallback connection */
+			if (fallback == NULL || fnob > nob ||
+			    (fnob == nob && *ksocknal_tunables.ksnd_round_robin &&
+			     cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) {
+				fallback = c;
+				fnob     = nob;
+			}
+			break;
+		}
+	}
+
+	/* prefer the typed selection */
+	conn = (typed != NULL) ? typed : fallback;
+
+	if (conn != NULL)
+		conn->ksnc_tx_last_post = cfs_time_current();
+
+	return conn;
+}
+
+void
+ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	conn->ksnc_proto->pro_pack(tx);
+
+	atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
+	ksocknal_conn_addref(conn); /* +1 ref for tx */
+	tx->tx_conn = conn;
+}
+
+void
+ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
+{
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+	ksock_msg_t   *msg = &tx->tx_msg;
+	ksock_tx_t    *ztx = NULL;
+	int	    bufnob = 0;
+
+	/* called holding global lock (read or irq-write) and caller may
+	 * not have dropped this lock between finding conn and calling me,
+	 * so we don't need the {get,put}connsock dance to deref
+	 * ksnc_sock... */
+	LASSERT(!conn->ksnc_closing);
+
+	CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n",
+		libcfs_id2str(conn->ksnc_peer->ksnp_id),
+		HIPQUAD(conn->ksnc_ipaddr),
+		conn->ksnc_port);
+
+	ksocknal_tx_prep(conn, tx);
+
+	/* Ensure the frags we've been given EXACTLY match the number of
+	 * bytes we want to send.  Many TCP/IP stacks disregard any total
+	 * size parameters passed to them and just look at the frags.
+	 *
+	 * We always expect at least 1 mapped fragment containing the
+	 * complete ksocknal message header. */
+	LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+		 lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+		 (unsigned int)tx->tx_nob);
+	LASSERT (tx->tx_niov >= 1);
+	LASSERT (tx->tx_resid == tx->tx_nob);
+
+	CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+		tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+					       KSOCK_MSG_NOOP,
+		tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+	/*
+	 * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+	 * but they're used inside spinlocks a lot.
+	 */
+	bufnob = cfs_sock_wmem_queued(conn->ksnc_sock);
+	spin_lock_bh(&sched->kss_lock);
+
+	if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+		/* First packet starts the timeout */
+		conn->ksnc_tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+		if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+		conn->ksnc_tx_bufnob = 0;
+		mb(); /* order with adding to tx_queue */
+	}
+
+	if (msg->ksm_type == KSOCK_MSG_NOOP) {
+		/* The packet is noop ZC ACK, try to piggyback the ack_cookie
+		 * on a normal packet so I don't need to send it */
+		LASSERT (msg->ksm_zc_cookies[1] != 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+			ztx = tx; /* ZC ACK piggybacked on ztx release tx later */
+
+	} else {
+		/* It's a normal packet - can it piggback a noop zc-ack that
+		 * has been queued already? */
+		LASSERT (msg->ksm_zc_cookies[1] == 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+		ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+		/* ztx will be released later */
+	}
+
+	if (ztx != NULL) {
+		atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+		list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+	}
+
+	if (conn->ksnc_tx_ready &&      /* able to send */
+	    !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+		/* +1 ref for scheduler */
+		ksocknal_conn_addref(conn);
+		list_add_tail (&conn->ksnc_tx_list,
+				   &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+}
+
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+	cfs_time_t     now = cfs_time_current();
+	struct list_head    *tmp;
+	ksock_route_t *route;
+
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+		LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+		if (route->ksnr_scheduled)      /* connections being established */
+			continue;
+
+		/* all route types connected ? */
+		if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
+			continue;
+
+		if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+		      cfs_time_aftereq(now, route->ksnr_timeout))) {
+			CDEBUG(D_NET,
+			       "Too soon to retry route %u.%u.%u.%u "
+			       "(cnted %d, interval %ld, %ld secs later)\n",
+			       HIPQUAD(route->ksnr_ipaddr),
+			       route->ksnr_connected,
+			       route->ksnr_retry_interval,
+			       cfs_duration_sec(route->ksnr_timeout - now));
+			continue;
+		}
+
+		return (route);
+	}
+
+	return (NULL);
+}
+
+ksock_route_t *
+ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
+{
+	struct list_head	*tmp;
+	ksock_route_t     *route;
+
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+		LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+		if (route->ksnr_scheduled)
+			return (route);
+	}
+
+	return (NULL);
+}
+
+int
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
+{
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	rwlock_t     *g_lock;
+	int	       retry;
+	int	       rc;
+
+	LASSERT (tx->tx_conn == NULL);
+
+	g_lock = &ksocknal_data.ksnd_global_lock;
+
+	for (retry = 0;; retry = 1) {
+		read_lock(g_lock);
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL) {
+			if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+				conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+				if (conn != NULL) {
+					/* I've got no routes that need to be
+					 * connecting and I do have an actual
+					 * connection... */
+					ksocknal_queue_tx_locked (tx, conn);
+					read_unlock(g_lock);
+					return (0);
+				}
+			}
+		}
+
+		/* I'll need a write lock... */
+		read_unlock(g_lock);
+
+		write_lock_bh(g_lock);
+
+		peer = ksocknal_find_peer_locked(ni, id);
+		if (peer != NULL)
+			break;
+
+		write_unlock_bh(g_lock);
+
+		if ((id.pid & LNET_PID_USERFLAG) != 0) {
+			CERROR("Refusing to create a connection to "
+			       "userspace process %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		if (retry) {
+			CERROR("Can't find peer %s\n", libcfs_id2str(id));
+			return -EHOSTUNREACH;
+		}
+
+		rc = ksocknal_add_peer(ni, id,
+				       LNET_NIDADDR(id.nid),
+				       lnet_acceptor_port());
+		if (rc != 0) {
+			CERROR("Can't add peer %s: %d\n",
+			       libcfs_id2str(id), rc);
+			return rc;
+		}
+	}
+
+	ksocknal_launch_all_connections_locked(peer);
+
+	conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk);
+	if (conn != NULL) {
+		/* Connection exists; queue message on it */
+		ksocknal_queue_tx_locked (tx, conn);
+		write_unlock_bh(g_lock);
+		return (0);
+	}
+
+	if (peer->ksnp_accepting > 0 ||
+	    ksocknal_find_connecting_route_locked (peer) != NULL) {
+		/* the message is going to be pinned to the peer */
+		tx->tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+
+		/* Queue the message until a connection is established */
+		list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
+		write_unlock_bh(g_lock);
+		return 0;
+	}
+
+	write_unlock_bh(g_lock);
+
+	/* NB Routes may be ignored if connections to them failed recently */
+	CNETERR("No usable routes to %s\n", libcfs_id2str(id));
+	return (-EHOSTUNREACH);
+}
+
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	int	       mpflag = 0;
+	int	       type = lntmsg->msg_type;
+	lnet_process_id_t target = lntmsg->msg_target;
+	unsigned int      payload_niov = lntmsg->msg_niov;
+	struct iovec     *payload_iov = lntmsg->msg_iov;
+	lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+	unsigned int      payload_offset = lntmsg->msg_offset;
+	unsigned int      payload_nob = lntmsg->msg_len;
+	ksock_tx_t       *tx;
+	int	       desc_size;
+	int	       rc;
+
+	/* NB 'private' is different depending on what we're sending.
+	 * Just ignore it... */
+
+	CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+	       payload_nob, payload_niov, libcfs_id2str(target));
+
+	LASSERT (payload_nob == 0 || payload_niov > 0);
+	LASSERT (payload_niov <= LNET_MAX_IOV);
+	/* payload is either all vaddrs or all pages */
+	LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+	LASSERT (!in_interrupt ());
+
+	if (payload_iov != NULL)
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.virt.iov[1 + payload_niov]);
+	else
+		desc_size = offsetof(ksock_tx_t,
+				     tx_frags.paged.kiov[payload_niov]);
+
+	if (lntmsg->msg_vmflush)
+		mpflag = cfs_memory_pressure_get_and_set();
+	tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size);
+	if (tx == NULL) {
+		CERROR("Can't allocate tx desc type %d size %d\n",
+		       type, desc_size);
+		if (lntmsg->msg_vmflush)
+			cfs_memory_pressure_restore(mpflag);
+		return (-ENOMEM);
+	}
+
+	tx->tx_conn = NULL;		     /* set when assigned a conn */
+	tx->tx_lnetmsg = lntmsg;
+
+	if (payload_iov != NULL) {
+		tx->tx_kiov = NULL;
+		tx->tx_nkiov = 0;
+		tx->tx_iov = tx->tx_frags.virt.iov;
+		tx->tx_niov = 1 +
+			      lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+					       payload_niov, payload_iov,
+					       payload_offset, payload_nob);
+	} else {
+		tx->tx_niov = 1;
+		tx->tx_iov = &tx->tx_frags.paged.iov;
+		tx->tx_kiov = tx->tx_frags.paged.kiov;
+		tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+						 payload_niov, payload_kiov,
+						 payload_offset, payload_nob);
+
+		if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload)
+			tx->tx_zc_capable = 1;
+	}
+
+	socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
+
+	/* The first fragment will be set later in pro_pack */
+	rc = ksocknal_launch_packet(ni, tx, target);
+	if (lntmsg->msg_vmflush)
+		cfs_memory_pressure_restore(mpflag);
+	if (rc == 0)
+		return (0);
+
+	ksocknal_free_tx(tx);
+	return (-EIO);
+}
+
+int
+ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name)
+{
+	task_t *task = kthread_run(fn, arg, name);
+
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	ksocknal_data.ksnd_nthreads++;
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return 0;
+}
+
+void
+ksocknal_thread_fini (void)
+{
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	ksocknal_data.ksnd_nthreads--;
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+	static char ksocknal_slop_buffer[4096];
+
+	int	    nob;
+	unsigned int   niov;
+	int	    skipped;
+
+	LASSERT(conn->ksnc_proto != NULL);
+
+	if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+		/* Remind the socket to ack eagerly... */
+		ksocknal_lib_eager_ack(conn);
+	}
+
+	if (nob_to_skip == 0) {	 /* right at next packet boundary now */
+		conn->ksnc_rx_started = 0;
+		mb();		       /* racing with timeout thread */
+
+		switch (conn->ksnc_proto->pro_version) {
+		case  KSOCK_PROTO_V2:
+		case  KSOCK_PROTO_V3:
+			conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+			conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+			conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+			conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+			conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+			conn->ksnc_rx_iov[0].iov_len  = offsetof(ksock_msg_t, ksm_u);
+			break;
+
+		case KSOCK_PROTO_V1:
+			/* Receiving bare lnet_hdr_t */
+			conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+			conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+			conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
+
+			conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+			conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+			conn->ksnc_rx_iov[0].iov_len  = sizeof (lnet_hdr_t);
+			break;
+
+		default:
+			LBUG ();
+		}
+		conn->ksnc_rx_niov = 1;
+
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_csum = ~0;
+		return (1);
+	}
+
+	/* Set up to skip as much as possible now.  If there's more left
+	 * (ran out of iov entries) we'll get called again */
+
+	conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+	conn->ksnc_rx_nob_left = nob_to_skip;
+	conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+	skipped = 0;
+	niov = 0;
+
+	do {
+		nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+		conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+		conn->ksnc_rx_iov[niov].iov_len  = nob;
+		niov++;
+		skipped += nob;
+		nob_to_skip -=nob;
+
+	} while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+		 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+	conn->ksnc_rx_niov = niov;
+	conn->ksnc_rx_kiov = NULL;
+	conn->ksnc_rx_nkiov = 0;
+	conn->ksnc_rx_nob_wanted = skipped;
+	return (0);
+}
+
+int
+ksocknal_process_receive (ksock_conn_t *conn)
+{
+	lnet_hdr_t	*lhdr;
+	lnet_process_id_t *id;
+	int		rc;
+
+	LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+
+	/* NB: sched lock NOT held */
+	/* SOCKNAL_RX_LNET_HEADER is here for backward compatability */
+	LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
+		 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+	if (conn->ksnc_rx_nob_wanted != 0) {
+		rc = ksocknal_receive(conn);
+
+		if (rc <= 0) {
+			LASSERT (rc != -EAGAIN);
+
+			if (rc == 0)
+				CDEBUG (D_NET, "[%p] EOF from %s"
+					" ip %d.%d.%d.%d:%d\n", conn,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+			else if (!conn->ksnc_closing)
+				CERROR ("[%p] Error %d on read from %s"
+					" ip %d.%d.%d.%d:%d\n",
+					conn, rc,
+					libcfs_id2str(conn->ksnc_peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+
+			/* it's not an error if conn is being closed */
+			ksocknal_close_conn_and_siblings (conn,
+							  (conn->ksnc_closing) ? 0 : rc);
+			return (rc == 0 ? -ESHUTDOWN : rc);
+		}
+
+		if (conn->ksnc_rx_nob_wanted != 0) {
+			/* short read */
+			return (-EAGAIN);
+		}
+	}
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_KSM_HEADER:
+		if (conn->ksnc_flip) {
+			__swab32s(&conn->ksnc_msg.ksm_type);
+			__swab32s(&conn->ksnc_msg.ksm_csum);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]);
+			__swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]);
+		}
+
+		if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) {
+			CERROR("%s: Unknown message type: %x\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_type);
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EPROTO);
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+		    conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			/* NOOP Checksum error */
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings(conn, -EPROTO);
+			return (-EIO);
+		}
+
+		if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) {
+			__u64 cookie = 0;
+
+			LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP)
+				cookie = conn->ksnc_msg.ksm_zc_cookies[0];
+
+			rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie,
+					       conn->ksnc_msg.ksm_zc_cookies[1]);
+
+			if (rc != 0) {
+				CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n",
+				       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+				       cookie, conn->ksnc_msg.ksm_zc_cookies[1]);
+				ksocknal_new_packet(conn, 0);
+				ksocknal_close_conn_and_siblings(conn, -EPROTO);
+				return (rc);
+			}
+		}
+
+		if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) {
+			ksocknal_new_packet (conn, 0);
+			return 0;       /* NOOP is done and just return */
+		}
+
+		conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+		conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+		conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+		conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+		conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+		conn->ksnc_rx_niov = 1;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_nkiov = 0;
+
+		goto again;     /* read lnet header now */
+
+	case SOCKNAL_RX_LNET_HEADER:
+		/* unpack message header */
+		conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+		if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) {
+			/* Userspace peer */
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			/* Substitute process ID assigned at connection time */
+			lhdr->src_pid = cpu_to_le32(id->pid);
+			lhdr->src_nid = cpu_to_le64(id->nid);
+		}
+
+		conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+		ksocknal_conn_addref(conn);     /* ++ref while parsing */
+
+		rc = lnet_parse(conn->ksnc_peer->ksnp_ni,
+				&conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr,
+				conn->ksnc_peer->ksnp_id.nid, conn, 0);
+		if (rc < 0) {
+			/* I just received garbage: give up on this conn */
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			ksocknal_conn_decref(conn);
+			return (-EPROTO);
+		}
+
+		/* I'm racing with ksocknal_recv() */
+		LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+			 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+
+		if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+			return 0;
+
+		/* ksocknal_recv() got called */
+		goto again;
+
+	case SOCKNAL_RX_LNET_PAYLOAD:
+		/* payload all received */
+		rc = 0;
+
+		if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+		    conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+		    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) {
+			CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+			       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+			       conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+			rc = -EIO;
+		}
+
+		if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) {
+			LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x);
+
+			lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+			id   = &conn->ksnc_peer->ksnp_id;
+
+			rc = conn->ksnc_proto->pro_handle_zcreq(conn,
+					conn->ksnc_msg.ksm_zc_cookies[0],
+					*ksocknal_tunables.ksnd_nonblk_zcack ||
+					le64_to_cpu(lhdr->src_nid) != id->nid);
+		}
+
+		lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+		if (rc != 0) {
+			ksocknal_new_packet(conn, 0);
+			ksocknal_close_conn_and_siblings (conn, rc);
+			return (-EPROTO);
+		}
+		/* Fall through */
+
+	case SOCKNAL_RX_SLOP:
+		/* starting new packet? */
+		if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+			return 0;       /* come back later */
+		goto again;	     /* try to finish reading slop now */
+
+	default:
+		break;
+	}
+
+	/* Not Reached */
+	LBUG ();
+	return (-EINVAL);		       /* keep gcc happy */
+}
+
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	       unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+	       unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	ksock_conn_t  *conn = (ksock_conn_t *)private;
+	ksock_sched_t *sched = conn->ksnc_scheduler;
+
+	LASSERT (mlen <= rlen);
+	LASSERT (niov <= LNET_MAX_IOV);
+
+	conn->ksnc_cookie = msg;
+	conn->ksnc_rx_nob_wanted = mlen;
+	conn->ksnc_rx_nob_left   = rlen;
+
+	if (mlen == 0 || iov != NULL) {
+		conn->ksnc_rx_nkiov = 0;
+		conn->ksnc_rx_kiov = NULL;
+		conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+		conn->ksnc_rx_niov =
+			lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+					 niov, iov, offset, mlen);
+	} else {
+		conn->ksnc_rx_niov = 0;
+		conn->ksnc_rx_iov  = NULL;
+		conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+		conn->ksnc_rx_nkiov =
+			lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+					  niov, kiov, offset, mlen);
+	}
+
+	LASSERT (mlen ==
+		 lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+		 lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+	LASSERT (conn->ksnc_rx_scheduled);
+
+	spin_lock_bh(&sched->kss_lock);
+
+	switch (conn->ksnc_rx_state) {
+	case SOCKNAL_RX_PARSE_WAIT:
+		list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+		wake_up (&sched->kss_waitq);
+		LASSERT (conn->ksnc_rx_ready);
+		break;
+
+	case SOCKNAL_RX_PARSE:
+		/* scheduler hasn't noticed I'm parsing yet */
+		break;
+	}
+
+	conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_conn_decref(conn);
+	return 0;
+}
+
+static inline int
+ksocknal_sched_cansleep(ksock_sched_t *sched)
+{
+	int	   rc;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	rc = (!ksocknal_data.ksnd_shuttingdown &&
+	      list_empty(&sched->kss_rx_conns) &&
+	      list_empty(&sched->kss_tx_conns));
+
+	spin_unlock_bh(&sched->kss_lock);
+	return rc;
+}
+
+int ksocknal_scheduler(void *arg)
+{
+	struct ksock_sched_info	*info;
+	ksock_sched_t		*sched;
+	ksock_conn_t		*conn;
+	ksock_tx_t		*tx;
+	int			rc;
+	int			nloops = 0;
+	long			id = (long)arg;
+
+	info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)];
+	sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)];
+
+	cfs_block_allsigs();
+
+	rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt);
+	if (rc != 0) {
+		CERROR("Can't set CPT affinity to %d: %d\n",
+		       info->ksi_cpt, rc);
+	}
+
+	spin_lock_bh(&sched->kss_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		int did_something = 0;
+
+		/* Ensure I progress everything semi-fairly */
+
+		if (!list_empty (&sched->kss_rx_conns)) {
+			conn = list_entry(sched->kss_rx_conns.next,
+					      ksock_conn_t, ksnc_rx_list);
+			list_del(&conn->ksnc_rx_list);
+
+			LASSERT(conn->ksnc_rx_scheduled);
+			LASSERT(conn->ksnc_rx_ready);
+
+			/* clear rx_ready in case receive isn't complete.
+			 * Do it BEFORE we call process_recv, since
+			 * data_ready can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_rx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			rc = ksocknal_process_receive(conn);
+
+			spin_lock_bh(&sched->kss_lock);
+
+			/* I'm the only one that can clear this flag */
+			LASSERT(conn->ksnc_rx_scheduled);
+
+			/* Did process_receive get everything it wanted? */
+			if (rc == 0)
+				conn->ksnc_rx_ready = 1;
+
+			if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+				/* Conn blocked waiting for ksocknal_recv()
+				 * I change its state (under lock) to signal
+				 * it can be rescheduled */
+				conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
+			} else if (conn->ksnc_rx_ready) {
+				/* reschedule for rx */
+				list_add_tail (&conn->ksnc_rx_list,
+						   &sched->kss_rx_conns);
+			} else {
+				conn->ksnc_rx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+
+		if (!list_empty (&sched->kss_tx_conns)) {
+			LIST_HEAD    (zlist);
+
+			if (!list_empty(&sched->kss_zombie_noop_txs)) {
+				list_add(&zlist,
+					     &sched->kss_zombie_noop_txs);
+				list_del_init(&sched->kss_zombie_noop_txs);
+			}
+
+			conn = list_entry(sched->kss_tx_conns.next,
+					      ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			LASSERT(conn->ksnc_tx_ready);
+			LASSERT(!list_empty(&conn->ksnc_tx_queue));
+
+			tx = list_entry(conn->ksnc_tx_queue.next,
+					    ksock_tx_t, tx_list);
+
+			if (conn->ksnc_tx_carrier == tx)
+				ksocknal_next_tx_carrier(conn);
+
+			/* dequeue now so empty list => more to send */
+			list_del(&tx->tx_list);
+
+			/* Clear tx_ready in case send isn't complete.  Do
+			 * it BEFORE we call process_transmit, since
+			 * write_space can set it any time after we release
+			 * kss_lock. */
+			conn->ksnc_tx_ready = 0;
+			spin_unlock_bh(&sched->kss_lock);
+
+			if (!list_empty(&zlist)) {
+				/* free zombie noop txs, it's fast because
+				 * noop txs are just put in freelist */
+				ksocknal_txlist_done(NULL, &zlist, 0);
+			}
+
+			rc = ksocknal_process_transmit(conn, tx);
+
+			if (rc == -ENOMEM || rc == -EAGAIN) {
+				/* Incomplete send: replace tx on HEAD of tx_queue */
+				spin_lock_bh(&sched->kss_lock);
+				list_add(&tx->tx_list,
+					     &conn->ksnc_tx_queue);
+			} else {
+				/* Complete send; tx -ref */
+				ksocknal_tx_decref(tx);
+
+				spin_lock_bh(&sched->kss_lock);
+				/* assume space for more */
+				conn->ksnc_tx_ready = 1;
+			}
+
+			if (rc == -ENOMEM) {
+				/* Do nothing; after a short timeout, this
+				 * conn will be reposted on kss_tx_conns. */
+			} else if (conn->ksnc_tx_ready &&
+				   !list_empty (&conn->ksnc_tx_queue)) {
+				/* reschedule for tx */
+				list_add_tail (&conn->ksnc_tx_list,
+						   &sched->kss_tx_conns);
+			} else {
+				conn->ksnc_tx_scheduled = 0;
+				/* drop my ref */
+				ksocknal_conn_decref(conn);
+			}
+
+			did_something = 1;
+		}
+		if (!did_something ||	   /* nothing to do */
+		    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+			spin_unlock_bh(&sched->kss_lock);
+
+			nloops = 0;
+
+			if (!did_something) {   /* wait for something to do */
+				cfs_wait_event_interruptible_exclusive(
+					sched->kss_waitq,
+					!ksocknal_sched_cansleep(sched), rc);
+				LASSERT (rc == 0);
+			} else {
+				cond_resched();
+			}
+
+			spin_lock_bh(&sched->kss_lock);
+		}
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+	ksocknal_thread_fini();
+	return 0;
+}
+
+/*
+ * Add connection to kss_rx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_read_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched;
+	ENTRY;
+
+	sched = conn->ksnc_scheduler;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_rx_ready = 1;
+
+	if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+		list_add_tail(&conn->ksnc_rx_list,
+				  &sched->kss_rx_conns);
+		conn->ksnc_rx_scheduled = 1;
+		/* extra ref for scheduler */
+		ksocknal_conn_addref(conn);
+
+		wake_up (&sched->kss_waitq);
+	}
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+/*
+ * Add connection to kss_tx_conns of scheduler
+ * and wakeup the scheduler.
+ */
+void ksocknal_write_callback (ksock_conn_t *conn)
+{
+	ksock_sched_t *sched;
+	ENTRY;
+
+	sched = conn->ksnc_scheduler;
+
+	spin_lock_bh(&sched->kss_lock);
+
+	conn->ksnc_tx_ready = 1;
+
+	if (!conn->ksnc_tx_scheduled && // not being progressed
+	    !list_empty(&conn->ksnc_tx_queue)){//packets to send
+		list_add_tail (&conn->ksnc_tx_list,
+				   &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		/* extra ref for scheduler */
+		ksocknal_conn_addref(conn);
+
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	EXIT;
+}
+
+ksock_proto_t *
+ksocknal_parse_proto_version (ksock_hello_msg_t *hello)
+{
+	__u32   version = 0;
+
+	if (hello->kshm_magic == LNET_PROTO_MAGIC)
+		version = hello->kshm_version;
+	else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC))
+		version = __swab32(hello->kshm_version);
+
+	if (version != 0) {
+#if SOCKNAL_VERSION_DEBUG
+		if (*ksocknal_tunables.ksnd_protocol == 1)
+			return NULL;
+
+		if (*ksocknal_tunables.ksnd_protocol == 2 &&
+		    version == KSOCK_PROTO_V3)
+			return NULL;
+#endif
+		if (version == KSOCK_PROTO_V2)
+			return &ksocknal_protocol_v2x;
+
+		if (version == KSOCK_PROTO_V3)
+			return &ksocknal_protocol_v3x;
+
+		return NULL;
+	}
+
+	if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+		lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
+
+		CLASSERT (sizeof (lnet_magicversion_t) ==
+			  offsetof (ksock_hello_msg_t, kshm_src_nid));
+
+		if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+		    hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+			return &ksocknal_protocol_v1x;
+	}
+
+	return NULL;
+}
+
+int
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
+{
+	/* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+	ksock_net_t	 *net = (ksock_net_t *)ni->ni_data;
+
+	LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES);
+
+	/* rely on caller to hold a ref on socket so it wouldn't disappear */
+	LASSERT (conn->ksnc_proto != NULL);
+
+	hello->kshm_src_nid	 = ni->ni_nid;
+	hello->kshm_dst_nid	 = peer_nid;
+	hello->kshm_src_pid	 = the_lnet.ln_pid;
+
+	hello->kshm_src_incarnation = net->ksnn_incarnation;
+	hello->kshm_ctype	   = conn->ksnc_type;
+
+	return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+int
+ksocknal_invert_type(int type)
+{
+	switch (type)
+	{
+	case SOCKLND_CONN_ANY:
+	case SOCKLND_CONN_CONTROL:
+		return (type);
+	case SOCKLND_CONN_BULK_IN:
+		return SOCKLND_CONN_BULK_OUT;
+	case SOCKLND_CONN_BULK_OUT:
+		return SOCKLND_CONN_BULK_IN;
+	default:
+		return (SOCKLND_CONN_NONE);
+	}
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+		     __u64 *incarnation)
+{
+	/* Return < 0	fatal error
+	 *	0	  success
+	 *	EALREADY   lost connection race
+	 *	EPROTO     protocol version mismatch
+	 */
+	socket_t	*sock = conn->ksnc_sock;
+	int		  active = (conn->ksnc_proto != NULL);
+	int		  timeout;
+	int		  proto_match;
+	int		  rc;
+	ksock_proto_t       *proto;
+	lnet_process_id_t    recv_id;
+
+	/* socket type set on active connections - not set on passive */
+	LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+	timeout = active ? *ksocknal_tunables.ksnd_timeout :
+			    lnet_acceptor_timeout();
+
+	rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+	    hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+	    hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+		/* Unexpected magic! */
+		CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+			"%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+			LNET_PROTO_TCP_MAGIC,
+			HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	rc = libcfs_sock_read(sock, &hello->kshm_version,
+			      sizeof(hello->kshm_version), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	proto = ksocknal_parse_proto_version(hello);
+	if (proto == NULL) {
+		if (!active) {
+			/* unknown protocol from peer, tell peer my protocol */
+			conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			if (*ksocknal_tunables.ksnd_protocol == 2)
+				conn->ksnc_proto = &ksocknal_protocol_v2x;
+			else if (*ksocknal_tunables.ksnd_protocol == 1)
+				conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+		}
+
+		CERROR ("Unknown protocol version (%d.x expected)"
+			" from %u.%u.%u.%u\n",
+			conn->ksnc_proto->pro_version,
+			HIPQUAD(conn->ksnc_ipaddr));
+
+		return -EPROTO;
+	}
+
+	proto_match = (conn->ksnc_proto == proto);
+	conn->ksnc_proto = proto;
+
+	/* receive the rest of hello message anyway */
+	rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading or checking hello from from %u.%u.%u.%u\n",
+		       rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	*incarnation = hello->kshm_src_incarnation;
+
+	if (hello->kshm_src_nid == LNET_NID_ANY) {
+		CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY"
+		       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (!active &&
+	    conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+		/* Userspace NAL assigns peer process ID from socket */
+		recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+		recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+	} else {
+		recv_id.nid = hello->kshm_src_nid;
+		recv_id.pid = hello->kshm_src_pid;
+	}
+
+	if (!active) {
+		*peerid = recv_id;
+
+		/* peer determines type */
+		conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+		if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+			CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+				hello->kshm_ctype, libcfs_id2str(*peerid),
+				HIPQUAD(conn->ksnc_ipaddr));
+			return -EPROTO;
+		}
+
+		return 0;
+	}
+
+	if (peerid->pid != recv_id.pid ||
+	    peerid->nid != recv_id.nid) {
+		LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host"
+				   " %u.%u.%u.%u, but they claimed they were "
+				   "%s; please check your Lustre "
+				   "configuration.\n",
+				   libcfs_id2str(*peerid),
+				   HIPQUAD(conn->ksnc_ipaddr),
+				   libcfs_id2str(recv_id));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+		/* Possible protocol mismatch or I lost the connection race */
+		return proto_match ? EALREADY : EPROTO;
+	}
+
+	if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+		CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+			conn->ksnc_type, libcfs_id2str(*peerid),
+			HIPQUAD(conn->ksnc_ipaddr),
+			hello->kshm_ctype);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+int
+ksocknal_connect (ksock_route_t *route)
+{
+	LIST_HEAD    (zombies);
+	ksock_peer_t     *peer = route->ksnr_peer;
+	int	       type;
+	int	       wanted;
+	socket_t     *sock;
+	cfs_time_t	deadline;
+	int	       retry_later = 0;
+	int	       rc = 0;
+
+	deadline = cfs_time_add(cfs_time_current(),
+				cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	LASSERT (route->ksnr_scheduled);
+	LASSERT (!route->ksnr_connecting);
+
+	route->ksnr_connecting = 1;
+
+	for (;;) {
+		wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+		/* stop connecting if peer/route got closed under me, or
+		 * route got connected while queued */
+		if (peer->ksnp_closing || route->ksnr_deleted ||
+		    wanted == 0) {
+			retry_later = 0;
+			break;
+		}
+
+		/* reschedule if peer is connecting to me */
+		if (peer->ksnp_accepting > 0) {
+			CDEBUG(D_NET,
+			       "peer %s(%d) already connecting to me, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+			retry_later = 1;
+		}
+
+		if (retry_later) /* needs reschedule */
+			break;
+
+		if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+			type = SOCKLND_CONN_ANY;
+		} else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+			type = SOCKLND_CONN_CONTROL;
+		} else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+			type = SOCKLND_CONN_BULK_IN;
+		} else {
+			LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+			type = SOCKLND_CONN_BULK_OUT;
+		}
+
+		write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+		if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+			rc = -ETIMEDOUT;
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		rc = lnet_connect(&sock, peer->ksnp_id.nid,
+				  route->ksnr_myipaddr,
+				  route->ksnr_ipaddr, route->ksnr_port);
+		if (rc != 0)
+			goto failed;
+
+		rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+		if (rc < 0) {
+			lnet_connect_console_error(rc, peer->ksnp_id.nid,
+						   route->ksnr_ipaddr,
+						   route->ksnr_port);
+			goto failed;
+		}
+
+		/* A +ve RC means I have to retry because I lost the connection
+		 * race or I have to renegotiate protocol version */
+		retry_later = (rc != 0);
+		if (retry_later)
+			CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+			       libcfs_nid2str(peer->ksnp_id.nid));
+
+		write_lock_bh(&ksocknal_data.ksnd_global_lock);
+	}
+
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	if (retry_later) {
+		/* re-queue for attention; this frees me up to handle
+		 * the peer's incoming connection request */
+
+		if (rc == EALREADY ||
+		    (rc == 0 && peer->ksnp_accepting > 0)) {
+			/* We want to introduce a delay before next
+			 * attempt to connect if we lost conn race,
+			 * but the race is resolved quickly usually,
+			 * so min_reconnectms should be good heuristic */
+			route->ksnr_retry_interval =
+				cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000;
+			route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+							   route->ksnr_retry_interval);
+		}
+
+		ksocknal_launch_connection_locked(route);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return retry_later;
+
+ failed:
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	/* This is a retry rather than a new connection */
+	route->ksnr_retry_interval *= 2;
+	route->ksnr_retry_interval =
+		MAX(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+	route->ksnr_retry_interval =
+		MIN(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+	LASSERT (route->ksnr_retry_interval != 0);
+	route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+					   route->ksnr_retry_interval);
+
+	if (!list_empty(&peer->ksnp_tx_queue) &&
+	    peer->ksnp_accepting == 0 &&
+	    ksocknal_find_connecting_route_locked(peer) == NULL) {
+		ksock_conn_t *conn;
+
+		/* ksnp_tx_queue is queued on a conn on successful
+		 * connection for V1.x and V2.x */
+		if (!list_empty (&peer->ksnp_conns)) {
+			conn = list_entry(peer->ksnp_conns.next,
+					      ksock_conn_t, ksnc_list);
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+		}
+
+		/* take all the blocked packets while I've got the lock and
+		 * complete below... */
+		list_splice_init(&peer->ksnp_tx_queue, &zombies);
+	}
+
+#if 0	   /* irrelevent with only eager routes */
+	if (!route->ksnr_deleted) {
+		/* make this route least-favourite for re-selection */
+		list_del(&route->ksnr_list);
+		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+	}
+#endif
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_peer_failed(peer);
+	ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+	return 0;
+}
+
+/*
+ * check whether we need to create more connds.
+ * It will try to create new thread if it's necessary, @timeout can
+ * be updated if failed to create, so caller wouldn't keep try while
+ * running out of resource.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+	char name[16];
+	int rc;
+	int total = ksocknal_data.ksnd_connd_starting +
+		    ksocknal_data.ksnd_connd_running;
+
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+		/* still in initializing */
+		return 0;
+	}
+
+	if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+	    total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+		/* can't create more connd, or still have enough
+		 * threads to handle more connecting */
+		return 0;
+	}
+
+	if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+		/* no pending connecting request */
+		return 0;
+	}
+
+	if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+		/* may run out of resource, retry later */
+		*timeout = cfs_time_seconds(1);
+		return 0;
+	}
+
+	if (ksocknal_data.ksnd_connd_starting > 0) {
+		/* serialize starting to avoid flood */
+		return 0;
+	}
+
+	ksocknal_data.ksnd_connd_starting_stamp = sec;
+	ksocknal_data.ksnd_connd_starting++;
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	/* NB: total is the next id */
+	snprintf(name, sizeof(name), "socknal_cd%02d", total);
+	rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	if (rc == 0)
+		return 1;
+
+	/* we tried ... */
+	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+	ksocknal_data.ksnd_connd_starting--;
+	ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec();
+
+	return 1;
+}
+
+/*
+ * check whether current thread can exit, it will return 1 if there are too
+ * many threads and no creating in past 120 seconds.
+ * Also, this function may update @timeout to make caller come back
+ * again to recheck these conditions.
+ */
+static int
+ksocknal_connd_check_stop(long sec, long *timeout)
+{
+	int val;
+
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+		/* still in initializing */
+		return 0;
+	}
+
+	if (ksocknal_data.ksnd_connd_starting > 0) {
+		/* in progress of starting new thread */
+		return 0;
+	}
+
+	if (ksocknal_data.ksnd_connd_running <=
+	    *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
+		return 0;
+	}
+
+	/* created thread in past 120 seconds? */
+	val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
+		    SOCKNAL_CONND_TIMEOUT - sec);
+
+	*timeout = (val > 0) ? cfs_time_seconds(val) :
+			       cfs_time_seconds(SOCKNAL_CONND_TIMEOUT);
+	if (val > 0)
+		return 0;
+
+	/* no creating in past 120 seconds */
+
+	return ksocknal_data.ksnd_connd_running >
+	       ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV;
+}
+
+/* Go through connd_routes queue looking for a route that we can process
+ * right now, @timeout_p can be updated if we need to come back later */
+static ksock_route_t *
+ksocknal_connd_get_route_locked(signed long *timeout_p)
+{
+	ksock_route_t *route;
+	cfs_time_t     now;
+
+	now = cfs_time_current();
+
+	/* connd_routes can contain both pending and ordinary routes */
+	list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes,
+				 ksnr_connd_list) {
+
+		if (route->ksnr_retry_interval == 0 ||
+		    cfs_time_aftereq(now, route->ksnr_timeout))
+			return route;
+
+		if (*timeout_p == MAX_SCHEDULE_TIMEOUT ||
+		    (int)*timeout_p > (int)(route->ksnr_timeout - now))
+			*timeout_p = (int)(route->ksnr_timeout - now);
+	}
+
+	return NULL;
+}
+
+int
+ksocknal_connd (void *arg)
+{
+	spinlock_t    *connd_lock = &ksocknal_data.ksnd_connd_lock;
+	ksock_connreq_t   *cr;
+	wait_queue_t     wait;
+	int		nloops = 0;
+	int		cons_retry = 0;
+
+	cfs_block_allsigs ();
+
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(connd_lock);
+
+	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+	ksocknal_data.ksnd_connd_starting--;
+	ksocknal_data.ksnd_connd_running++;
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+		ksock_route_t *route = NULL;
+		long sec = cfs_time_current_sec();
+		long timeout = MAX_SCHEDULE_TIMEOUT;
+		int  dropped_lock = 0;
+
+		if (ksocknal_connd_check_stop(sec, &timeout)) {
+			/* wakeup another one to check stop */
+			wake_up(&ksocknal_data.ksnd_connd_waitq);
+			break;
+		}
+
+		if (ksocknal_connd_check_start(sec, &timeout)) {
+			/* created new thread */
+			dropped_lock = 1;
+		}
+
+		if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+			/* Connection accepted by the listener */
+			cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \
+					    next, ksock_connreq_t, ksncr_list);
+
+			list_del(&cr->ksncr_list);
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			ksocknal_create_conn(cr->ksncr_ni, NULL,
+					     cr->ksncr_sock, SOCKLND_CONN_NONE);
+			lnet_ni_decref(cr->ksncr_ni);
+			LIBCFS_FREE(cr, sizeof(*cr));
+
+			spin_lock_bh(connd_lock);
+		}
+
+		/* Only handle an outgoing connection request if there
+		 * is a thread left to handle incoming connections and
+		 * create new connd */
+		if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+		    ksocknal_data.ksnd_connd_running) {
+			route = ksocknal_connd_get_route_locked(&timeout);
+		}
+		if (route != NULL) {
+			list_del (&route->ksnr_connd_list);
+			ksocknal_data.ksnd_connd_connecting++;
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			if (ksocknal_connect(route)) {
+				/* consecutive retry */
+				if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+					CWARN("massive consecutive "
+					      "re-connecting to %u.%u.%u.%u\n",
+					      HIPQUAD(route->ksnr_ipaddr));
+					cons_retry = 0;
+				}
+			} else {
+				cons_retry = 0;
+			}
+
+			ksocknal_route_decref(route);
+
+			spin_lock_bh(connd_lock);
+			ksocknal_data.ksnd_connd_connecting--;
+		}
+
+		if (dropped_lock) {
+			if (++nloops < SOCKNAL_RESCHED)
+				continue;
+			spin_unlock_bh(connd_lock);
+			nloops = 0;
+			cond_resched();
+			spin_lock_bh(connd_lock);
+			continue;
+		}
+
+		/* Nothing to do for 'timeout'  */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_unlock_bh(connd_lock);
+
+		nloops = 0;
+		waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout);
+
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_lock_bh(connd_lock);
+	}
+	ksocknal_data.ksnd_connd_running--;
+	spin_unlock_bh(connd_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
+
+ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+	/* We're called with a shared lock on ksnd_global_lock */
+	ksock_conn_t      *conn;
+	struct list_head	*ctmp;
+
+	list_for_each (ctmp, &peer->ksnp_conns) {
+		int     error;
+		conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+		/* Don't need the {get,put}connsock dance to deref ksnc_sock */
+		LASSERT (!conn->ksnc_closing);
+
+		/* SOCK_ERROR will reset error code of socket in
+		 * some platform (like Darwin8.x) */
+		error = cfs_sock_error(conn->ksnc_sock);
+		if (error != 0) {
+			ksocknal_conn_addref(conn);
+
+			switch (error) {
+			case ECONNRESET:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) was reset; "
+					"it may have rebooted.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			case ETIMEDOUT:
+				CNETERR("A connection with %s "
+					"(%u.%u.%u.%u:%d) timed out; the "
+					"network or node may be down.\n",
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			default:
+				CNETERR("An unexpected network error %d "
+					"occurred with %s "
+					"(%u.%u.%u.%u:%d\n", error,
+					libcfs_id2str(peer->ksnp_id),
+					HIPQUAD(conn->ksnc_ipaddr),
+					conn->ksnc_port);
+				break;
+			}
+
+			return (conn);
+		}
+
+		if (conn->ksnc_rx_started &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_rx_deadline)) {
+			/* Timed out incomplete incoming message */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), "
+				"state %d wanted %d left %d\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port,
+				conn->ksnc_rx_state,
+				conn->ksnc_rx_nob_wanted,
+				conn->ksnc_rx_nob_left);
+			return (conn);
+		}
+
+		if ((!list_empty(&conn->ksnc_tx_queue) ||
+		     cfs_sock_wmem_queued(conn->ksnc_sock) != 0) &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_tx_deadline)) {
+			/* Timed out messages queued for sending or
+			 * buffered in the socket's send buffer */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d) "
+				"the network or that node may be down.\n",
+				libcfs_id2str(peer->ksnp_id),
+				HIPQUAD(conn->ksnc_ipaddr),
+				conn->ksnc_port);
+			return (conn);
+		}
+	}
+
+	return (NULL);
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+	ksock_tx_t	*tx;
+	LIST_HEAD      (stale_txs);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	while (!list_empty (&peer->ksnp_tx_queue)) {
+		tx = list_entry (peer->ksnp_tx_queue.next,
+				     ksock_tx_t, tx_list);
+
+		if (!cfs_time_aftereq(cfs_time_current(),
+				      tx->tx_deadline))
+			break;
+
+		list_del (&tx->tx_list);
+		list_add_tail (&tx->tx_list, &stale_txs);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+	ksock_sched_t  *sched;
+	ksock_conn_t   *conn;
+	ksock_tx_t     *tx;
+
+	if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+		return 0;
+
+	if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+		return 0;
+
+	if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+	    cfs_time_before(cfs_time_current(),
+			    cfs_time_add(peer->ksnp_last_alive,
+					 cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+		return 0;
+
+	if (cfs_time_before(cfs_time_current(),
+			    peer->ksnp_send_keepalive))
+		return 0;
+
+	/* retry 10 secs later, so we wouldn't put pressure
+	 * on this peer if we failed to send keepalive this time */
+	peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, 1);
+	if (conn != NULL) {
+		sched = conn->ksnc_scheduler;
+
+		spin_lock_bh(&sched->kss_lock);
+		if (!list_empty(&conn->ksnc_tx_queue)) {
+			spin_unlock_bh(&sched->kss_lock);
+			/* there is an queued ACK, don't need keepalive */
+			return 0;
+		}
+
+		spin_unlock_bh(&sched->kss_lock);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* cookie = 1 is reserved for keepalive PING */
+	tx = ksocknal_alloc_tx_noop(1, 1);
+	if (tx == NULL) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		return -ENOMEM;
+	}
+
+	if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		return 1;
+	}
+
+	ksocknal_free_tx(tx);
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	return -EIO;
+}
+
+
+void
+ksocknal_check_peer_timeouts (int idx)
+{
+	struct list_head       *peers = &ksocknal_data.ksnd_peers[idx];
+	ksock_peer_t     *peer;
+	ksock_conn_t     *conn;
+	ksock_tx_t       *tx;
+
+ again:
+	/* NB. We expect to have a look at all the peers and not find any
+	 * connections to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	list_for_each_entry(peer, peers, ksnp_list) {
+		cfs_time_t  deadline = 0;
+		int	 resid = 0;
+		int	 n     = 0;
+
+		if (ksocknal_send_keepalive_locked(peer) != 0) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			goto again;
+		}
+
+		conn = ksocknal_find_timed_out_conn (peer);
+
+		if (conn != NULL) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+			/* NB we won't find this one again, but we can't
+			 * just proceed with the next peer, since we dropped
+			 * ksnd_global_lock and it might be dead already! */
+			ksocknal_conn_decref(conn);
+			goto again;
+		}
+
+		/* we can't process stale txs right here because we're
+		 * holding only shared lock */
+		if (!list_empty (&peer->ksnp_tx_queue)) {
+			ksock_tx_t *tx =
+				list_entry (peer->ksnp_tx_queue.next,
+						ksock_tx_t, tx_list);
+
+			if (cfs_time_aftereq(cfs_time_current(),
+					     tx->tx_deadline)) {
+
+				ksocknal_peer_addref(peer);
+				read_unlock(&ksocknal_data.ksnd_global_lock);
+
+				ksocknal_flush_stale_txs(peer);
+
+				ksocknal_peer_decref(peer);
+				goto again;
+			}
+		}
+
+		if (list_empty(&peer->ksnp_zc_req_list))
+			continue;
+
+		spin_lock(&peer->ksnp_lock);
+		list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
+			if (!cfs_time_aftereq(cfs_time_current(),
+					      tx->tx_deadline))
+				break;
+			/* ignore the TX if connection is being closed */
+			if (tx->tx_conn->ksnc_closing)
+				continue;
+			n++;
+		}
+
+		if (n == 0) {
+			spin_unlock(&peer->ksnp_lock);
+			continue;
+		}
+
+		tx = list_entry(peer->ksnp_zc_req_list.next,
+				    ksock_tx_t, tx_zc_list);
+		deadline = tx->tx_deadline;
+		resid    = tx->tx_resid;
+		conn     = tx->tx_conn;
+		ksocknal_conn_addref(conn);
+
+		spin_unlock(&peer->ksnp_lock);
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+
+		CERROR("Total %d stale ZC_REQs for peer %s detected; the "
+		       "oldest(%p) timed out %ld secs ago, "
+		       "resid: %d, wmem: %d\n",
+		       n, libcfs_nid2str(peer->ksnp_id.nid), tx,
+		       cfs_duration_sec(cfs_time_current() - deadline),
+		       resid, cfs_sock_wmem_queued(conn->ksnc_sock));
+
+		ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+		ksocknal_conn_decref(conn);
+		goto again;
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+int
+ksocknal_reaper (void *arg)
+{
+	wait_queue_t     wait;
+	ksock_conn_t      *conn;
+	ksock_sched_t     *sched;
+	struct list_head	 enomem_conns;
+	int		nenomem_conns;
+	cfs_duration_t     timeout;
+	int		i;
+	int		peer_index = 0;
+	cfs_time_t	 deadline = cfs_time_current();
+
+	cfs_block_allsigs ();
+
+	INIT_LIST_HEAD(&enomem_conns);
+	init_waitqueue_entry_current (&wait);
+
+	spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	while (!ksocknal_data.ksnd_shuttingdown) {
+
+		if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) {
+			conn = list_entry (ksocknal_data. \
+					       ksnd_deathrow_conns.next,
+					       ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_terminate_conn(conn);
+			ksocknal_conn_decref(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;
+		}
+
+		if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) {
+			conn = list_entry (ksocknal_data.ksnd_zombie_conns.\
+					       next, ksock_conn_t, ksnc_list);
+			list_del (&conn->ksnc_list);
+
+			spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+			ksocknal_destroy_conn(conn);
+
+			spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+			continue;
+		}
+
+		if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) {
+			list_add(&enomem_conns,
+				     &ksocknal_data.ksnd_enomem_conns);
+			list_del_init(&ksocknal_data.ksnd_enomem_conns);
+		}
+
+		spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+		/* reschedule all the connections that stalled with ENOMEM... */
+		nenomem_conns = 0;
+		while (!list_empty (&enomem_conns)) {
+			conn = list_entry (enomem_conns.next,
+					       ksock_conn_t, ksnc_tx_list);
+			list_del (&conn->ksnc_tx_list);
+
+			sched = conn->ksnc_scheduler;
+
+			spin_lock_bh(&sched->kss_lock);
+
+			LASSERT(conn->ksnc_tx_scheduled);
+			conn->ksnc_tx_ready = 1;
+			list_add_tail(&conn->ksnc_tx_list,
+					  &sched->kss_tx_conns);
+			wake_up(&sched->kss_waitq);
+
+			spin_unlock_bh(&sched->kss_lock);
+			nenomem_conns++;
+		}
+
+		/* careful with the jiffy wrap... */
+		while ((timeout = cfs_time_sub(deadline,
+					       cfs_time_current())) <= 0) {
+			const int n = 4;
+			const int p = 1;
+			int       chunk = ksocknal_data.ksnd_peer_hash_size;
+
+			/* Time to check for timeouts on a few more peers: I do
+			 * checks every 'p' seconds on a proportion of the peer
+			 * table and I need to check every connection 'n' times
+			 * within a timeout interval, to ensure I detect a
+			 * timeout on any connection within (n+1)/n times the
+			 * timeout interval. */
+
+			if (*ksocknal_tunables.ksnd_timeout > n * p)
+				chunk = (chunk * n * p) /
+					*ksocknal_tunables.ksnd_timeout;
+			if (chunk == 0)
+				chunk = 1;
+
+			for (i = 0; i < chunk; i++) {
+				ksocknal_check_peer_timeouts (peer_index);
+				peer_index = (peer_index + 1) %
+					     ksocknal_data.ksnd_peer_hash_size;
+			}
+
+			deadline = cfs_time_add(deadline, cfs_time_seconds(p));
+		}
+
+		if (nenomem_conns != 0) {
+			/* Reduce my timeout if I rescheduled ENOMEM conns.
+			 * This also prevents me getting woken immediately
+			 * if any go back on my enomem list. */
+			timeout = SOCKNAL_ENOMEM_RETRY;
+		}
+		ksocknal_data.ksnd_reaper_waketime =
+			cfs_time_add(cfs_time_current(), timeout);
+
+		set_current_state (TASK_INTERRUPTIBLE);
+		add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		if (!ksocknal_data.ksnd_shuttingdown &&
+		    list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
+		    list_empty (&ksocknal_data.ksnd_zombie_conns))
+			waitq_timedwait (&wait, TASK_INTERRUPTIBLE,
+					     timeout);
+
+		set_current_state (TASK_RUNNING);
+		remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
+
+		spin_lock_bh(&ksocknal_data.ksnd_reaper_lock);
+	}
+
+	spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
new file mode 100644
index 000000000000..3e08fe2d1489
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c
@@ -0,0 +1,1088 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include "socklnd.h"
+
+# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+
+
+enum {
+	SOCKLND_TIMEOUT = 1,
+	SOCKLND_CREDITS,
+	SOCKLND_PEER_TXCREDITS,
+	SOCKLND_PEER_RTRCREDITS,
+	SOCKLND_PEER_TIMEOUT,
+	SOCKLND_NCONNDS,
+	SOCKLND_RECONNECTS_MIN,
+	SOCKLND_RECONNECTS_MAX,
+	SOCKLND_EAGER_ACK,
+	SOCKLND_ZERO_COPY,
+	SOCKLND_TYPED,
+	SOCKLND_BULK_MIN,
+	SOCKLND_RX_BUFFER_SIZE,
+	SOCKLND_TX_BUFFER_SIZE,
+	SOCKLND_NAGLE,
+	SOCKLND_IRQ_AFFINITY,
+	SOCKLND_ROUND_ROBIN,
+	SOCKLND_KEEPALIVE,
+	SOCKLND_KEEPALIVE_IDLE,
+	SOCKLND_KEEPALIVE_COUNT,
+	SOCKLND_KEEPALIVE_INTVL,
+	SOCKLND_BACKOFF_INIT,
+	SOCKLND_BACKOFF_MAX,
+	SOCKLND_PROTOCOL,
+	SOCKLND_ZERO_COPY_RECV,
+	SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS
+};
+
+static ctl_table_t ksocknal_ctl_table[] = {
+	{
+		.ctl_name = SOCKLND_TIMEOUT,
+		.procname = "timeout",
+		.data     = &ksocknal_tunables.ksnd_timeout,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_CREDITS,
+		.procname = "credits",
+		.data     = &ksocknal_tunables.ksnd_credits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	 {
+		.ctl_name = SOCKLND_PEER_TXCREDITS,
+		.procname = "peer_credits",
+		.data     = &ksocknal_tunables.ksnd_peertxcredits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	 {
+		.ctl_name = SOCKLND_PEER_RTRCREDITS,
+		.procname = "peer_buffer_credits",
+		.data     = &ksocknal_tunables.ksnd_peerrtrcredits,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_PEER_TIMEOUT,
+		.procname = "peer_timeout",
+		.data     = &ksocknal_tunables.ksnd_peertimeout,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NCONNDS,
+		.procname = "nconnds",
+		.data     = &ksocknal_tunables.ksnd_nconnds,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MIN,
+		.procname = "min_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_min_reconnectms,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RECONNECTS_MAX,
+		.procname = "max_reconnectms",
+		.data     = &ksocknal_tunables.ksnd_max_reconnectms,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_EAGER_ACK,
+		.procname = "eager_ack",
+		.data     = &ksocknal_tunables.ksnd_eager_ack,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY,
+		.procname = "zero_copy",
+		.data     = &ksocknal_tunables.ksnd_zc_min_payload,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ZERO_COPY_RECV,
+		.procname = "zero_copy_recv",
+		.data     = &ksocknal_tunables.ksnd_zc_recv,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+
+	{
+		.ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS,
+		.procname = "zero_copy_recv",
+		.data     = &ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TYPED,
+		.procname = "typed",
+		.data     = &ksocknal_tunables.ksnd_typed_conns,
+		.maxlen   = sizeof (int),
+		.mode     = 0444,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_BULK_MIN,
+		.procname = "min_bulk",
+		.data     = &ksocknal_tunables.ksnd_min_bulk,
+		.maxlen   = sizeof (int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_RX_BUFFER_SIZE,
+		.procname = "rx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_rx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_TX_BUFFER_SIZE,
+		.procname = "tx_buffer_size",
+		.data     = &ksocknal_tunables.ksnd_tx_buffer_size,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_NAGLE,
+		.procname = "nagle",
+		.data     = &ksocknal_tunables.ksnd_nagle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_ROUND_ROBIN,
+		.procname = "round_robin",
+		.data     = &ksocknal_tunables.ksnd_round_robin,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE,
+		.procname = "keepalive",
+		.data     = &ksocknal_tunables.ksnd_keepalive,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_IDLE,
+		.procname = "keepalive_idle",
+		.data     = &ksocknal_tunables.ksnd_keepalive_idle,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_COUNT,
+		.procname = "keepalive_count",
+		.data     = &ksocknal_tunables.ksnd_keepalive_count,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+	{
+		.ctl_name = SOCKLND_KEEPALIVE_INTVL,
+		.procname = "keepalive_intvl",
+		.data     = &ksocknal_tunables.ksnd_keepalive_intvl,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#if SOCKNAL_VERSION_DEBUG
+	{
+		.ctl_name = SOCKLND_PROTOCOL,
+		.procname = "protocol",
+		.data     = &ksocknal_tunables.ksnd_protocol,
+		.maxlen   = sizeof(int),
+		.mode     = 0644,
+		.proc_handler = &proc_dointvec,
+		.strategy = &sysctl_intvec,
+	},
+#endif
+	{0}
+};
+
+
+ctl_table_t ksocknal_top_ctl_table[] = {
+	{
+		.ctl_name = CTL_SOCKLND,
+		.procname = "socknal",
+		.data     = NULL,
+		.maxlen   = 0,
+		.mode     = 0555,
+		.child    = ksocknal_ctl_table
+	},
+	{ 0 }
+};
+
+int
+ksocknal_lib_tunables_init ()
+{
+	if (!*ksocknal_tunables.ksnd_typed_conns) {
+		int rc = -EINVAL;
+#if SOCKNAL_VERSION_DEBUG
+		if (*ksocknal_tunables.ksnd_protocol < 3)
+			rc = 0;
+#endif
+		if (rc != 0) {
+			CERROR("Protocol V3.x MUST have typed connections\n");
+			return rc;
+		}
+	}
+
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+	if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+		*ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
+	ksocknal_tunables.ksnd_sysctl =
+		cfs_register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+	if (ksocknal_tunables.ksnd_sysctl == NULL)
+		CWARN("Can't setup /proc tunables\n");
+
+	return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+	if (ksocknal_tunables.ksnd_sysctl != NULL)
+		unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+	return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+	int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+				     &conn->ksnc_ipaddr,
+				     &conn->ksnc_port);
+
+	/* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+	LASSERT (!conn->ksnc_closing);
+
+	if (rc != 0) {
+		CERROR ("Error %d getting sock peer IP\n", rc);
+		return rc;
+	}
+
+	rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+				 &conn->ksnc_myipaddr, NULL);
+	if (rc != 0) {
+		CERROR ("Error %d getting sock local IP\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+	int  caps = conn->ksnc_sock->sk->sk_route_caps;
+
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+		return 0;
+
+	/* ZC if the socket supports scatter/gather and doesn't need software
+	 * checksums */
+	return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	int	    nob;
+	int	    rc;
+
+	if (*ksocknal_tunables.ksnd_enable_csum	&& /* checksum enabled */
+	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+	    tx->tx_nob == tx->tx_resid		 && /* frist sending    */
+	    tx->tx_msg.ksm_csum == 0)		     /* not checksummed  */
+		ksocknal_lib_csum_tx(tx);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+
+	{
+#if SOCKNAL_SINGLE_FRAG_TX
+		struct iovec    scratch;
+		struct iovec   *scratchiov = &scratch;
+		unsigned int    niov = 1;
+#else
+		struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int    niov = tx->tx_niov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t oldmm = get_fs();
+		int  i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i] = tx->tx_iov[i];
+			nob += scratchiov[i].iov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+
+		set_fs (KERNEL_DS);
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+	}
+	return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	lnet_kiov_t   *kiov = tx->tx_kiov;
+	int	    rc;
+	int	    nob;
+
+	/* Not NOOP message */
+	LASSERT (tx->tx_lnetmsg != NULL);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+		/* Zero copy is enabled */
+		struct sock   *sk = sock->sk;
+		struct page   *page = kiov->kiov_page;
+		int	    offset = kiov->kiov_offset;
+		int	    fragsize = kiov->kiov_len;
+		int	    msgflg = MSG_DONTWAIT;
+
+		CDEBUG(D_NET, "page %p + offset %x for %d\n",
+			       page, offset, kiov->kiov_len);
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    fragsize < tx->tx_resid)
+			msgflg |= MSG_MORE;
+
+		if (sk->sk_prot->sendpage != NULL) {
+			rc = sk->sk_prot->sendpage(sk, page,
+						   offset, fragsize, msgflg);
+		} else {
+			rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+					      msgflg);
+		}
+	} else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+		struct iovec  scratch;
+		struct iovec *scratchiov = &scratch;
+		unsigned int  niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+		struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int  niov = tx->tx_nkiov;
+#endif
+		struct msghdr msg = {
+			.msg_name       = NULL,
+			.msg_namelen    = 0,
+			.msg_iov	= scratchiov,
+			.msg_iovlen     = niov,
+			.msg_control    = NULL,
+			.msg_controllen = 0,
+			.msg_flags      = MSG_DONTWAIT
+		};
+		mm_segment_t  oldmm = get_fs();
+		int	   i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+						 kiov[i].kiov_offset;
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+
+		set_fs (KERNEL_DS);
+		rc = sock_sendmsg(sock, &msg, nob);
+		set_fs (oldmm);
+
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+	return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+	int	    opt = 1;
+	mm_segment_t   oldmm = get_fs();
+	struct socket *sock = conn->ksnc_sock;
+
+	/* Remind the socket to ACK eagerly.  If I don't, the socket might
+	 * think I'm about to send something it could piggy-back the ACK
+	 * on, introducing delay in completing zero-copy sends in my
+	 * peer. */
+
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+			       (char *)&opt, sizeof (opt));
+	set_fs(oldmm);
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+	struct iovec  scratch;
+	struct iovec *scratchiov = &scratch;
+	unsigned int  niov = 1;
+#else
+	struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+	struct iovec *iov = conn->ksnc_rx_iov;
+	struct msghdr msg = {
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_iovlen     = niov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;
+	int	  i;
+	int	  rc;
+	int	  fragnob;
+	int	  sum;
+	__u32	saved_csum;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	LASSERT (niov > 0);
+
+	for (nob = i = 0; i < niov; i++) {
+		scratchiov[i] = iov[i];
+		nob += scratchiov[i].iov_len;
+	}
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean..........................^ */
+	set_fs (oldmm);
+
+	saved_csum = 0;
+	if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+		saved_csum = conn->ksnc_msg.ksm_csum;
+		conn->ksnc_msg.ksm_csum = 0;
+	}
+
+	if (saved_csum != 0) {
+		/* accumulate checksum */
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+			LASSERT (i < niov);
+
+			fragnob = iov[i].iov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   iov[i].iov_base, fragnob);
+		}
+		conn->ksnc_msg.ksm_csum = saved_csum;
+	}
+
+	return rc;
+}
+
+static void
+ksocknal_lib_kiov_vunmap(void *addr)
+{
+	if (addr == NULL)
+		return;
+
+	vunmap(addr);
+}
+
+static void *
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
+		       struct iovec *iov, struct page **pages)
+{
+	void	     *addr;
+	int	       nob;
+	int	       i;
+
+	if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
+		return NULL;
+
+	LASSERT (niov <= LNET_MAX_IOV);
+
+	if (niov < 2 ||
+	    niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
+		return NULL;
+
+	for (nob = i = 0; i < niov; i++) {
+		if ((kiov[i].kiov_offset != 0 && i > 0) ||
+		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+			return NULL;
+
+		pages[i] = kiov[i].kiov_page;
+		nob += kiov[i].kiov_len;
+	}
+
+	addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+	if (addr == NULL)
+		return NULL;
+
+	iov->iov_base = addr + kiov[0].kiov_offset;
+	iov->iov_len = nob;
+
+	return addr;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+	struct iovec   scratch;
+	struct iovec  *scratchiov = &scratch;
+	struct page  **pages      = NULL;
+	unsigned int   niov       = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..."
+#endif
+	struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+	struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+	unsigned int   niov       = conn->ksnc_rx_nkiov;
+#endif
+	lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+	struct msghdr msg = {
+		.msg_name       = NULL,
+		.msg_namelen    = 0,
+		.msg_iov	= scratchiov,
+		.msg_control    = NULL,
+		.msg_controllen = 0,
+		.msg_flags      = 0
+	};
+	mm_segment_t oldmm = get_fs();
+	int	  nob;
+	int	  i;
+	int	  rc;
+	void	*base;
+	void	*addr;
+	int	  sum;
+	int	  fragnob;
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+		nob = scratchiov[0].iov_len;
+		msg.msg_iovlen = 1;
+
+	} else {
+		for (nob = i = 0; i < niov; i++) {
+			nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+			scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+						 kiov[i].kiov_offset;
+		}
+		msg.msg_iovlen = niov;
+	}
+
+	LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+	set_fs (KERNEL_DS);
+	rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
+	/* NB this is just a boolean.......................^ */
+	set_fs (oldmm);
+
+	if (conn->ksnc_msg.ksm_csum != 0) {
+		for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+			LASSERT (i < niov);
+
+			/* Dang! have to kmap again because I have nowhere to stash the
+			 * mapped address.  But by doing it while the page is still
+			 * mapped, the kernel just bumps the map count and returns me
+			 * the address it stashed. */
+			base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+			fragnob = kiov[i].kiov_len;
+			if (fragnob > sum)
+				fragnob = sum;
+
+			conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+							   base, fragnob);
+
+			kunmap(kiov[i].kiov_page);
+		}
+	}
+
+	if (addr != NULL) {
+		ksocknal_lib_kiov_vunmap(addr);
+	} else {
+		for (i = 0; i < niov; i++)
+			kunmap(kiov[i].kiov_page);
+	}
+
+	return (rc);
+}
+
+void
+ksocknal_lib_csum_tx(ksock_tx_t *tx)
+{
+	int	  i;
+	__u32	csum;
+	void	*base;
+
+	LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+	LASSERT(tx->tx_conn != NULL);
+	LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+	tx->tx_msg.ksm_csum = 0;
+
+	csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+			     tx->tx_iov[0].iov_len);
+
+	if (tx->tx_kiov != NULL) {
+		for (i = 0; i < tx->tx_nkiov; i++) {
+			base = kmap(tx->tx_kiov[i].kiov_page) +
+			       tx->tx_kiov[i].kiov_offset;
+
+			csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
+
+			kunmap(tx->tx_kiov[i].kiov_page);
+		}
+	} else {
+		for (i = 1; i < tx->tx_niov; i++)
+			csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+					     tx->tx_iov[i].iov_len);
+	}
+
+	if (*ksocknal_tunables.ksnd_inject_csum_error) {
+		csum++;
+		*ksocknal_tunables.ksnd_inject_csum_error = 0;
+	}
+
+	tx->tx_msg.ksm_csum = csum;
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+	mm_segment_t   oldmm = get_fs ();
+	struct socket *sock = conn->ksnc_sock;
+	int	    len;
+	int	    rc;
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) {
+		LASSERT (conn->ksnc_closing);
+		*txmem = *rxmem = *nagle = 0;
+		return (-ESHUTDOWN);
+	}
+
+	rc = libcfs_sock_getbuf(sock, txmem, rxmem);
+	if (rc == 0) {
+		len = sizeof(*nagle);
+		set_fs(KERNEL_DS);
+		rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
+					   (char *)nagle, &len);
+		set_fs(oldmm);
+	}
+
+	ksocknal_connsock_decref(conn);
+
+	if (rc == 0)
+		*nagle = !*nagle;
+	else
+		*txmem = *rxmem = *nagle = 0;
+
+	return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+	mm_segment_t    oldmm = get_fs ();
+	int	     rc;
+	int	     option;
+	int	     keep_idle;
+	int	     keep_intvl;
+	int	     keep_count;
+	int	     do_keepalive;
+	struct linger   linger;
+
+	sock->sk->sk_allocation = GFP_NOFS;
+
+	/* Ensure this socket aborts active sends immediately when we close
+	 * it. */
+
+	linger.l_onoff = 0;
+	linger.l_linger = 0;
+
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER,
+			      (char *)&linger, sizeof (linger));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_LINGER: %d\n", rc);
+		return (rc);
+	}
+
+	option = -1;
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2,
+				    (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_LINGER2: %d\n", rc);
+		return (rc);
+	}
+
+	if (!*ksocknal_tunables.ksnd_nagle) {
+		option = 1;
+
+		set_fs (KERNEL_DS);
+		rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY,
+					    (char *)&option, sizeof (option));
+		set_fs (oldmm);
+		if (rc != 0) {
+			CERROR ("Can't disable nagle: %d\n", rc);
+			return (rc);
+		}
+	}
+
+	rc = libcfs_sock_setbuf(sock,
+				*ksocknal_tunables.ksnd_tx_buffer_size,
+				*ksocknal_tunables.ksnd_rx_buffer_size);
+	if (rc != 0) {
+		CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+			*ksocknal_tunables.ksnd_tx_buffer_size,
+			*ksocknal_tunables.ksnd_rx_buffer_size, rc);
+		return (rc);
+	}
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+	/* snapshot tunables */
+	keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+	keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+	keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+	do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+	option = (do_keepalive ? 1 : 0);
+	set_fs (KERNEL_DS);
+	rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+			      (char *)&option, sizeof (option));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+		return (rc);
+	}
+
+	if (!do_keepalive)
+		return (0);
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+				    (char *)&keep_idle, sizeof (keep_idle));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+				    (char *)&keep_intvl, sizeof (keep_intvl));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+		return (rc);
+	}
+
+	set_fs (KERNEL_DS);
+	rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+				    (char *)&keep_count, sizeof (keep_count));
+	set_fs (oldmm);
+	if (rc != 0) {
+		CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+		return (rc);
+	}
+
+	return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+	struct sock    *sk;
+	struct tcp_sock *tp;
+	int	     nonagle;
+	int	     val = 1;
+	int	     rc;
+	mm_segment_t    oldmm;
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0)			    /* being shut down */
+		return;
+
+	sk = conn->ksnc_sock->sk;
+	tp = tcp_sk(sk);
+
+	lock_sock (sk);
+	nonagle = tp->nonagle;
+	tp->nonagle = 1;
+	release_sock (sk);
+
+	oldmm = get_fs ();
+	set_fs (KERNEL_DS);
+
+	rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+				      (char *)&val, sizeof (val));
+	LASSERT (rc == 0);
+
+	set_fs (oldmm);
+
+	lock_sock (sk);
+	tp->nonagle = nonagle;
+	release_sock (sk);
+
+	ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+/*
+ * socket call back in Linux
+ */
+static void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+	ksock_conn_t  *conn;
+	ENTRY;
+
+	/* interleave correctly with closing sockets... */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	if (conn == NULL) {	     /* raced with ksocknal_terminate_conn */
+		LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
+		sk->sk_data_ready (sk, n);
+	} else
+		ksocknal_read_callback(conn);
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	EXIT;
+}
+
+static void
+ksocknal_write_space (struct sock *sk)
+{
+	ksock_conn_t  *conn;
+	int	    wspace;
+	int	    min_wpace;
+
+	/* interleave correctly with closing sockets... */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	wspace = SOCKNAL_WSPACE(sk);
+	min_wpace = SOCKNAL_MIN_WSPACE(sk);
+
+	CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+	       sk, wspace, min_wpace, conn,
+	       (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+				      " ready" : " blocked"),
+	       (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+				      " scheduled" : " idle"),
+	       (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+				      " empty" : " queued"));
+
+	if (conn == NULL) {	     /* raced with ksocknal_terminate_conn */
+		LASSERT (sk->sk_write_space != &ksocknal_write_space);
+		sk->sk_write_space (sk);
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return;
+	}
+
+	if (wspace >= min_wpace) {	      /* got enough space */
+		ksocknal_write_callback(conn);
+
+		/* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+		 * ENOMEM check in ksocknal_transmit is race-free (think about
+		 * it). */
+
+		clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+	conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
+{
+	sock->sk->sk_user_data = conn;
+	sock->sk->sk_data_ready = ksocknal_data_ready;
+	sock->sk->sk_write_space = ksocknal_write_space;
+	return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	/* Remove conn's network callbacks.
+	 * NB I _have_ to restore the callback, rather than storing a noop,
+	 * since the socket could survive past this module being unloaded!! */
+	sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+	sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+	/* A callback could be in progress already; they hold a read lock
+	 * on ksnd_global_lock (to serialise with me) and NOOP if
+	 * sk_user_data is NULL. */
+	sock->sk->sk_user_data = NULL;
+
+	return ;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+	int	    rc = 0;
+	ksock_sched_t *sched;
+
+	sched = conn->ksnc_scheduler;
+	spin_lock_bh(&sched->kss_lock);
+
+	if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
+	    !conn->ksnc_tx_ready) {
+		/* SOCK_NOSPACE is set when the socket fills
+		 * and cleared in the write_space callback
+		 * (which also sets ksnc_tx_ready).  If
+		 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+		 * zero, I didn't fill the socket and
+		 * write_space won't reschedule me, so I
+		 * return -ENOMEM to get my caller to retry
+		 * after a timeout */
+		rc = -ENOMEM;
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	return rc;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 000000000000..3c135786dc11
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
@@ -0,0 +1,91 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_PORTAL_ALLOC
+
+#ifndef __LINUX_SOCKNAL_LIB_H__
+#define __LINUX_SOCKNAL_LIB_H__
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/if.h>
+
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <asm/uaccess.h>
+#include <asm/div64.h>
+#include <linux/syscalls.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/libcfs/linux/portals_compat25.h>
+
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+	return crc32_le(crc, p, len);
+#else
+	while (len-- > 0)
+		crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+	return crc;
+#endif
+}
+
+#define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+#define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+
+/* assume one thread for each connection type */
+#define SOCKNAL_NSCHEDS		3
+#define SOCKNAL_NSCHEDS_HIGH	(SOCKNAL_NSCHEDS << 1)
+
+#endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644
index 000000000000..8a474f64abbe
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+static int sock_timeout = 50;
+CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
+		"dead socket timeout (seconds)");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+		"# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+		"# concurrent sends to 1 peer");
+
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# per-peer router buffer credits");
+
+static int peer_timeout = 180;
+CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
+		"Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set. */
+static unsigned int nscheds;
+CFS_MODULE_PARM(nscheds, "i", int, 0444,
+		"# scheduler daemons in each pool while starting");
+
+static int nconnds = 4;
+CFS_MODULE_PARM(nconnds, "i", int, 0444,
+		"# connection daemons while starting");
+
+static int nconnds_max = 64;
+CFS_MODULE_PARM(nconnds_max, "i", int, 0444,
+		"max # connection daemons");
+
+static int min_reconnectms = 1000;
+CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
+		"min connection retry interval (mS)");
+
+static int max_reconnectms = 60000;
+CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
+		"max connection retry interval (mS)");
+
+# define DEFAULT_EAGER_ACK 0
+static int eager_ack = DEFAULT_EAGER_ACK;
+CFS_MODULE_PARM(eager_ack, "i", int, 0644,
+		"send tcp ack packets eagerly");
+
+static int typed_conns = 1;
+CFS_MODULE_PARM(typed_conns, "i", int, 0444,
+		"use different sockets for bulk");
+
+static int min_bulk = (1<<10);
+CFS_MODULE_PARM(min_bulk, "i", int, 0644,
+		"smallest 'large' message");
+
+# define DEFAULT_BUFFER_SIZE 0
+static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
+		"socket tx buffer size (0 for system default)");
+
+static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
+CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
+		"socket rx buffer size (0 for system default)");
+
+static int nagle = 0;
+CFS_MODULE_PARM(nagle, "i", int, 0644,
+		"enable NAGLE?");
+
+static int round_robin = 1;
+CFS_MODULE_PARM(round_robin, "i", int, 0644,
+		"Round robin for multiple interfaces");
+
+static int keepalive = 30;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+		"# seconds before send keepalive");
+
+static int keepalive_idle = 30;
+CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
+		"# idle seconds before probe");
+
+#define DEFAULT_KEEPALIVE_COUNT  5
+static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
+CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
+		"# missed probes == dead");
+
+static int keepalive_intvl = 5;
+CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
+		"seconds between probes");
+
+static int enable_csum = 0;
+CFS_MODULE_PARM(enable_csum, "i", int, 0644,
+		"enable check sum");
+
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+		"set non-zero to inject a checksum error");
+
+static int nonblk_zcack = 1;
+CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644,
+		"always send ZC-ACK on non-blocking connection");
+
+static unsigned int zc_min_payload = (16 << 10);
+CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
+		"minimum payload size to zero copy");
+
+static unsigned int zc_recv = 0;
+CFS_MODULE_PARM(zc_recv, "i", int, 0644,
+		"enable ZC recv for Chelsio driver");
+
+static unsigned int zc_recv_min_nfrags = 16;
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
+		"minimum # of fragments to enable ZC recv");
+
+
+#if SOCKNAL_VERSION_DEBUG
+static int protocol = 3;
+CFS_MODULE_PARM(protocol, "i", int, 0644,
+		"protocol version");
+#endif
+
+ksock_tunables_t ksocknal_tunables;
+
+int ksocknal_tunables_init(void)
+{
+
+	/* initialize ksocknal_tunables structure */
+	ksocknal_tunables.ksnd_timeout	    = &sock_timeout;
+	ksocknal_tunables.ksnd_nscheds		  = &nscheds;
+	ksocknal_tunables.ksnd_nconnds	    = &nconnds;
+	ksocknal_tunables.ksnd_nconnds_max	= &nconnds_max;
+	ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+	ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+	ksocknal_tunables.ksnd_eager_ack	  = &eager_ack;
+	ksocknal_tunables.ksnd_typed_conns	= &typed_conns;
+	ksocknal_tunables.ksnd_min_bulk	   = &min_bulk;
+	ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+	ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+	ksocknal_tunables.ksnd_nagle	      = &nagle;
+	ksocknal_tunables.ksnd_round_robin	= &round_robin;
+	ksocknal_tunables.ksnd_keepalive	  = &keepalive;
+	ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+	ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+	ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+	ksocknal_tunables.ksnd_credits	    = &credits;
+	ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+	ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+	ksocknal_tunables.ksnd_peertimeout	= &peer_timeout;
+	ksocknal_tunables.ksnd_enable_csum	= &enable_csum;
+	ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+	ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+	ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+	ksocknal_tunables.ksnd_zc_recv	    = &zc_recv;
+	ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
+
+
+#if SOCKNAL_VERSION_DEBUG
+	ksocknal_tunables.ksnd_protocol	   = &protocol;
+#endif
+
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
+	ksocknal_tunables.ksnd_sysctl	     =  NULL;
+#endif
+
+	if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+		*ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+
+	/* initialize platform-sepcific tunables */
+	return ksocknal_lib_tunables_init();
+};
+
+void ksocknal_tunables_fini(void)
+{
+	ksocknal_lib_tunables_fini();
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
new file mode 100644
index 000000000000..ec57179f8d2b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries :
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack	     : pack message header
+ *   pro_unpack	   : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *			  return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *			  return the ACK that piggybacked by my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	/* V1.x, just enqueue it */
+	list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+	return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+	ksock_tx_t     *tx = conn->ksnc_tx_carrier;
+
+	/* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+	LASSERT (!list_empty(&conn->ksnc_tx_queue));
+	LASSERT (tx != NULL);
+
+	/* Next TX that can carry ZC-ACK or LNet message */
+	if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+		/* no more packets queued */
+		conn->ksnc_tx_carrier = NULL;
+	} else {
+		conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+						       ksock_tx_t, tx_list);
+		LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+	}
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/*
+	 * Enqueue or piggyback tx_ack / cookie
+	 * . no tx can piggyback cookie of tx_ack (or cookie), just
+	 *   enqueue the tx_ack (if tx_ack != NUL) and return NULL.
+	 * . There is tx can piggyback cookie of tx_ack (or cookie),
+	 *   piggyback the cookie and return the tx.
+	 */
+	if (tx == NULL) {
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+		/* tx is noop zc-ack, can't piggyback zc-ack cookie */
+		if (tx_ack != NULL)
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+		return 0;
+	}
+
+	LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+	LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	/* piggyback the zc-ack cookie */
+	tx->tx_msg.ksm_zc_cookies[1] = cookie;
+	/* move on to the next TX which can carry cookie */
+	ksocknal_next_tx_carrier(conn);
+
+	return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	ksock_tx_t  *tx  = conn->ksnc_tx_carrier;
+
+	/*
+	 * Enqueue tx_msg:
+	 * . If there is no NOOP on the connection, just enqueue
+	 *   tx_msg and return NULL
+	 * . If there is NOOP on the connection, piggyback the cookie
+	 *   and replace the NOOP tx, and return the NOOP tx.
+	 */
+	if (tx == NULL) { /* nothing on queue */
+		list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+		conn->ksnc_tx_carrier = tx_msg;
+		return NULL;
+	}
+
+	if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */
+		list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+		return NULL;
+	}
+
+	LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/* There is a noop zc-ack can be piggybacked */
+	tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1];
+	ksocknal_next_tx_carrier(conn);
+
+	/* use new_tx to replace the noop zc-ack packet */
+	list_add(&tx_msg->tx_list, &tx->tx_list);
+	list_del(&tx->tx_list);
+
+	return tx;
+}
+
+static int
+ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx;
+
+	if (conn->ksnc_type != SOCKLND_CONN_ACK)
+		return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie);
+
+	/* non-blocking ZC-ACK (to router) */
+	LASSERT (tx_ack == NULL ||
+		 tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	if ((tx = conn->ksnc_tx_carrier) == NULL) {
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+					  &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	/* conn->ksnc_tx_carrier != NULL */
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */
+		return 1;
+
+	if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) {
+		/* replace the keepalive PING with a real ACK */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0);
+		tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		return 1;
+	}
+
+	if (cookie == tx->tx_msg.ksm_zc_cookies[0] ||
+	    cookie == tx->tx_msg.ksm_zc_cookies[1]) {
+		CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+		      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+		return 1; /* XXX return error in the future */
+	}
+
+	if (tx->tx_msg.ksm_zc_cookies[0] == 0) {
+		/* NOOP tx has only one ZC-ACK cookie, can carry at least one more */
+		if (tx->tx_msg.ksm_zc_cookies[1] > cookie) {
+			tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1];
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+		} else {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+		}
+
+		if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) {
+			/* not likely to carry more ACKs, skip it to simplify logic */
+			ksocknal_next_tx_carrier(conn);
+		}
+
+		return 1;
+	}
+
+	/* takes two or more cookies already */
+
+	if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) {
+		__u64   tmp = 0;
+
+		/* two seperated cookies: (a+2, a) or (a+1, a) */
+		LASSERT (tx->tx_msg.ksm_zc_cookies[0] -
+			 tx->tx_msg.ksm_zc_cookies[1] <= 2);
+
+		if (tx->tx_msg.ksm_zc_cookies[0] -
+		    tx->tx_msg.ksm_zc_cookies[1] == 2) {
+			if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1)
+				tmp = cookie;
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) {
+			tmp = tx->tx_msg.ksm_zc_cookies[1];
+		} else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) {
+			tmp = tx->tx_msg.ksm_zc_cookies[0];
+		}
+
+		if (tmp != 0) {
+			/* range of cookies */
+			tx->tx_msg.ksm_zc_cookies[0] = tmp - 1;
+			tx->tx_msg.ksm_zc_cookies[1] = tmp + 1;
+			return 1;
+		}
+
+	} else {
+		/* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */
+		if (cookie >= tx->tx_msg.ksm_zc_cookies[0] &&
+		    cookie <= tx->tx_msg.ksm_zc_cookies[1]) {
+			CWARN("%s: duplicated ZC cookie: "LPU64"\n",
+			      libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie);
+			return 1; /* XXX: return error in the future */
+		}
+
+		if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) {
+			tx->tx_msg.ksm_zc_cookies[1] = cookie;
+			return 1;
+		}
+
+		if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) {
+			tx->tx_msg.ksm_zc_cookies[0] = cookie;
+			return 1;
+		}
+	}
+
+	/* failed to piggyback ZC-ACK */
+	if (tx_ack != NULL) {
+		list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue);
+		/* the next tx can piggyback at least 1 ACK */
+		ksocknal_next_tx_carrier(conn);
+	}
+
+	return 0;
+}
+
+static int
+ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	int nob;
+
+#if SOCKNAL_VERSION_DEBUG
+	if (!*ksocknal_tunables.ksnd_typed_conns)
+		return SOCKNAL_MATCH_YES;
+#endif
+
+	if (tx == NULL || tx->tx_lnetmsg == NULL) {
+		/* noop packet */
+		nob = offsetof(ksock_msg_t, ksm_u);
+	} else {
+		nob = tx->tx_lnetmsg->msg_len +
+		      ((conn->ksnc_proto == &ksocknal_protocol_v1x) ?
+		       sizeof(lnet_hdr_t) : sizeof(ksock_msg_t));
+	}
+
+	/* default checking for typed connection */
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_BULK_IN:
+		return SOCKNAL_MATCH_MAY;
+
+	case SOCKLND_CONN_BULK_OUT:
+		if (nob < *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		else
+			return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		else
+			return SOCKNAL_MATCH_YES;
+	}
+}
+
+static int
+ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk)
+{
+	int nob;
+
+	if (tx == NULL || tx->tx_lnetmsg == NULL)
+		nob = offsetof(ksock_msg_t, ksm_u);
+	else
+		nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t);
+
+	switch (conn->ksnc_type) {
+	default:
+		CERROR("ksnc_type bad: %u\n", conn->ksnc_type);
+		LBUG();
+	case SOCKLND_CONN_ANY:
+		return SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_ACK:
+		if (nonblk)
+			return SOCKNAL_MATCH_YES;
+		else if (tx == NULL || tx->tx_lnetmsg == NULL)
+			return SOCKNAL_MATCH_MAY;
+		else
+			return SOCKNAL_MATCH_NO;
+
+	case SOCKLND_CONN_BULK_OUT:
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		else if (nob < *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		else
+			return SOCKNAL_MATCH_YES;
+
+	case SOCKLND_CONN_CONTROL:
+		if (nonblk)
+			return SOCKNAL_MATCH_NO;
+		else if (nob >= *ksocknal_tunables.ksnd_min_bulk)
+			return SOCKNAL_MATCH_MAY;
+		else
+			return SOCKNAL_MATCH_YES;
+	}
+}
+
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote)
+{
+	ksock_peer_t   *peer = c->ksnc_peer;
+	ksock_conn_t   *conn;
+	ksock_tx_t     *tx;
+	int	     rc;
+
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, !!remote);
+	if (conn != NULL) {
+		ksock_sched_t *sched = conn->ksnc_scheduler;
+
+		LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		spin_lock_bh(&sched->kss_lock);
+
+		rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie);
+
+		spin_unlock_bh(&sched->kss_lock);
+
+		if (rc) { /* piggybacked */
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			return 0;
+		}
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* ACK connection is not ready, or can't piggyback the ACK */
+	tx = ksocknal_alloc_tx_noop(cookie, !!remote);
+	if (tx == NULL)
+		return -ENOMEM;
+
+	if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0)
+		return 0;
+
+	ksocknal_free_tx(tx);
+	return rc;
+}
+
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2)
+{
+	ksock_peer_t      *peer = conn->ksnc_peer;
+	ksock_tx_t	*tx;
+	ksock_tx_t	*tmp;
+	LIST_HEAD     (zlist);
+	int		count;
+
+	if (cookie1 == 0)
+		cookie1 = cookie2;
+
+	count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1);
+
+	if (cookie2 == SOCKNAL_KEEPALIVE_PING &&
+	    conn->ksnc_proto == &ksocknal_protocol_v3x) {
+		/* keepalive PING for V3.x, just ignore it */
+		return count == 1 ? 0 : -EPROTO;
+	}
+
+	spin_lock(&peer->ksnp_lock);
+
+	list_for_each_entry_safe(tx, tmp,
+				     &peer->ksnp_zc_req_list, tx_zc_list) {
+		__u64 c = tx->tx_msg.ksm_zc_cookies[0];
+
+		if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) {
+			tx->tx_msg.ksm_zc_cookies[0] = 0;
+			list_del(&tx->tx_zc_list);
+			list_add(&tx->tx_zc_list, &zlist);
+
+			if (--count == 0)
+				break;
+		}
+	}
+
+	spin_unlock(&peer->ksnp_lock);
+
+	while (!list_empty(&zlist)) {
+		tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list);
+		list_del(&tx->tx_zc_list);
+		ksocknal_tx_decref(tx);
+	}
+
+	return count == 0 ? 0 : -EPROTO;
+}
+
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	lnet_magicversion_t *hmv;
+	int		  rc;
+	int		  i;
+
+	CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+	/* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+	 * header and send out */
+	hmv->magic	 = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+	hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+	hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check */
+		LNET_LOCK();
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hmv->version_major++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		if ((the_lnet.ln_testprotocompat & 2) != 0) {
+			hmv->magic = LNET_PROTO_MAGIC;
+			the_lnet.ln_testprotocompat &= ~2;
+		}
+		LNET_UNLOCK();
+	}
+
+	hdr->src_nid	= cpu_to_le64 (hello->kshm_src_nid);
+	hdr->src_pid	= cpu_to_le32 (hello->kshm_src_pid);
+	hdr->type	   = cpu_to_le32 (LNET_MSG_HELLO);
+	hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+	hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+	hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
+
+	rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout());
+
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		goto out;
+	}
+
+	if (hello->kshm_nips == 0)
+		goto out;
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
+	}
+
+	rc = libcfs_sock_write(sock, hello->kshm_ips,
+			       hello->kshm_nips * sizeof(__u32),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d)"
+			" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+			HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	}
+out:
+	LIBCFS_FREE(hdr, sizeof(*hdr));
+
+	return rc;
+}
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+	socket_t   *sock = conn->ksnc_sock;
+	int	     rc;
+
+	hello->kshm_magic   = LNET_PROTO_MAGIC;
+	hello->kshm_version = conn->ksnc_proto->pro_version;
+
+	if (the_lnet.ln_testprotocompat != 0) {
+		/* single-shot proto check */
+		LNET_LOCK();
+		if ((the_lnet.ln_testprotocompat & 1) != 0) {
+			hello->kshm_version++;   /* just different! */
+			the_lnet.ln_testprotocompat &= ~1;
+		}
+		LNET_UNLOCK();
+	}
+
+	rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+			       lnet_acceptor_timeout());
+
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+		return rc;
+	}
+
+	if (hello->kshm_nips == 0)
+		return 0;
+
+	rc = libcfs_sock_write(sock, hello->kshm_ips,
+			       hello->kshm_nips * sizeof(__u32),
+			       lnet_acceptor_timeout());
+	if (rc != 0) {
+		CNETERR("Error %d sending HELLO payload (%d)"
+			" to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+			HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+	}
+
+	return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+	socket_t	*sock = conn->ksnc_sock;
+	lnet_hdr_t	  *hdr;
+	int		  rc;
+	int		  i;
+
+	LIBCFS_ALLOC(hdr, sizeof(*hdr));
+	if (hdr == NULL) {
+		CERROR("Can't allocate lnet_hdr_t\n");
+		return -ENOMEM;
+	}
+
+	rc = libcfs_sock_read(sock, &hdr->src_nid,
+			      sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	/* ...and check we got what we expected */
+	if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
+		CERROR ("Expecting a HELLO hdr,"
+			" but got type %d from %u.%u.%u.%u\n",
+			le32_to_cpu (hdr->type),
+			HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	hello->kshm_src_nid	 = le64_to_cpu (hdr->src_nid);
+	hello->kshm_src_pid	 = le32_to_cpu (hdr->src_pid);
+	hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+	hello->kshm_ctype	   = le32_to_cpu (hdr->msg.hello.type);
+	hello->kshm_nips	    = le32_to_cpu (hdr->payload_length) /
+					 sizeof (__u32);
+
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		rc = -EPROTO;
+		goto out;
+	}
+
+	if (hello->kshm_nips == 0)
+		goto out;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		goto out;
+	}
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] == 0) {
+			CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+			       i, HIPQUAD(conn->ksnc_ipaddr));
+			rc = -EPROTO;
+			break;
+		}
+	}
+out:
+	LIBCFS_FREE(hdr, sizeof(*hdr));
+
+	return rc;
+}
+
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout)
+{
+	socket_t      *sock = conn->ksnc_sock;
+	int		rc;
+	int		i;
+
+	if (hello->kshm_magic == LNET_PROTO_MAGIC)
+		conn->ksnc_flip = 0;
+	else
+		conn->ksnc_flip = 1;
+
+	rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+			      offsetof(ksock_hello_msg_t, kshm_ips) -
+				       offsetof(ksock_hello_msg_t, kshm_src_nid),
+			      timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	if (conn->ksnc_flip) {
+		__swab32s(&hello->kshm_src_pid);
+		__swab64s(&hello->kshm_src_nid);
+		__swab32s(&hello->kshm_dst_pid);
+		__swab64s(&hello->kshm_dst_nid);
+		__swab64s(&hello->kshm_src_incarnation);
+		__swab64s(&hello->kshm_dst_incarnation);
+		__swab32s(&hello->kshm_ctype);
+		__swab32s(&hello->kshm_nips);
+	}
+
+	if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+		CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+		       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+		return -EPROTO;
+	}
+
+	if (hello->kshm_nips == 0)
+		return 0;
+
+	rc = libcfs_sock_read(sock, hello->kshm_ips,
+			      hello->kshm_nips * sizeof(__u32), timeout);
+	if (rc != 0) {
+		CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+			rc, HIPQUAD(conn->ksnc_ipaddr));
+		LASSERT (rc < 0 && rc != -EALREADY);
+		return rc;
+	}
+
+	for (i = 0; i < (int) hello->kshm_nips; i++) {
+		if (conn->ksnc_flip)
+			__swab32s(&hello->kshm_ips[i]);
+
+		if (hello->kshm_ips[i] == 0) {
+			CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+			       i, HIPQUAD(conn->ksnc_ipaddr));
+			return -EPROTO;
+		}
+	}
+
+	return 0;
+}
+
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+	/* V1.x has no KSOCK_MSG_NOOP */
+	LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+	LASSERT(tx->tx_lnetmsg != NULL);
+
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+	tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+
+	tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+	tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+	if (tx->tx_lnetmsg != NULL) {
+		LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+		tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+		tx->tx_iov[0].iov_len = sizeof(ksock_msg_t);
+		tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len;
+	} else {
+		LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+		tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+		tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t,  ksm_u.lnetmsg.ksnm_hdr);
+	}
+	/* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+	msg->ksm_csum	   = 0;
+	msg->ksm_type	   = KSOCK_MSG_LNET;
+	msg->ksm_zc_cookies[0]  = msg->ksm_zc_cookies[1]  = 0;
+}
+
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+	return;  /* Do nothing */
+}
+
+ksock_proto_t  ksocknal_protocol_v1x =
+{
+	.pro_version	    = KSOCK_PROTO_V1,
+	.pro_send_hello	 = ksocknal_send_hello_v1,
+	.pro_recv_hello	 = ksocknal_recv_hello_v1,
+	.pro_pack	       = ksocknal_pack_msg_v1,
+	.pro_unpack	     = ksocknal_unpack_msg_v1,
+	.pro_queue_tx_msg       = ksocknal_queue_tx_msg_v1,
+	.pro_handle_zcreq       = NULL,
+	.pro_handle_zcack       = NULL,
+	.pro_queue_tx_zcack     = NULL,
+	.pro_match_tx	   = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v2x =
+{
+	.pro_version	    = KSOCK_PROTO_V2,
+	.pro_send_hello	 = ksocknal_send_hello_v2,
+	.pro_recv_hello	 = ksocknal_recv_hello_v2,
+	.pro_pack	       = ksocknal_pack_msg_v2,
+	.pro_unpack	     = ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v2,
+	.pro_handle_zcreq       = ksocknal_handle_zcreq,
+	.pro_handle_zcack       = ksocknal_handle_zcack,
+	.pro_match_tx	   = ksocknal_match_tx
+};
+
+ksock_proto_t  ksocknal_protocol_v3x =
+{
+	.pro_version	    = KSOCK_PROTO_V3,
+	.pro_send_hello	 = ksocknal_send_hello_v2,
+	.pro_recv_hello	 = ksocknal_recv_hello_v2,
+	.pro_pack	       = ksocknal_pack_msg_v2,
+	.pro_unpack	     = ksocknal_unpack_msg_v2,
+	.pro_queue_tx_msg       = ksocknal_queue_tx_msg_v2,
+	.pro_queue_tx_zcack     = ksocknal_queue_tx_zcack_v3,
+	.pro_handle_zcreq       = ksocknal_handle_zcreq,
+	.pro_handle_zcack       = ksocknal_handle_zcack,
+	.pro_match_tx	   = ksocknal_match_tx_v3
+};
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 000000000000..1bd9ef774208
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o	\
+	  lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o		\
+	  router_proc.o acceptor.o peer.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 000000000000..81ef28bbcba0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/acceptor.c
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+static int   accept_port    = 988;
+static int   accept_backlog = 127;
+static int   accept_timeout = 5;
+
+struct {
+	int			pta_shutdown;
+	socket_t		*pta_sock;
+	struct completion	pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+	return accept_port;
+}
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+	return (magic == constant ||
+		magic == __swab32(constant));
+}
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static char *accept = "secure";
+
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+		"Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+		"Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+		"Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+		"Acceptor's timeout (seconds)");
+
+static char *accept_type = NULL;
+
+int
+lnet_acceptor_get_tunables(void)
+{
+	/* Userland acceptor uses 'accept_type' instead of 'accept', due to
+	 * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+	 * for compatibility. Hence the trick. */
+	accept_type = accept;
+	return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+	return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+			   __u32 peer_ip, int peer_port)
+{
+	switch (rc) {
+	/* "normal" errors */
+	case -ECONNREFUSED:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+			"refused: check that Lustre is running on that node.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EHOSTUNREACH:
+	case -ENETUNREACH:
+		CNETERR("Connection to %s at host %u.%u.%u.%u "
+			"was unreachable: the network or that node may "
+			"be down, or Lustre may be misconfigured.\n",
+			libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+		break;
+	case -ETIMEDOUT:
+		CNETERR("Connection to %s at host %u.%u.%u.%u on "
+			"port %d took too long: that node may be hung "
+			"or experiencing high load.\n",
+			libcfs_nid2str(peer_nid),
+			HIPQUAD(peer_ip), peer_port);
+		break;
+	case -ECONNRESET:
+		LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+				   " on port %d was reset: "
+				   "is it running a compatible version of "
+				   "Lustre and is %s one of its NIDs?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port,
+				   libcfs_nid2str(peer_nid));
+		break;
+	case -EPROTO:
+		LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+				   "host %u.%u.%u.%u on port %d: is it running "
+				   "a compatible version of Lustre?\n",
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	case -EADDRINUSE:
+		LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+				   "connect to %s at host %u.%u.%u.%u on port "
+				   "%d\n", libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	default:
+		LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+				   " at host %u.%u.%u.%u on port %d\n", rc,
+				   libcfs_nid2str(peer_nid),
+				   HIPQUAD(peer_ip), peer_port);
+		break;
+	}
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+	    __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+	lnet_acceptor_connreq_t cr;
+	socket_t	   *sock;
+	int		     rc;
+	int		     port;
+	int		     fatal;
+
+	CLASSERT (sizeof(cr) <= 16);	    /* not too big to be on the stack */
+
+	for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+	     port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+	     --port) {
+		/* Iterate through reserved ports. */
+
+		rc = libcfs_sock_connect(&sock, &fatal,
+					 local_ip, port,
+					 peer_ip, peer_port);
+		if (rc != 0) {
+			if (fatal)
+				goto failed;
+			continue;
+		}
+
+		CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+		cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+		cr.acr_nid     = peer_nid;
+
+		if (the_lnet.ln_testprotocompat != 0) {
+			/* single-shot proto check */
+			lnet_net_lock(LNET_LOCK_EX);
+			if ((the_lnet.ln_testprotocompat & 4) != 0) {
+				cr.acr_version++;
+				the_lnet.ln_testprotocompat &= ~4;
+			}
+			if ((the_lnet.ln_testprotocompat & 8) != 0) {
+				cr.acr_magic = LNET_PROTO_MAGIC;
+				the_lnet.ln_testprotocompat &= ~8;
+			}
+			lnet_net_unlock(LNET_LOCK_EX);
+		}
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+		if (rc != 0)
+			goto failed_sock;
+
+		*sockp = sock;
+		return 0;
+	}
+
+	rc = -EADDRINUSE;
+	goto failed;
+
+ failed_sock:
+	libcfs_sock_release(sock);
+ failed:
+	lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+	return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+	lnet_acceptor_connreq_t cr;
+	__u32		   peer_ip;
+	int		     peer_port;
+	int		     rc;
+	int		     flip;
+	lnet_ni_t	      *ni;
+	char		   *str;
+
+	LASSERT (sizeof(cr) <= 16);	     /* not too big for the stack */
+
+	rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+	LASSERT (rc == 0);		      /* we succeeded before */
+
+	if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+		if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+			/* future version compatibility!
+			 * When LNET unifies protocols over all LNDs, the first
+			 * thing sent will be a version query.  I send back
+			 * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+			memset (&cr, 0, sizeof(cr));
+			cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+			cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+			rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+					       accept_timeout);
+
+			if (rc != 0)
+				CERROR("Error sending magic+version in response"
+				       "to LNET magic from %u.%u.%u.%u: %d\n",
+				       HIPQUAD(peer_ip), rc);
+			return -EPROTO;
+		}
+
+		if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+			str = "'old' socknal/tcpnal";
+		else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+			str = "'old' ranal";
+		else
+			str = "unrecognised";
+
+		LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+				   " magic %08x: %s acceptor protocol\n",
+				   HIPQUAD(peer_ip), magic, str);
+		return -EPROTO;
+	}
+
+	flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+	rc = libcfs_sock_read(sock, &cr.acr_version,
+			      sizeof(cr.acr_version),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request version from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab32s(&cr.acr_version);
+
+	if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+		/* future version compatibility!
+		 * An acceptor-specific protocol rev will first send a version
+		 * query.  I send back my current version to tell her I'm
+		 * "old". */
+		int peer_version = cr.acr_version;
+
+		memset (&cr, 0, sizeof(cr));
+		cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+		cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+		rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+				       accept_timeout);
+
+		if (rc != 0)
+			CERROR("Error sending magic+version in response"
+			       "to version %d from %u.%u.%u.%u: %d\n",
+			       peer_version, HIPQUAD(peer_ip), rc);
+		return -EPROTO;
+	}
+
+	rc = libcfs_sock_read(sock, &cr.acr_nid,
+			      sizeof(cr) -
+			      offsetof(lnet_acceptor_connreq_t, acr_nid),
+			      accept_timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading connection request from "
+		       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+		return -EIO;
+	}
+
+	if (flip)
+		__swab64s(&cr.acr_nid);
+
+	ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+	if (ni == NULL ||	       /* no matching net */
+	    ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+		if (ni != NULL)
+			lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+				   " for %s: No matching NI\n",
+				   HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	if (ni->ni_lnd->lnd_accept == NULL) {
+		/* This catches a request for the loopback LND */
+		lnet_ni_decref(ni);
+		LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+				  " for %s: NI doesn not accept IP connections\n",
+				  HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+		return -EPERM;
+	}
+
+	CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+	       libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+	rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+	lnet_ni_decref(ni);
+	return rc;
+}
+
+int
+lnet_acceptor(void *arg)
+{
+	socket_t  *newsock;
+	int	    rc;
+	__u32	  magic;
+	__u32	  peer_ip;
+	int	    peer_port;
+	int	    secure = (int)((long_ptr_t)arg);
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	cfs_block_allsigs();
+
+	rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+				0, accept_port, accept_backlog);
+	if (rc != 0) {
+		if (rc == -EADDRINUSE)
+			LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+					   " %d: port already in use\n",
+					   accept_port);
+		else
+			LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+					   "%d: unexpected error %d\n",
+					   accept_port, rc);
+
+		lnet_acceptor_state.pta_sock = NULL;
+	} else {
+		LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+	}
+
+	/* set init status and unblock parent */
+	lnet_acceptor_state.pta_shutdown = rc;
+	complete(&lnet_acceptor_state.pta_signal);
+
+	if (rc != 0)
+		return rc;
+
+	while (!lnet_acceptor_state.pta_shutdown) {
+
+		rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+		if (rc != 0) {
+			if (rc != -EAGAIN) {
+				CWARN("Accept error %d: pausing...\n", rc);
+				cfs_pause(cfs_time_seconds(1));
+			}
+			continue;
+		}
+
+		/* maybe we're waken up with libcfs_sock_abort_accept() */
+		if (lnet_acceptor_state.pta_shutdown) {
+			libcfs_sock_release(newsock);
+			break;
+		}
+
+		rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+		if (rc != 0) {
+			CERROR("Can't determine new connection's address\n");
+			goto failed;
+		}
+
+		if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+			CERROR("Refusing connection from %u.%u.%u.%u: "
+			       "insecure port %d\n",
+			       HIPQUAD(peer_ip), peer_port);
+			goto failed;
+		}
+
+		rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+				      accept_timeout);
+		if (rc != 0) {
+			CERROR("Error %d reading connection request from "
+			       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+			goto failed;
+		}
+
+		rc = lnet_accept(newsock, magic);
+		if (rc != 0)
+			goto failed;
+
+		continue;
+
+	failed:
+		libcfs_sock_release(newsock);
+	}
+
+	libcfs_sock_release(lnet_acceptor_state.pta_sock);
+	lnet_acceptor_state.pta_sock = NULL;
+
+	CDEBUG(D_NET, "Acceptor stopping\n");
+
+	/* unblock lnet_acceptor_stop() */
+	complete(&lnet_acceptor_state.pta_signal);
+	return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+	if (!strcmp(acc, "secure")) {
+		*sec = 1;
+		return 1;
+	} else if (!strcmp(acc, "all")) {
+		*sec = 0;
+		return 1;
+	} else if (!strcmp(acc, "none")) {
+		return 0;
+	} else {
+		LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+				   acc);
+		return -EINVAL;
+	}
+}
+
+int
+lnet_acceptor_start(void)
+{
+	int  rc;
+	long rc2;
+	long secure;
+
+	LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+	rc = lnet_acceptor_get_tunables();
+	if (rc != 0)
+		return rc;
+
+
+	init_completion(&lnet_acceptor_state.pta_signal);
+	rc = accept2secure(accept_type, &secure);
+	if (rc <= 0) {
+		fini_completion(&lnet_acceptor_state.pta_signal);
+		return rc;
+	}
+
+	if (lnet_count_acceptor_nis() == 0)  /* not required */
+		return 0;
+
+	rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+				  (void *)(ulong_ptr_t)secure,
+				  "acceptor_%03ld", secure));
+	if (IS_ERR_VALUE(rc2)) {
+		CERROR("Can't start acceptor thread: %ld\n", rc2);
+		fini_completion(&lnet_acceptor_state.pta_signal);
+
+		return -ESRCH;
+	}
+
+	/* wait for acceptor to startup */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+	if (!lnet_acceptor_state.pta_shutdown) {
+		/* started OK */
+		LASSERT(lnet_acceptor_state.pta_sock != NULL);
+		return 0;
+	}
+
+	LASSERT(lnet_acceptor_state.pta_sock == NULL);
+	fini_completion(&lnet_acceptor_state.pta_signal);
+
+	return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+	if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+		return;
+
+	lnet_acceptor_state.pta_shutdown = 1;
+	libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+	/* block until acceptor signals exit */
+	wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+	fini_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644
index 000000000000..695b27265e23
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-errno.c
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 000000000000..e88bee362497
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t      the_lnet;			   /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+		"LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+		"local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+		"routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+		"size of remote network hash table");
+
+char *
+lnet_get_routes(void)
+{
+	return routes;
+}
+
+char *
+lnet_get_networks(void)
+{
+	char   *nets;
+	int     rc;
+
+	if (*networks != 0 && *ip2nets != 0) {
+		LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+				   "'ip2nets' but not both at once\n");
+		return NULL;
+	}
+
+	if (*ip2nets != 0) {
+		rc = lnet_parse_ip2nets(&nets, ip2nets);
+		return (rc == 0) ? nets : NULL;
+	}
+
+	if (*networks != 0)
+		return networks;
+
+	return "tcp";
+}
+
+void
+lnet_init_locks(void)
+{
+	spin_lock_init(&the_lnet.ln_eq_wait_lock);
+	init_waitqueue_head(&the_lnet.ln_eq_waitq);
+	mutex_init(&the_lnet.ln_lnd_mutex);
+	mutex_init(&the_lnet.ln_api_mutex);
+}
+
+void
+lnet_fini_locks(void)
+{
+}
+
+
+static int
+lnet_create_remote_nets_table(void)
+{
+	int		i;
+	struct list_head	*hash;
+
+	LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+	LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+	LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+	if (hash == NULL) {
+		CERROR("Failed to create remote nets hash table\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&hash[i]);
+	the_lnet.ln_remote_nets_hash = hash;
+	return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+	int		i;
+	struct list_head	*hash;
+
+	if (the_lnet.ln_remote_nets_hash == NULL)
+		return;
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+		LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+	LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+		    LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+	the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+	if (the_lnet.ln_res_lock != NULL) {
+		cfs_percpt_lock_free(the_lnet.ln_res_lock);
+		the_lnet.ln_res_lock = NULL;
+	}
+
+	if (the_lnet.ln_net_lock != NULL) {
+		cfs_percpt_lock_free(the_lnet.ln_net_lock);
+		the_lnet.ln_net_lock = NULL;
+	}
+
+	lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+	lnet_init_locks();
+
+	the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+	if (the_lnet.ln_res_lock == NULL)
+		goto failed;
+
+	the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+	if (the_lnet.ln_net_lock == NULL)
+		goto failed;
+
+	return 0;
+
+ failed:
+	lnet_destroy_locks();
+	return -ENOMEM;
+}
+
+void lnet_assert_wire_constants (void)
+{
+	/* Wire protocol assertions generated by 'wirecheck'
+	 * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+	 * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+	 * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+	/* Constants... */
+	CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+	CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+	CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+	CLASSERT (LNET_MSG_ACK == 0);
+	CLASSERT (LNET_MSG_PUT == 1);
+	CLASSERT (LNET_MSG_GET == 2);
+	CLASSERT (LNET_MSG_REPLY == 3);
+	CLASSERT (LNET_MSG_HELLO == 4);
+
+	/* Checks for struct ptl_handle_wire_t */
+	CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+	CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+	CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+	CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+	CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+	/* Checks for struct lnet_magicversion_t */
+	CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+	CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+	CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+	/* Checks for struct lnet_hdr_t */
+	CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+	CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+	/* Ack */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+	/* Put */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+	/* Get */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+	/* Reply */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+	/* Hello */
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+	CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+	CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+	lnd_t	      *lnd;
+	struct list_head	 *tmp;
+
+	/* holding lnd mutex */
+	list_for_each (tmp, &the_lnet.ln_lnds) {
+		lnd = list_entry(tmp, lnd_t, lnd_list);
+
+		if ((int)lnd->lnd_type == type)
+			return lnd;
+	}
+
+	return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+	LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+	list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+	lnd->lnd_refcount = 0;
+
+	CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd (lnd_t *lnd)
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+	LASSERT (lnd->lnd_refcount == 0);
+
+	list_del (&lnd->lnd_list);
+	CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+	lnet_counters_t *ctr;
+	int		i;
+
+	memset(counters, 0, sizeof(*counters));
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+		counters->msgs_max     += ctr->msgs_max;
+		counters->msgs_alloc   += ctr->msgs_alloc;
+		counters->errors       += ctr->errors;
+		counters->send_count   += ctr->send_count;
+		counters->recv_count   += ctr->recv_count;
+		counters->route_count  += ctr->route_count;
+		counters->drop_length  += ctr->drop_length;
+		counters->send_length  += ctr->send_length;
+		counters->recv_length  += ctr->recv_length;
+		counters->route_length += ctr->route_length;
+		counters->drop_length  += ctr->drop_length;
+
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+	lnet_counters_t *counters;
+	int		i;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+		memset(counters, 0, sizeof(lnet_counters_t));
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+	char *space;
+
+	LASSERT (n > 0);
+
+	size += offsetof (lnet_freeobj_t, fo_contents);
+
+	LIBCFS_ALLOC(space, n * size);
+	if (space == NULL)
+		return (-ENOMEM);
+
+	INIT_LIST_HEAD (&fl->fl_list);
+	fl->fl_objs = space;
+	fl->fl_nobjs = n;
+	fl->fl_objsize = size;
+
+	do
+	{
+		memset (space, 0, size);
+		list_add ((struct list_head *)space, &fl->fl_list);
+		space += size;
+	} while (--n != 0);
+
+	return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+	struct list_head       *el;
+	int	       count;
+
+	if (fl->fl_nobjs == 0)
+		return;
+
+	count = 0;
+	for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+		count++;
+
+	LASSERT (count == fl->fl_nobjs);
+
+	LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+	memset (fl, 0, sizeof (*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+__u64
+lnet_create_interface_cookie (void)
+{
+	/* NB the interface cookie in wire handles guards against delayed
+	 * replies and ACKs appearing valid after reboot. Initialisation time,
+	 * even if it's only implemented to millisecond resolution is probably
+	 * easily good enough. */
+	struct timeval tv;
+	__u64	  cookie;
+	do_gettimeofday(&tv);
+	cookie = tv.tv_sec;
+	cookie *= 1000000;
+	cookie += tv.tv_usec;
+	return cookie;
+}
+
+static char *
+lnet_res_type2str(int type)
+{
+	switch (type) {
+	default:
+		LBUG();
+	case LNET_COOKIE_TYPE_MD:
+		return "MD";
+	case LNET_COOKIE_TYPE_ME:
+		return "ME";
+	case LNET_COOKIE_TYPE_EQ:
+		return "EQ";
+	}
+}
+
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+	int	count = 0;
+
+	if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+		return;
+
+	while (!list_empty(&rec->rec_active)) {
+		struct list_head *e = rec->rec_active.next;
+
+		list_del_init(e);
+		if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+			lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+		} else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+			lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+		} else { /* NB: Active MEs should be attached on portals */
+			LBUG();
+		}
+		count++;
+	}
+
+	if (count > 0) {
+		/* Found alive MD/ME/EQ, user really should unlink/free
+		 * all of them before finalize LNet, but if someone didn't,
+		 * we have to recycle garbage for him */
+		CERROR("%d active elements on exit of %s container\n",
+		       count, lnet_res_type2str(rec->rec_type));
+	}
+
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&rec->rec_freelist);
+#endif
+	if (rec->rec_lh_hash != NULL) {
+		LIBCFS_FREE(rec->rec_lh_hash,
+			    LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+		rec->rec_lh_hash = NULL;
+	}
+
+	rec->rec_type = 0; /* mark it as finalized */
+}
+
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+			 int cpt, int type, int objnum, int objsz)
+{
+	int	rc = 0;
+	int	i;
+
+	LASSERT(rec->rec_type == 0);
+
+	rec->rec_type = type;
+	INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+	rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+	if (rc != 0)
+		goto out;
+#endif
+	rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+	/* Arbitrary choice of hash table size */
+	LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+			 LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+	if (rec->rec_lh_hash == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+	return 0;
+
+out:
+	CERROR("Failed to setup %s resource container\n",
+	       lnet_res_type2str(type));
+	lnet_res_container_cleanup(rec);
+	return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+	struct lnet_res_container	*rec;
+	int				i;
+
+	cfs_percpt_for_each(rec, i, recs)
+		lnet_res_container_cleanup(rec);
+
+	cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+	struct lnet_res_container	**recs;
+	struct lnet_res_container	*rec;
+	int				rc;
+	int				i;
+
+	recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+	if (recs == NULL) {
+		CERROR("Failed to allocate %s resource containers\n",
+		       lnet_res_type2str(type));
+		return NULL;
+	}
+
+	cfs_percpt_for_each(rec, i, recs) {
+		rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+		if (rc != 0) {
+			lnet_res_containers_destroy(recs);
+			return NULL;
+		}
+	}
+
+	return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+	/* ALWAYS called with lnet_res_lock held */
+	struct list_head		*head;
+	lnet_libhandle_t	*lh;
+	unsigned int		hash;
+
+	if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+		return NULL;
+
+	hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+	head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+	list_for_each_entry(lh, head, lh_hash_chain) {
+		if (lh->lh_cookie == cookie)
+			return lh;
+	}
+
+	return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+	/* ALWAYS called with lnet_res_lock held */
+	unsigned int	ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+	unsigned int	hash;
+
+	lh->lh_cookie = rec->rec_lh_cookie;
+	rec->rec_lh_cookie += 1 << ibits;
+
+	hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+	list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
+
+
+int lnet_unprepare(void);
+
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+	/* Prepare to bring up the network */
+	struct lnet_res_container **recs;
+	int			  rc = 0;
+
+	LASSERT (the_lnet.ln_refcount == 0);
+
+	the_lnet.ln_routing = 0;
+
+	LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+	the_lnet.ln_pid = requested_pid;
+
+	INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+	INIT_LIST_HEAD(&the_lnet.ln_nis);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+	INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+	rc = lnet_create_remote_nets_table();
+	if (rc != 0)
+		goto failed;
+
+	the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+	the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+						sizeof(lnet_counters_t));
+	if (the_lnet.ln_counters == NULL) {
+		CERROR("Failed to allocate counters for LNet\n");
+		rc = -ENOMEM;
+		goto failed;
+	}
+
+	rc = lnet_peer_tables_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_msg_containers_create();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+				      LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+				      sizeof(lnet_eq_t));
+	if (rc != 0)
+		goto failed;
+
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+					  sizeof(lnet_me_t));
+	if (recs == NULL)
+		goto failed;
+
+	the_lnet.ln_me_containers = recs;
+
+	recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+					  sizeof(lnet_libmd_t));
+	if (recs == NULL)
+		goto failed;
+
+	the_lnet.ln_md_containers = recs;
+
+	rc = lnet_portals_create();
+	if (rc != 0) {
+		CERROR("Failed to create portals for LNet: %d\n", rc);
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	lnet_unprepare();
+	return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+	/* NB no LNET_LOCK since this is the last reference.  All LND instances
+	 * have shut down already, so it is safe to unlink and free all
+	 * descriptors, even those that appear committed to a network op (eg MD
+	 * with non-zero pending count) */
+
+	lnet_fail_nid(LNET_NID_ANY, 0);
+
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_test_peers));
+	LASSERT(list_empty(&the_lnet.ln_nis));
+	LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_portals_destroy();
+
+	if (the_lnet.ln_md_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_md_containers);
+		the_lnet.ln_md_containers = NULL;
+	}
+
+	if (the_lnet.ln_me_containers != NULL) {
+		lnet_res_containers_destroy(the_lnet.ln_me_containers);
+		the_lnet.ln_me_containers = NULL;
+	}
+
+	lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+	lnet_msg_containers_destroy();
+	lnet_peer_tables_destroy();
+	lnet_rtrpools_free();
+
+	if (the_lnet.ln_counters != NULL) {
+		cfs_percpt_free(the_lnet.ln_counters);
+		the_lnet.ln_counters = NULL;
+	}
+	lnet_destroy_remote_nets_table();
+
+	return 0;
+}
+
+lnet_ni_t  *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+	struct list_head	*tmp;
+	lnet_ni_t	*ni;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each(tmp, &the_lnet.ln_nis) {
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+		if (LNET_NIDNET(ni->ni_nid) == net) {
+			lnet_ni_addref_locked(ni, cpt);
+			return ni;
+		}
+	}
+
+	return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+	lnet_ni_t *ni;
+
+	lnet_net_lock(0);
+	ni = lnet_net2ni_locked(net, 0);
+	lnet_net_unlock(0);
+
+	return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+	__u64		key = nid;
+	unsigned int	val;
+
+	LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+	if (number == 1)
+		return 0;
+
+	val = cfs_hash_long(key, LNET_CPT_BITS);
+	/* NB: LNET_CP_NUMBER doesn't have to be PO2 */
+	if (val < number)
+		return val;
+
+	return (unsigned int)(key + val + (val >> 1)) % number;
+}
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+	struct lnet_ni *ni;
+
+	/* must called with hold of lnet_net_lock */
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	/* take lnet_net_lock(any) would be OK */
+	if (!list_empty(&the_lnet.ln_nis_cpt)) {
+		list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+			if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+				continue;
+
+			LASSERT(ni->ni_cpts != NULL);
+			return ni->ni_cpts[lnet_nid_cpt_hash
+					   (nid, ni->ni_ncpts)];
+		}
+	}
+
+	return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+	int	cpt;
+	int	cpt2;
+
+	if (LNET_CPT_NUMBER == 1)
+		return 0; /* the only one */
+
+	if (list_empty(&the_lnet.ln_nis_cpt))
+		return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+	cpt = lnet_net_lock_current();
+	cpt2 = lnet_cpt_of_nid_locked(nid);
+	lnet_net_unlock(cpt);
+
+	return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+
+	cpt = lnet_net_lock_current();
+
+	ni = lnet_net2ni_locked(net, cpt);
+	if (ni != NULL)
+		lnet_ni_decref_locked(ni, cpt);
+
+	lnet_net_unlock(cpt);
+
+	return ni != NULL;
+}
+
+lnet_ni_t  *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+	struct lnet_ni	*ni;
+	struct list_head	*tmp;
+
+	LASSERT(cpt != LNET_LOCK_EX);
+
+	list_for_each(tmp, &the_lnet.ln_nis) {
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+		if (ni->ni_nid == nid) {
+			lnet_ni_addref_locked(ni, cpt);
+			return ni;
+		}
+	}
+
+	return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+	struct lnet_ni	*ni;
+	int		cpt;
+
+	cpt = lnet_net_lock_current();
+	ni = lnet_nid2ni_locked(nid, cpt);
+	if (ni != NULL)
+		lnet_ni_decref_locked(ni, cpt);
+	lnet_net_unlock(cpt);
+
+	return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis (void)
+{
+	/* Return the # of NIs that need the acceptor. */
+	int		count = 0;
+	struct list_head	*tmp;
+	struct lnet_ni	*ni;
+	int		cpt;
+
+	cpt = lnet_net_lock_current();
+	list_for_each(tmp, &the_lnet.ln_nis) {
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+		if (ni->ni_lnd->lnd_accept != NULL)
+			count++;
+	}
+
+	lnet_net_unlock(cpt);
+
+	return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+	int	credits;
+
+	LASSERT(ni->ni_ncpts >= 1);
+
+	if (ni->ni_ncpts == 1)
+		return ni->ni_maxtxcredits;
+
+	credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+	credits = max(credits, 8 * ni->ni_peertxcredits);
+	credits = min(credits, ni->ni_maxtxcredits);
+
+	return credits;
+}
+
+void
+lnet_shutdown_lndnis (void)
+{
+	int		i;
+	int		islo;
+	lnet_ni_t	 *ni;
+
+	/* NB called holding the global mutex */
+
+	/* All quiet on the API front */
+	LASSERT(!the_lnet.ln_shutdown);
+	LASSERT(the_lnet.ln_refcount == 0);
+	LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_shutdown = 1;	/* flag shutdown */
+
+	/* Unlink NIs from the global table */
+	while (!list_empty(&the_lnet.ln_nis)) {
+		ni = list_entry(the_lnet.ln_nis.next,
+				    lnet_ni_t, ni_list);
+		/* move it to zombie list and nobody can find it anymore */
+		list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+		lnet_ni_decref_locked(ni, 0);	/* drop ln_nis' ref */
+
+		if (!list_empty(&ni->ni_cptlist)) {
+			list_del_init(&ni->ni_cptlist);
+			lnet_ni_decref_locked(ni, 0);
+		}
+	}
+
+	/* Drop the cached eqwait NI. */
+	if (the_lnet.ln_eq_waitni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+		the_lnet.ln_eq_waitni = NULL;
+	}
+
+	/* Drop the cached loopback NI. */
+	if (the_lnet.ln_loni != NULL) {
+		lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+		the_lnet.ln_loni = NULL;
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* Clear lazy portals and drop delayed messages which hold refs
+	 * on their lnet_msg_t::msg_rxpeer */
+	for (i = 0; i < the_lnet.ln_nportals; i++)
+		LNetClearLazyPortal(i);
+
+	/* Clear the peer table and wait for all peers to go (they hold refs on
+	 * their NIs) */
+	lnet_peer_tables_cleanup();
+
+	lnet_net_lock(LNET_LOCK_EX);
+	/* Now wait for the NI's I just nuked to show up on ln_zombie_nis
+	 * and shut them down in guaranteed thread context */
+	i = 2;
+	while (!list_empty(&the_lnet.ln_nis_zombie)) {
+		int	*ref;
+		int	j;
+
+		ni = list_entry(the_lnet.ln_nis_zombie.next,
+				    lnet_ni_t, ni_list);
+		list_del_init(&ni->ni_list);
+		cfs_percpt_for_each(ref, j, ni->ni_refs) {
+			if (*ref == 0)
+				continue;
+			/* still busy, add it back to zombie list */
+			list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+			break;
+		}
+
+		while (!list_empty(&ni->ni_list)) {
+			lnet_net_unlock(LNET_LOCK_EX);
+			++i;
+			if ((i & (-i)) == i) {
+				CDEBUG(D_WARNING,
+				       "Waiting for zombie LNI %s\n",
+				       libcfs_nid2str(ni->ni_nid));
+			}
+			cfs_pause(cfs_time_seconds(1));
+			lnet_net_lock(LNET_LOCK_EX);
+			continue;
+		}
+
+		ni->ni_lnd->lnd_refcount--;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		islo = ni->ni_lnd->lnd_type == LOLND;
+
+		LASSERT (!in_interrupt ());
+		(ni->ni_lnd->lnd_shutdown)(ni);
+
+		/* can't deref lnd anymore now; it might have unregistered
+		 * itself...  */
+
+		if (!islo)
+			CDEBUG(D_LNI, "Removed LNI %s\n",
+			       libcfs_nid2str(ni->ni_nid));
+
+		lnet_ni_free(ni);
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	the_lnet.ln_shutdown = 0;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_network_tokens != NULL) {
+		LIBCFS_FREE(the_lnet.ln_network_tokens,
+			    the_lnet.ln_network_tokens_nob);
+		the_lnet.ln_network_tokens = NULL;
+	}
+}
+
+int
+lnet_startup_lndnis (void)
+{
+	lnd_t			*lnd;
+	struct lnet_ni		*ni;
+	struct lnet_tx_queue	*tq;
+	struct list_head		nilist;
+	int			i;
+	int		rc = 0;
+	int		lnd_type;
+	int		nicount = 0;
+	char	      *nets = lnet_get_networks();
+
+	INIT_LIST_HEAD(&nilist);
+
+	if (nets == NULL)
+		goto failed;
+
+	rc = lnet_parse_networks(&nilist, nets);
+	if (rc != 0)
+		goto failed;
+
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+		LASSERT (libcfs_isknown_lnd(lnd_type));
+
+		if (lnd_type == CIBLND    ||
+		    lnd_type == OPENIBLND ||
+		    lnd_type == IIBLND    ||
+		    lnd_type == VIBLND) {
+			CERROR("LND %s obsoleted\n",
+			       libcfs_lnd2str(lnd_type));
+			goto failed;
+		}
+
+		LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+		lnd = lnet_find_lnd_by_type(lnd_type);
+
+		if (lnd == NULL) {
+			LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+			rc = request_module("%s",
+						libcfs_lnd2modname(lnd_type));
+			LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+			lnd = lnet_find_lnd_by_type(lnd_type);
+			if (lnd == NULL) {
+				LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+				CERROR("Can't load LND %s, module %s, rc=%d\n",
+				       libcfs_lnd2str(lnd_type),
+				       libcfs_lnd2modname(lnd_type), rc);
+				goto failed;
+			}
+		}
+
+		lnet_net_lock(LNET_LOCK_EX);
+		lnd->lnd_refcount++;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		ni->ni_lnd = lnd;
+
+		rc = (lnd->lnd_startup)(ni);
+
+		LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+		if (rc != 0) {
+			LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+					   "\n",
+					   rc, libcfs_lnd2str(lnd->lnd_type));
+			lnet_net_lock(LNET_LOCK_EX);
+			lnd->lnd_refcount--;
+			lnet_net_unlock(LNET_LOCK_EX);
+			goto failed;
+		}
+
+		LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+		list_del(&ni->ni_list);
+
+		lnet_net_lock(LNET_LOCK_EX);
+		/* refcount for ln_nis */
+		lnet_ni_addref_locked(ni, 0);
+		list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+		if (ni->ni_cpts != NULL) {
+			list_add_tail(&ni->ni_cptlist,
+					  &the_lnet.ln_nis_cpt);
+			lnet_ni_addref_locked(ni, 0);
+		}
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		if (lnd->lnd_type == LOLND) {
+			lnet_ni_addref(ni);
+			LASSERT (the_lnet.ln_loni == NULL);
+			the_lnet.ln_loni = ni;
+			continue;
+		}
+
+		if (ni->ni_peertxcredits == 0 ||
+		    ni->ni_maxtxcredits == 0) {
+			LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+					   libcfs_lnd2str(lnd->lnd_type),
+					   ni->ni_peertxcredits == 0 ?
+					   "" : "per-peer ");
+			goto failed;
+		}
+
+		cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+			tq->tq_credits_min =
+			tq->tq_credits_max =
+			tq->tq_credits = lnet_ni_tq_credits(ni);
+		}
+
+		CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+		       libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+		       lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+		       ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+		nicount++;
+	}
+
+	if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+		lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+		LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+				   "\n",
+				   libcfs_lnd2str(lnd_type));
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	lnet_shutdown_lndnis();
+
+	while (!list_empty(&nilist)) {
+		ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace program needs to call this function - it's automatically
+ * called in the kernel at module loading time. Caller has to call LNetFini()
+ * after a call to LNetInit(), if and only if the latter returned 0. It must
+ * be called exactly once.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int
+LNetInit(void)
+{
+	int	rc;
+
+	lnet_assert_wire_constants();
+	LASSERT(!the_lnet.ln_init);
+
+	memset(&the_lnet, 0, sizeof(the_lnet));
+
+	/* refer to global cfs_cpt_table for now */
+	the_lnet.ln_cpt_table	= cfs_cpt_table;
+	the_lnet.ln_cpt_number	= cfs_cpt_number(cfs_cpt_table);
+
+	LASSERT(the_lnet.ln_cpt_number > 0);
+	if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+		/* we are under risk of consuming all lh_cookie */
+		CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+		       "please change setting of CPT-table and retry\n",
+		       the_lnet.ln_cpt_number, LNET_CPT_MAX);
+		return -1;
+	}
+
+	while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+		the_lnet.ln_cpt_bits++;
+
+	rc = lnet_create_locks();
+	if (rc != 0) {
+		CERROR("Can't create LNet global locks: %d\n", rc);
+		return -1;
+	}
+
+	the_lnet.ln_refcount = 0;
+	the_lnet.ln_init = 1;
+	LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+	INIT_LIST_HEAD(&the_lnet.ln_lnds);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+	INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+	/* The hash table size is the number of bits it takes to express the set
+	 * ln_num_routes, minus 1 (better to under estimate than over so we
+	 * don't waste memory). */
+	if (rnet_htable_size <= 0)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+	else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+		rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+	the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+					   order_base_2(rnet_htable_size) - 1);
+
+	/* All LNDs apart from the LOLND are in separate modules.  They
+	 * register themselves when their module loads, and unregister
+	 * themselves when their module is unloaded. */
+	lnet_register_lnd(&the_lolnd);
+	return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace program needs to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount == 0);
+
+	while (!list_empty(&the_lnet.ln_lnds))
+		lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+						   lnd_t, lnd_list));
+	lnet_destroy_locks();
+
+	the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+	int	 im_a_router = 0;
+	int	 rc;
+
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+	if (the_lnet.ln_refcount > 0) {
+		rc = the_lnet.ln_refcount++;
+		goto out;
+	}
+
+	lnet_get_tunables();
+
+	if (requested_pid == LNET_PID_ANY) {
+		/* Don't instantiate LNET just for me */
+		rc = -ENETDOWN;
+		goto failed0;
+	}
+
+	rc = lnet_prepare(requested_pid);
+	if (rc != 0)
+		goto failed0;
+
+	rc = lnet_startup_lndnis();
+	if (rc != 0)
+		goto failed1;
+
+	rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_check_routes();
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_rtrpools_alloc(im_a_router);
+	if (rc != 0)
+		goto failed2;
+
+	rc = lnet_acceptor_start();
+	if (rc != 0)
+		goto failed2;
+
+	the_lnet.ln_refcount = 1;
+	/* Now I may use my own API functions... */
+
+	/* NB router checker needs the_lnet.ln_ping_info in
+	 * lnet_router_checker -> lnet_update_ni_status_locked */
+	rc = lnet_ping_target_init();
+	if (rc != 0)
+		goto failed3;
+
+	rc = lnet_router_checker_start();
+	if (rc != 0)
+		goto failed4;
+
+	lnet_proc_init();
+	goto out;
+
+ failed4:
+	lnet_ping_target_fini();
+ failed3:
+	the_lnet.ln_refcount = 0;
+	lnet_acceptor_stop();
+ failed2:
+	lnet_destroy_routes();
+	lnet_shutdown_lndnis();
+ failed1:
+	lnet_unprepare();
+ failed0:
+	LASSERT (rc < 0);
+ out:
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini()
+{
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (the_lnet.ln_refcount != 1) {
+		the_lnet.ln_refcount--;
+	} else {
+		LASSERT (!the_lnet.ln_niinit_self);
+
+		lnet_proc_fini();
+		lnet_router_checker_stop();
+		lnet_ping_target_fini();
+
+		/* Teardown fns that use my own API functions BEFORE here */
+		the_lnet.ln_refcount = 0;
+
+		lnet_acceptor_stop();
+		lnet_destroy_routes();
+		lnet_shutdown_lndnis();
+		lnet_unprepare();
+	}
+
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+	struct libcfs_ioctl_data *data = arg;
+	lnet_process_id_t	 id = {0};
+	lnet_ni_t		*ni;
+	int		       rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	switch (cmd) {
+	case IOC_LIBCFS_GET_NI:
+		rc = LNetGetId(data->ioc_count, &id);
+		data->ioc_nid = id.nid;
+		return rc;
+
+	case IOC_LIBCFS_FAIL_NID:
+		return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+	case IOC_LIBCFS_ADD_ROUTE:
+		rc = lnet_add_route(data->ioc_net, data->ioc_count,
+				    data->ioc_nid);
+		return (rc != 0) ? rc : lnet_check_routes();
+
+	case IOC_LIBCFS_DEL_ROUTE:
+		return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+	case IOC_LIBCFS_GET_ROUTE:
+		return lnet_get_route(data->ioc_count,
+				      &data->ioc_net, &data->ioc_count,
+				      &data->ioc_nid, &data->ioc_flags);
+	case IOC_LIBCFS_NOTIFY_ROUTER:
+		return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+				   cfs_time_current() -
+				   cfs_time_seconds(cfs_time_current_sec() -
+						    (time_t)data->ioc_u64[0]));
+
+	case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+		/* This can be removed once lustre stops calling it */
+		return 0;
+
+	case IOC_LIBCFS_LNET_DIST:
+		rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+		if (rc < 0 && rc != -EHOSTUNREACH)
+			return rc;
+
+		data->ioc_u32[0] = rc;
+		return 0;
+
+	case IOC_LIBCFS_TESTPROTOCOMPAT:
+		lnet_net_lock(LNET_LOCK_EX);
+		the_lnet.ln_testprotocompat = data->ioc_flags;
+		lnet_net_unlock(LNET_LOCK_EX);
+		return 0;
+
+	case IOC_LIBCFS_PING:
+		id.nid = data->ioc_nid;
+		id.pid = data->ioc_u32[0];
+		rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+			       (lnet_process_id_t *)data->ioc_pbuf1,
+			       data->ioc_plen1/sizeof(lnet_process_id_t));
+		if (rc < 0)
+			return rc;
+		data->ioc_count = rc;
+		return 0;
+
+	case IOC_LIBCFS_DEBUG_PEER: {
+		/* CAVEAT EMPTOR: this one designed for calling directly; not
+		 * via an ioctl */
+		id = *((lnet_process_id_t *) arg);
+
+		lnet_debug_peer(id.nid);
+
+		ni = lnet_net2ni(LNET_NIDNET(id.nid));
+		if (ni == NULL) {
+			CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+		} else {
+			if (ni->ni_lnd->lnd_ctl == NULL) {
+				CDEBUG(D_WARNING, "No ctl for %s\n",
+				       libcfs_id2str(id));
+			} else {
+				(void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+			}
+
+			lnet_ni_decref(ni);
+		}
+		return 0;
+	}
+
+	default:
+		ni = lnet_net2ni(data->ioc_net);
+		if (ni == NULL)
+			return -EINVAL;
+
+		if (ni->ni_lnd->lnd_ctl == NULL)
+			rc = -EINVAL;
+		else
+			rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+		lnet_ni_decref(ni);
+		return rc;
+	}
+	/* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that
+ * all interfaces share a same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+	struct lnet_ni	*ni;
+	struct list_head	*tmp;
+	int		cpt;
+	int		rc = -ENOENT;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each(tmp, &the_lnet.ln_nis) {
+		if (index-- != 0)
+			continue;
+
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+		id->nid = ni->ni_nid;
+		id->pid = the_lnet.ln_pid;
+		rc = 0;
+		break;
+	}
+
+	lnet_net_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+	snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+	int	       i;
+	int	       n;
+	int	       rc;
+	unsigned int      infosz;
+	lnet_ni_t	*ni;
+	lnet_process_id_t id;
+	lnet_ping_info_t *pinfo;
+
+	for (n = 0; ; n++) {
+		rc = LNetGetId(n, &id);
+		if (rc == -ENOENT)
+			break;
+
+		LASSERT (rc == 0);
+	}
+
+	infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+	LIBCFS_ALLOC(pinfo, infosz);
+	if (pinfo == NULL) {
+		CERROR("Can't allocate ping info[%d]\n", n);
+		return -ENOMEM;
+	}
+
+	pinfo->pi_nnis    = n;
+	pinfo->pi_pid     = the_lnet.ln_pid;
+	pinfo->pi_magic   = LNET_PROTO_PING_MAGIC;
+	pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+	for (i = 0; i < n; i++) {
+		lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+		rc = LNetGetId(i, &id);
+		LASSERT (rc == 0);
+
+		ns->ns_nid    = id.nid;
+		ns->ns_status = LNET_NI_STATUS_UP;
+
+		lnet_net_lock(0);
+
+		ni = lnet_nid2ni_locked(id.nid, 0);
+		LASSERT(ni != NULL);
+
+		lnet_ni_lock(ni);
+		LASSERT(ni->ni_status == NULL);
+		ni->ni_status = ns;
+		lnet_ni_unlock(ni);
+
+		lnet_ni_decref_locked(ni, 0);
+		lnet_net_unlock(0);
+	}
+
+	the_lnet.ln_ping_info = pinfo;
+	return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+	struct lnet_ni	*ni;
+
+	lnet_net_lock(0);
+
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		lnet_ni_lock(ni);
+		ni->ni_status = NULL;
+		lnet_ni_unlock(ni);
+	}
+
+	lnet_net_unlock(0);
+
+	LIBCFS_FREE(the_lnet.ln_ping_info,
+		    offsetof(lnet_ping_info_t,
+			     pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+	the_lnet.ln_ping_info = NULL;
+	return;
+}
+
+int
+lnet_ping_target_init(void)
+{
+	lnet_md_t	 md = {0};
+	lnet_handle_me_t  meh;
+	lnet_process_id_t id;
+	int	       rc;
+	int	       rc2;
+	int	       infosz;
+
+	rc = lnet_create_ping_info();
+	if (rc != 0)
+		return rc;
+
+	/* We can have a tiny EQ since we only need to see the unlink event on
+	 * teardown, which by definition is the last one! */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+	if (rc != 0) {
+		CERROR("Can't allocate ping EQ: %d\n", rc);
+		goto failed_0;
+	}
+
+	memset(&id, 0, sizeof(lnet_process_id_t));
+	id.nid = LNET_NID_ANY;
+	id.pid = LNET_PID_ANY;
+
+	rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+			  LNET_PROTO_PING_MATCHBITS, 0,
+			  LNET_UNLINK, LNET_INS_AFTER,
+			  &meh);
+	if (rc != 0) {
+		CERROR("Can't create ping ME: %d\n", rc);
+		goto failed_1;
+	}
+
+	/* initialize md content */
+	infosz = offsetof(lnet_ping_info_t,
+			  pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+	md.start     = the_lnet.ln_ping_info;
+	md.length    = infosz;
+	md.threshold = LNET_MD_THRESH_INF;
+	md.max_size  = 0;
+	md.options   = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+		       LNET_MD_MANAGE_REMOTE;
+	md.user_ptr  = NULL;
+	md.eq_handle = the_lnet.ln_ping_target_eq;
+
+	rc = LNetMDAttach(meh, md,
+			  LNET_RETAIN,
+			  &the_lnet.ln_ping_target_md);
+	if (rc != 0) {
+		CERROR("Can't attach ping MD: %d\n", rc);
+		goto failed_2;
+	}
+
+	return 0;
+
+ failed_2:
+	rc2 = LNetMEUnlink(meh);
+	LASSERT (rc2 == 0);
+ failed_1:
+	rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc2 == 0);
+ failed_0:
+	lnet_destroy_ping_info();
+	return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+	lnet_event_t    event;
+	int	     rc;
+	int	     which;
+	int	     timeout_ms = 1000;
+	sigset_t    blocked = cfs_block_allsigs();
+
+	LNetMDUnlink(the_lnet.ln_ping_target_md);
+	/* NB md could be busy; this just starts the unlink */
+
+	for (;;) {
+		rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+				timeout_ms, &event, &which);
+
+		/* I expect overflow... */
+		LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+		if (rc == 0) {
+			/* timed out: provide a diagnostic */
+			CWARN("Still waiting for ping MD to unlink\n");
+			timeout_ms *= 2;
+			continue;
+		}
+
+		/* Got a valid event */
+		if (event.unlinked)
+			break;
+	}
+
+	rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+	LASSERT (rc == 0);
+	lnet_destroy_ping_info();
+	cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+	lnet_handle_eq_t     eqh;
+	lnet_handle_md_t     mdh;
+	lnet_event_t	 event;
+	lnet_md_t	    md = {0};
+	int		  which;
+	int		  unlinked = 0;
+	int		  replied = 0;
+	const int	    a_long_time = 60000; /* mS */
+	int		  infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+	lnet_ping_info_t    *info;
+	lnet_process_id_t    tmpid;
+	int		  i;
+	int		  nob;
+	int		  rc;
+	int		  rc2;
+	sigset_t	 blocked;
+
+	if (n_ids <= 0 ||
+	    id.nid == LNET_NID_ANY ||
+	    timeout_ms > 500000 ||	      /* arbitrary limit! */
+	    n_ids > 20)			 /* arbitrary limit! */
+		return -EINVAL;
+
+	if (id.pid == LNET_PID_ANY)
+		id.pid = LUSTRE_SRV_LNET_PID;
+
+	LIBCFS_ALLOC(info, infosz);
+	if (info == NULL)
+		return -ENOMEM;
+
+	/* NB 2 events max (including any unlink event) */
+	rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ: %d\n", rc);
+		goto out_0;
+	}
+
+	/* initialize md content */
+	md.start     = info;
+	md.length    = infosz;
+	md.threshold = 2; /*GET/REPLY*/
+	md.max_size  = 0;
+	md.options   = LNET_MD_TRUNCATE;
+	md.user_ptr  = NULL;
+	md.eq_handle = eqh;
+
+	rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+	if (rc != 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out_1;
+	}
+
+	rc = LNetGet(LNET_NID_ANY, mdh, id,
+		     LNET_RESERVED_PORTAL,
+		     LNET_PROTO_PING_MATCHBITS, 0);
+
+	if (rc != 0) {
+		/* Don't CERROR; this could be deliberate! */
+
+		rc2 = LNetMDUnlink(mdh);
+		LASSERT (rc2 == 0);
+
+		/* NB must wait for the UNLINK event below... */
+		unlinked = 1;
+		timeout_ms = a_long_time;
+	}
+
+	do {
+		/* MUST block for unlink to complete */
+		if (unlinked)
+			blocked = cfs_block_allsigs();
+
+		rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+		if (unlinked)
+			cfs_restore_sigs(blocked);
+
+		CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+		       (rc2 <= 0) ? -1 : event.type,
+		       (rc2 <= 0) ? -1 : event.status,
+		       (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+		LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+		if (rc2 <= 0 || event.status != 0) {
+			/* timeout or error */
+			if (!replied && rc == 0)
+				rc = (rc2 < 0) ? rc2 :
+				     (rc2 == 0) ? -ETIMEDOUT :
+				     event.status;
+
+			if (!unlinked) {
+				/* Ensure completion in finite time... */
+				LNetMDUnlink(mdh);
+				/* No assertion (racing with network) */
+				unlinked = 1;
+				timeout_ms = a_long_time;
+			} else if (rc2 == 0) {
+				/* timed out waiting for unlink */
+				CWARN("ping %s: late network completion\n",
+				      libcfs_id2str(id));
+			}
+		} else if (event.type == LNET_EVENT_REPLY) {
+			replied = 1;
+			rc = event.mlength;
+		}
+
+	} while (rc2 <= 0 || !event.unlinked);
+
+	if (!replied) {
+		if (rc >= 0)
+			CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+			      libcfs_id2str(id));
+		rc = -EIO;
+		goto out_1;
+	}
+
+	nob = rc;
+	LASSERT (nob >= 0 && nob <= infosz);
+
+	rc = -EPROTO;			   /* if I can't parse... */
+
+	if (nob < 8) {
+		/* can't check magic/version */
+		CERROR("%s: ping info too short %d\n",
+		       libcfs_id2str(id), nob);
+		goto out_1;
+	}
+
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+		lnet_swap_pinginfo(info);
+	} else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CERROR("%s: Unexpected magic %08x\n",
+		       libcfs_id2str(id), info->pi_magic);
+		goto out_1;
+	}
+
+	if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+		CERROR("%s: ping w/o NI status: 0x%x\n",
+		       libcfs_id2str(id), info->pi_features);
+		goto out_1;
+	}
+
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+		CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+		goto out_1;
+	}
+
+	if (info->pi_nnis < n_ids)
+		n_ids = info->pi_nnis;
+
+	if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+		CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+		       nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+		goto out_1;
+	}
+
+	rc = -EFAULT;			   /* If I SEGV... */
+
+	for (i = 0; i < n_ids; i++) {
+		tmpid.pid = info->pi_pid;
+		tmpid.nid = info->pi_ni[i].ns_nid;
+		if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+			goto out_1;
+	}
+	rc = info->pi_nnis;
+
+ out_1:
+	rc2 = LNetEQFree(eqh);
+	if (rc2 != 0)
+		CERROR("rc2 %d\n", rc2);
+	LASSERT (rc2 == 0);
+
+ out_0:
+	LIBCFS_FREE(info, infosz);
+	return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 000000000000..28711e6e8b03
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/config.c
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct {			    /* tmp struct for parsing routes */
+	struct list_head	 ltb_list;	/* stash on lists */
+	int		ltb_size;	/* allocated size */
+	char	       ltb_text[0];     /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0;			/* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)	/* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+	static char dots[LNET_SINGLE_TEXTBUF_NOB];
+	static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+	memset(dots, '.', sizeof(dots));
+	dots[sizeof(dots)-1] = 0;
+	memset(dashes, '-', sizeof(dashes));
+	dashes[sizeof(dashes)-1] = 0;
+
+	LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+	LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+			   (int)strlen(name), dots, offset, dots,
+			    (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+	switch (c) {
+	case '\n':
+	case '\r':
+	case ';':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+	struct list_head       *tmp;
+	lnet_ni_t	*ni;
+
+	list_for_each (tmp, nilist) {
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+		if (LNET_NIDNET(ni->ni_nid) == net)
+			return 0;
+	}
+
+	return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+	if (ni->ni_refs != NULL)
+		cfs_percpt_free(ni->ni_refs);
+
+	if (ni->ni_tx_queues != NULL)
+		cfs_percpt_free(ni->ni_tx_queues);
+
+	if (ni->ni_cpts != NULL)
+		cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+	LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+	struct lnet_tx_queue	*tq;
+	struct lnet_ni		*ni;
+	int			rc;
+	int			i;
+
+	if (!lnet_net_unique(net, nilist)) {
+		LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+				   libcfs_net2str(net));
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ni, sizeof(*ni));
+	if (ni == NULL) {
+		CERROR("Out of memory creating network %s\n",
+		       libcfs_net2str(net));
+		return NULL;
+	}
+
+	spin_lock_init(&ni->ni_lock);
+	INIT_LIST_HEAD(&ni->ni_cptlist);
+	ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*ni->ni_refs[0]));
+	if (ni->ni_refs == NULL)
+		goto failed;
+
+	ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(*ni->ni_tx_queues[0]));
+	if (ni->ni_tx_queues == NULL)
+		goto failed;
+
+	cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+		INIT_LIST_HEAD(&tq->tq_delayed);
+
+	if (el == NULL) {
+		ni->ni_cpts  = NULL;
+		ni->ni_ncpts = LNET_CPT_NUMBER;
+	} else {
+		rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+		if (rc <= 0) {
+			CERROR("Failed to set CPTs for NI %s: %d\n",
+			       libcfs_net2str(net), rc);
+			goto failed;
+		}
+
+		LASSERT(rc <= LNET_CPT_NUMBER);
+		if (rc == LNET_CPT_NUMBER) {
+			LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+			ni->ni_cpts = NULL;
+		}
+
+		ni->ni_ncpts = rc;
+	}
+
+	/* LND will fill in the address part of the NID */
+	ni->ni_nid = LNET_MKNID(net, 0);
+	ni->ni_last_alive = cfs_time_current_sec();
+	list_add_tail(&ni->ni_list, nilist);
+	return ni;
+ failed:
+	lnet_ni_free(ni);
+	return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+	struct cfs_expr_list *el = NULL;
+	int		tokensize = strlen(networks) + 1;
+	char		*tokens;
+	char		*str;
+	char		*tmp;
+	struct lnet_ni	*ni;
+	__u32		net;
+	int		nnets = 0;
+
+	if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _WAY_ conservative */
+		LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+				   "long\n");
+		return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(tokens, tokensize);
+	if (tokens == NULL) {
+		CERROR("Can't allocate net tokens\n");
+		return -ENOMEM;
+	}
+
+	the_lnet.ln_network_tokens = tokens;
+	the_lnet.ln_network_tokens_nob = tokensize;
+	memcpy (tokens, networks, tokensize);
+	str = tmp = tokens;
+
+	/* Add in the loopback network */
+	ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+	if (ni == NULL)
+		goto failed;
+
+	while (str != NULL && *str != 0) {
+		char	*comma = strchr(str, ',');
+		char	*bracket = strchr(str, '(');
+		char	*square = strchr(str, '[');
+		char	*iface;
+		int	niface;
+		int	rc;
+
+		/* NB we don't check interface conflicts here; it's the LNDs
+		 * responsibility (if it cares at all) */
+
+		if (square != NULL && (comma == NULL || square < comma)) {
+			/* i.e: o2ib0(ib0)[1,2], number between square
+			 * brackets are CPTs this NI needs to be bond */
+			if (bracket != NULL && bracket > square) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			tmp = strchr(square, ']');
+			if (tmp == NULL) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			rc = cfs_expr_list_parse(square, tmp - square + 1,
+						 0, LNET_CPT_NUMBER - 1, &el);
+			if (rc != 0) {
+				tmp = square;
+				goto failed_syntax;
+			}
+
+			while (square <= tmp)
+				*square++ = ' ';
+		}
+
+		if (bracket == NULL ||
+		    (comma != NULL && comma < bracket)) {
+
+			/* no interface list specified */
+
+			if (comma != NULL)
+				*comma++ = 0;
+			net = libcfs_str2net(cfs_trimwhite(str));
+
+			if (net == LNET_NIDNET(LNET_NID_ANY)) {
+				LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+						   " type\n");
+				tmp = str;
+				goto failed_syntax;
+			}
+
+			if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+			    lnet_ni_alloc(net, el, nilist) == NULL)
+				goto failed;
+
+			if (el != NULL) {
+				cfs_expr_list_free(el);
+				el = NULL;
+			}
+
+			str = comma;
+			continue;
+		}
+
+		*bracket = 0;
+		net = libcfs_str2net(cfs_trimwhite(str));
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			tmp = str;
+			goto failed_syntax;
+		}
+
+		nnets++;
+		ni = lnet_ni_alloc(net, el, nilist);
+		if (ni == NULL)
+			goto failed;
+
+		if (el != NULL) {
+			cfs_expr_list_free(el);
+			el = NULL;
+		}
+
+		niface = 0;
+		iface = bracket + 1;
+
+		bracket = strchr(iface, ')');
+		if (bracket == NULL) {
+			tmp = iface;
+			goto failed_syntax;
+		}
+
+		*bracket = 0;
+		do {
+			comma = strchr(iface, ',');
+			if (comma != NULL)
+				*comma++ = 0;
+
+			iface = cfs_trimwhite(iface);
+			if (*iface == 0) {
+				tmp = iface;
+				goto failed_syntax;
+			}
+
+			if (niface == LNET_MAX_INTERFACES) {
+				LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+						   "for net %s\n",
+						   libcfs_net2str(net));
+				goto failed;
+			}
+
+			ni->ni_interfaces[niface++] = iface;
+			iface = comma;
+		} while (iface != NULL);
+
+		str = bracket + 1;
+		comma = strchr(bracket + 1, ',');
+		if (comma != NULL) {
+			*comma = 0;
+			str = cfs_trimwhite(str);
+			if (*str != 0) {
+				tmp = str;
+				goto failed_syntax;
+			}
+			str = comma + 1;
+			continue;
+		}
+
+		str = cfs_trimwhite(str);
+		if (*str != 0) {
+			tmp = str;
+			goto failed_syntax;
+		}
+	}
+
+	LASSERT(!list_empty(nilist));
+	return 0;
+
+ failed_syntax:
+	lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+	while (!list_empty(nilist)) {
+		ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+		list_del(&ni->ni_list);
+		lnet_ni_free(ni);
+	}
+
+	if (el != NULL)
+		cfs_expr_list_free(el);
+
+	LIBCFS_FREE(tokens, tokensize);
+	the_lnet.ln_network_tokens = NULL;
+
+	return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+	lnet_text_buf_t *ltb;
+	int	      nob;
+
+	/* NB allocate space for the terminating 0 */
+	nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+	if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+		/* _way_ conservative for "route net gateway..." */
+		CERROR("text buffer too big\n");
+		return NULL;
+	}
+
+	if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+		CERROR("Too many text buffers\n");
+		return NULL;
+	}
+
+	LIBCFS_ALLOC(ltb, nob);
+	if (ltb == NULL)
+		return NULL;
+
+	ltb->ltb_size = nob;
+	ltb->ltb_text[0] = 0;
+	lnet_tbnob += nob;
+	return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+	lnet_tbnob -= ltb->ltb_size;
+	LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+	lnet_text_buf_t  *ltb;
+
+	while (!list_empty(tbs)) {
+		ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+	struct list_head	*tmp;
+	lnet_text_buf_t   *ltb;
+
+	list_for_each (tmp, tbs) {
+		ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+		CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+	}
+
+	CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+	struct list_head	pending;
+	char	     *sep;
+	int	       nob;
+	int	       i;
+	lnet_text_buf_t  *ltb;
+
+	INIT_LIST_HEAD(&pending);
+
+	/* Split 'str' into separate commands */
+	for (;;) {
+		/* skip leading whitespace */
+		while (cfs_iswhite(*str))
+			str++;
+
+		/* scan for separator or comment */
+		for (sep = str; *sep != 0; sep++)
+			if (lnet_issep(*sep) || *sep == '#')
+				break;
+
+		nob = (int)(sep - str);
+		if (nob > 0) {
+			ltb = lnet_new_text_buf(nob);
+			if (ltb == NULL) {
+				lnet_free_text_bufs(&pending);
+				return -1;
+			}
+
+			for (i = 0; i < nob; i++)
+				if (cfs_iswhite(str[i]))
+					ltb->ltb_text[i] = ' ';
+				else
+					ltb->ltb_text[i] = str[i];
+
+			ltb->ltb_text[nob] = 0;
+
+			list_add_tail(&ltb->ltb_list, &pending);
+		}
+
+		if (*sep == '#') {
+			/* scan for separator */
+			do {
+				sep++;
+			} while (*sep != 0 && !lnet_issep(*sep));
+		}
+
+		if (*sep == 0)
+			break;
+
+		str = sep + 1;
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list,
+	       char *str, char *sep1, char *sep2,
+	       char *item, int itemlen)
+{
+	int	      len1 = (int)(sep1 - str);
+	int	      len2 = strlen(sep2 + 1);
+	lnet_text_buf_t *ltb;
+
+	LASSERT (*sep1 == '[');
+	LASSERT (*sep2 == ']');
+
+	ltb = lnet_new_text_buf(len1 + itemlen + len2);
+	if (ltb == NULL)
+		return -ENOMEM;
+
+	memcpy(ltb->ltb_text, str, len1);
+	memcpy(&ltb->ltb_text[len1], item, itemlen);
+	memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+	ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+	list_add_tail(&ltb->ltb_list, list);
+	return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+	char	      num[16];
+	struct list_head	pending;
+	char	     *sep;
+	char	     *sep2;
+	char	     *parsed;
+	char	     *enditem;
+	int	       lo;
+	int	       hi;
+	int	       stride;
+	int	       i;
+	int	       nob;
+	int	       scanned;
+
+	INIT_LIST_HEAD(&pending);
+
+	sep = strchr(str, '[');
+	if (sep == NULL)			/* nothing to expand */
+		return 0;
+
+	sep2 = strchr(sep, ']');
+	if (sep2 == NULL)
+		goto failed;
+
+	for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+		enditem = ++parsed;
+		while (enditem < sep2 && *enditem != ',')
+			enditem++;
+
+		if (enditem == parsed)		/* no empty items */
+			goto failed;
+
+		if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+			if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+				/* simple string enumeration */
+				if (lnet_expand1tb(&pending, str, sep, sep2,
+						   parsed, (int)(enditem - parsed)) != 0)
+					goto failed;
+
+				continue;
+			}
+
+			stride = 1;
+		}
+
+		/* range expansion */
+
+		if (enditem != parsed + scanned) /* no trailing junk */
+			goto failed;
+
+		if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+		    (hi - lo) % stride != 0)
+			goto failed;
+
+		for (i = lo; i <= hi; i += stride) {
+
+			snprintf(num, sizeof(num), "%d", i);
+			nob = strlen(num);
+			if (nob + 1 == sizeof(num))
+				goto failed;
+
+			if (lnet_expand1tb(&pending, str, sep, sep2,
+					   num, nob) != 0)
+				goto failed;
+		}
+	}
+
+	list_splice(&pending, tbs->prev);
+	return 1;
+
+ failed:
+	lnet_free_text_bufs(&pending);
+	return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+	int     len = strlen(str);
+	int     nob = len;
+
+	return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+		nob == len &&
+		*hops > 0 && *hops < 256);
+}
+
+
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+	/* static scratch buffer OK (single threaded) */
+	static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	nets;
+	struct list_head	gateways;
+	struct list_head       *tmp1;
+	struct list_head       *tmp2;
+	__u32	     net;
+	lnet_nid_t	nid;
+	lnet_text_buf_t  *ltb;
+	int	       rc;
+	char	     *sep;
+	char	     *token = str;
+	int	       ntokens = 0;
+	int	       myrc = -1;
+	unsigned int      hops;
+	int	       got_hops = 0;
+
+	INIT_LIST_HEAD(&gateways);
+	INIT_LIST_HEAD(&nets);
+
+	/* save a copy of the string for error messages */
+	strncpy(cmd, str, sizeof(cmd) - 1);
+	cmd[sizeof(cmd) - 1] = 0;
+
+	sep = str;
+	for (;;) {
+		/* scan for token start */
+		while (cfs_iswhite(*sep))
+			sep++;
+		if (*sep == 0) {
+			if (ntokens < (got_hops ? 3 : 2))
+				goto token_error;
+			break;
+		}
+
+		ntokens++;
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !cfs_iswhite(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		if (ntokens == 1) {
+			tmp2 = &nets;		/* expanding nets */
+		} else if (ntokens == 2 &&
+			   lnet_parse_hops(token, &hops)) {
+			got_hops = 1;	   /* got a hop count */
+			continue;
+		} else {
+			tmp2 = &gateways;	/* expanding gateways */
+		}
+
+		ltb = lnet_new_text_buf(strlen(token));
+		if (ltb == NULL)
+			goto out;
+
+		strcpy(ltb->ltb_text, token);
+		tmp1 = &ltb->ltb_list;
+		list_add_tail(tmp1, tmp2);
+
+		while (tmp1 != tmp2) {
+			ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+			rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+			if (rc < 0)
+				goto token_error;
+
+			tmp1 = tmp1->next;
+
+			if (rc > 0) {		/* expanded! */
+				list_del(&ltb->ltb_list);
+				lnet_free_text_buf(ltb);
+				continue;
+			}
+
+			if (ntokens == 1) {
+				net = libcfs_str2net(ltb->ltb_text);
+				if (net == LNET_NIDNET(LNET_NID_ANY) ||
+				    LNET_NETTYP(net) == LOLND)
+					goto token_error;
+			} else {
+				nid = libcfs_str2nid(ltb->ltb_text);
+				if (nid == LNET_NID_ANY ||
+				    LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+					goto token_error;
+			}
+		}
+	}
+
+	if (!got_hops)
+		hops = 1;
+
+	LASSERT (!list_empty(&nets));
+	LASSERT (!list_empty(&gateways));
+
+	list_for_each (tmp1, &nets) {
+		ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+		net = libcfs_str2net(ltb->ltb_text);
+		LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+		list_for_each (tmp2, &gateways) {
+			ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+			nid = libcfs_str2nid(ltb->ltb_text);
+			LASSERT (nid != LNET_NID_ANY);
+
+			if (lnet_islocalnid(nid)) {
+				*im_a_router = 1;
+				continue;
+			}
+
+			rc = lnet_add_route (net, hops, nid);
+			if (rc != 0) {
+				CERROR("Can't create route "
+				       "to %s via %s\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid));
+				goto out;
+			}
+		}
+	}
+
+	myrc = 0;
+	goto out;
+
+ token_error:
+	lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+	lnet_free_text_bufs(&nets);
+	lnet_free_text_bufs(&gateways);
+	return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+	lnet_text_buf_t   *ltb;
+
+	while (!list_empty(tbs)) {
+		ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+		if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+			lnet_free_text_bufs(tbs);
+			return -EINVAL;
+		}
+
+		list_del(&ltb->ltb_list);
+		lnet_free_text_buf(ltb);
+	}
+
+	return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+	struct list_head	tbs;
+	int	       rc = 0;
+
+	*im_a_router = 0;
+
+	INIT_LIST_HEAD(&tbs);
+
+	if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+		CERROR("Error parsing routes\n");
+		rc = -EINVAL;
+	} else {
+		rc = lnet_parse_route_tbs(&tbs, im_a_router);
+	}
+
+	LASSERT (lnet_tbnob == 0);
+	return rc;
+}
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+	LIST_HEAD	(list);
+	int		rc;
+	int		i;
+
+	rc = cfs_ip_addr_parse(token, len, &list);
+	if (rc != 0)
+		return rc;
+
+	for (rc = i = 0; !rc && i < nip; i++)
+		rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+	cfs_ip_addr_free(&list);
+
+	return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+	static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+	int   matched = 0;
+	int   ntokens = 0;
+	int   len;
+	char *net = NULL;
+	char *sep;
+	char *token;
+	int   rc;
+
+	LASSERT (strlen(net_entry) < sizeof(tokens));
+
+	/* work on a copy of the string */
+	strcpy(tokens, net_entry);
+	sep = tokens;
+	for (;;) {
+		/* scan for token start */
+		while (cfs_iswhite(*sep))
+			sep++;
+		if (*sep == 0)
+			break;
+
+		token = sep++;
+
+		/* scan for token end */
+		while (*sep != 0 && !cfs_iswhite(*sep))
+			sep++;
+		if (*sep != 0)
+			*sep++ = 0;
+
+		if (ntokens++ == 0) {
+			net = token;
+			continue;
+		}
+
+		len = strlen(token);
+
+		rc = lnet_match_network_token(token, len, ipaddrs, nip);
+		if (rc < 0) {
+			lnet_syntax("ip2nets", net_entry,
+				    (int)(token - tokens), len);
+			return rc;
+		}
+
+		matched |= (rc != 0);
+	}
+
+	if (!matched)
+		return 0;
+
+	strcpy(net_entry, net);		 /* replace with matched net */
+	return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+	char   *bracket = strchr(netspec, '(');
+	__u32   net;
+
+	if (bracket != NULL)
+		*bracket = 0;
+
+	net = libcfs_str2net(netspec);
+
+	if (bracket != NULL)
+		*bracket = '(';
+
+	return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+	int	       offset = 0;
+	int	       offset2;
+	int	       len;
+	lnet_text_buf_t  *tb;
+	lnet_text_buf_t  *tb2;
+	struct list_head       *t;
+	char	     *sep;
+	char	     *bracket;
+	__u32	     net;
+
+	LASSERT (!list_empty(nets));
+	LASSERT (nets->next == nets->prev);     /* single entry */
+
+	tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+	for (;;) {
+		sep = strchr(tb->ltb_text, ',');
+		bracket = strchr(tb->ltb_text, '(');
+
+		if (sep != NULL &&
+		    bracket != NULL &&
+		    bracket < sep) {
+			/* netspec lists interfaces... */
+
+			offset2 = offset + (int)(bracket - tb->ltb_text);
+			len = strlen(bracket);
+
+			bracket = strchr(bracket + 1, ')');
+
+			if (bracket == NULL ||
+			    !(bracket[1] == ',' || bracket[1] == 0)) {
+				lnet_syntax("ip2nets", source, offset2, len);
+				return -EINVAL;
+			}
+
+			sep = (bracket[1] == 0) ? NULL : bracket + 1;
+		}
+
+		if (sep != NULL)
+			*sep++ = 0;
+
+		net = lnet_netspec2net(tb->ltb_text);
+		if (net == LNET_NIDNET(LNET_NID_ANY)) {
+			lnet_syntax("ip2nets", source, offset,
+				    strlen(tb->ltb_text));
+			return -EINVAL;
+		}
+
+		list_for_each(t, nets) {
+			tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			if (tb2 == tb)
+				continue;
+
+			if (net == lnet_netspec2net(tb2->ltb_text)) {
+				/* duplicate network */
+				lnet_syntax("ip2nets", source, offset,
+					    strlen(tb->ltb_text));
+				return -EINVAL;
+			}
+		}
+
+		if (sep == NULL)
+			return 0;
+
+		offset += (int)(sep - tb->ltb_text);
+		tb2 = lnet_new_text_buf(strlen(sep));
+		if (tb2 == NULL)
+			return -ENOMEM;
+
+		strcpy(tb2->ltb_text, sep);
+		list_add_tail(&tb2->ltb_list, nets);
+
+		tb = tb2;
+	}
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+	static char	networks[LNET_SINGLE_TEXTBUF_NOB];
+	static char	source[LNET_SINGLE_TEXTBUF_NOB];
+
+	struct list_head	  raw_entries;
+	struct list_head	  matched_nets;
+	struct list_head	  current_nets;
+	struct list_head	 *t;
+	struct list_head	 *t2;
+	lnet_text_buf_t    *tb;
+	lnet_text_buf_t    *tb2;
+	__u32	       net1;
+	__u32	       net2;
+	int		 len;
+	int		 count;
+	int		 dup;
+	int		 rc;
+
+	INIT_LIST_HEAD(&raw_entries);
+	if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+		CERROR("Error parsing ip2nets\n");
+		LASSERT (lnet_tbnob == 0);
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&matched_nets);
+	INIT_LIST_HEAD(&current_nets);
+	networks[0] = 0;
+	count = 0;
+	len = 0;
+	rc = 0;
+
+	while (!list_empty(&raw_entries)) {
+		tb = list_entry(raw_entries.next, lnet_text_buf_t,
+				    ltb_list);
+
+		strncpy(source, tb->ltb_text, sizeof(source)-1);
+		source[sizeof(source)-1] = 0;
+
+		/* replace ltb_text with the network(s) add on match */
+		rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+		if (rc < 0)
+			break;
+
+		list_del(&tb->ltb_list);
+
+		if (rc == 0) {		  /* no match */
+			lnet_free_text_buf(tb);
+			continue;
+		}
+
+		/* split into separate networks */
+		INIT_LIST_HEAD(&current_nets);
+		list_add(&tb->ltb_list, &current_nets);
+		rc = lnet_splitnets(source, &current_nets);
+		if (rc < 0)
+			break;
+
+		dup = 0;
+		list_for_each (t, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+			net1 = lnet_netspec2net(tb->ltb_text);
+			LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+			list_for_each(t2, &matched_nets) {
+				tb2 = list_entry(t2, lnet_text_buf_t,
+						     ltb_list);
+				net2 = lnet_netspec2net(tb2->ltb_text);
+				LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+				if (net1 == net2) {
+					dup = 1;
+					break;
+				}
+			}
+
+			if (dup)
+				break;
+		}
+
+		if (dup) {
+			lnet_free_text_bufs(&current_nets);
+			continue;
+		}
+
+		list_for_each_safe(t, t2, &current_nets) {
+			tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+			list_del(&tb->ltb_list);
+			list_add_tail(&tb->ltb_list, &matched_nets);
+
+			len += snprintf(networks + len, sizeof(networks) - len,
+					"%s%s", (len == 0) ? "" : ",",
+					tb->ltb_text);
+
+			if (len >= sizeof(networks)) {
+				CERROR("Too many matched networks\n");
+				rc = -E2BIG;
+				goto out;
+			}
+		}
+
+		count++;
+	}
+
+ out:
+	lnet_free_text_bufs(&raw_entries);
+	lnet_free_text_bufs(&matched_nets);
+	lnet_free_text_bufs(&current_nets);
+	LASSERT (lnet_tbnob == 0);
+
+	if (rc < 0)
+		return rc;
+
+	*networksp = networks;
+	return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+	LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+	int	up;
+	__u32      netmask;
+	__u32     *ipaddrs;
+	__u32     *ipaddrs2;
+	int	nip;
+	char     **ifnames;
+	int	nif = libcfs_ipif_enumerate(&ifnames);
+	int	i;
+	int	rc;
+
+	if (nif <= 0)
+		return nif;
+
+	LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+	if (ipaddrs == NULL) {
+		CERROR("Can't allocate ipaddrs[%d]\n", nif);
+		libcfs_ipif_free_enumeration(ifnames, nif);
+		return -ENOMEM;
+	}
+
+	for (i = nip = 0; i < nif; i++) {
+		if (!strcmp(ifnames[i], "lo"))
+			continue;
+
+		rc = libcfs_ipif_query(ifnames[i], &up,
+				       &ipaddrs[nip], &netmask);
+		if (rc != 0) {
+			CWARN("Can't query interface %s: %d\n",
+			      ifnames[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Ignoring interface %s: it's down\n",
+			      ifnames[i]);
+			continue;
+		}
+
+		nip++;
+	}
+
+	libcfs_ipif_free_enumeration(ifnames, nif);
+
+	if (nip == nif) {
+		*ipaddrsp = ipaddrs;
+	} else {
+		if (nip > 0) {
+			LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+			if (ipaddrs2 == NULL) {
+				CERROR("Can't allocate ipaddrs[%d]\n", nip);
+				nip = -ENOMEM;
+			} else {
+				memcpy(ipaddrs2, ipaddrs,
+				       nip * sizeof(*ipaddrs));
+				*ipaddrsp = ipaddrs2;
+				rc = nip;
+			}
+		}
+		lnet_ipaddr_free_enumeration(ipaddrs, nif);
+	}
+	return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+	__u32     *ipaddrs;
+	int	nip = lnet_ipaddr_enumerate(&ipaddrs);
+	int	rc;
+
+	if (nip < 0) {
+		LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+				   "interfaces for ip2nets to match\n", nip);
+		return nip;
+	}
+
+	if (nip == 0) {
+		LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+				   "for ip2nets to match\n");
+		return -ENOENT;
+	}
+
+	rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+	lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+	if (rc < 0) {
+		LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+		return rc;
+	}
+
+	if (rc == 0) {
+		LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+				   "any local IP interfaces\n");
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+	__u32  net = LNET_NIDNET(ni->ni_nid);
+	char **names;
+	int    n;
+	__u32  ip;
+	__u32  netmask;
+	int    up;
+	int    i;
+	int    rc;
+
+	/* Convenience for LNDs that use the IP address of a local interface as
+	 * the local address part of their NID */
+
+	if (ni->ni_interfaces[0] != NULL) {
+
+		CLASSERT (LNET_MAX_INTERFACES > 1);
+
+		if (ni->ni_interfaces[1] != NULL) {
+			CERROR("Net %s doesn't support multiple interfaces\n",
+			       libcfs_net2str(net));
+			return -EPERM;
+		}
+
+		rc = libcfs_ipif_query(ni->ni_interfaces[0],
+				       &up, &ip, &netmask);
+		if (rc != 0) {
+			CERROR("Net %s can't query interface %s: %d\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0], rc);
+			return -EPERM;
+		}
+
+		if (!up) {
+			CERROR("Net %s can't use interface %s: it's down\n",
+			       libcfs_net2str(net), ni->ni_interfaces[0]);
+			return -ENETDOWN;
+		}
+
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+
+	n = libcfs_ipif_enumerate(&names);
+	if (n <= 0) {
+		CERROR("Net %s can't enumerate interfaces: %d\n",
+		       libcfs_net2str(net), n);
+		return 0;
+	}
+
+	for (i = 0; i < n; i++) {
+		if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+			continue;
+
+		rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+		if (rc != 0) {
+			CWARN("Net %s can't query interface %s: %d\n",
+			      libcfs_net2str(net), names[i], rc);
+			continue;
+		}
+
+		if (!up) {
+			CWARN("Net %s ignoring interface %s (down)\n",
+			      libcfs_net2str(net), names[i]);
+			continue;
+		}
+
+		libcfs_ipif_free_enumeration(names, n);
+		ni->ni_nid = LNET_MKNID(net, ip);
+		return 0;
+	}
+
+	CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+	libcfs_ipif_free_enumeration(names, n);
+	return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 000000000000..78297a7d94e8
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If an parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+	    lnet_handle_eq_t *handle)
+{
+	lnet_eq_t     *eq;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	/* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+	 * overflow, they don't skip entries, so the queue has the same
+	 * apparent capacity at all times */
+
+	count = cfs_power2_roundup(count);
+
+	if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+		CWARN("EQ callback is guaranteed to get every event, "
+		      "do you still want to set eqcount %d for polling "
+		      "event which will have locking overhead? "
+		      "Please contact with developer to confirm\n", count);
+	}
+
+	/* count can be 0 if only need callback, we can eliminate
+	 * overhead of enqueue event */
+	if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+		return -EINVAL;
+
+	eq = lnet_eq_alloc();
+	if (eq == NULL)
+		return -ENOMEM;
+
+	if (count != 0) {
+		LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+		if (eq->eq_events == NULL)
+			goto failed;
+		/* NB allocator has set all event sequence numbers to 0,
+		 * so all them should be earlier than eq_deq_seq */
+	}
+
+	eq->eq_deq_seq = 1;
+	eq->eq_enq_seq = 1;
+	eq->eq_size = count;
+	eq->eq_callback = callback;
+
+	eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+				       sizeof(*eq->eq_refs[0]));
+	if (eq->eq_refs == NULL)
+		goto failed;
+
+	/* MUST hold both exclusive lnet_res_lock */
+	lnet_res_lock(LNET_LOCK_EX);
+	/* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+	 * both EQ lookup and poll event with only lnet_eq_wait_lock */
+	lnet_eq_wait_lock();
+
+	lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+	list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	lnet_eq2handle(handle, eq);
+	return 0;
+
+failed:
+	if (eq->eq_events != NULL)
+		LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+	if (eq->eq_refs != NULL)
+		cfs_percpt_free(eq->eq_refs);
+
+	lnet_eq_free(eq);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY  If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+	struct lnet_eq	*eq;
+	lnet_event_t	*events = NULL;
+	int		**refs = NULL;
+	int		*ref;
+	int		rc = 0;
+	int		size = 0;
+	int		i;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	lnet_res_lock(LNET_LOCK_EX);
+	/* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+	 * both EQ lookup and poll event with only lnet_eq_wait_lock */
+	lnet_eq_wait_lock();
+
+	eq = lnet_handle2eq(&eqh);
+	if (eq == NULL) {
+		rc = -ENOENT;
+		goto out;
+	}
+
+	cfs_percpt_for_each(ref, i, eq->eq_refs) {
+		LASSERT(*ref >= 0);
+		if (*ref == 0)
+			continue;
+
+		CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+		       i, *ref);
+		rc = -EBUSY;
+		goto out;
+	}
+
+	/* stash for free after lock dropped */
+	events	= eq->eq_events;
+	size	= eq->eq_size;
+	refs	= eq->eq_refs;
+
+	lnet_res_lh_invalidate(&eq->eq_lh);
+	list_del(&eq->eq_list);
+	lnet_eq_free_locked(eq);
+ out:
+	lnet_eq_wait_unlock();
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	if (events != NULL)
+		LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+	if (refs != NULL)
+		cfs_percpt_free(refs);
+
+	return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	/* MUST called with resource lock hold but w/o lnet_eq_wait_lock */
+	int index;
+
+	if (eq->eq_size == 0) {
+		LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+		eq->eq_callback(ev);
+		return;
+	}
+
+	lnet_eq_wait_lock();
+	ev->sequence = eq->eq_enq_seq++;
+
+	LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+	index = ev->sequence & (eq->eq_size - 1);
+
+	eq->eq_events[index] = *ev;
+
+	if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+		eq->eq_callback(ev);
+
+	/* Wake anyone waiting in LNetEQPoll() */
+	if (waitqueue_active(&the_lnet.ln_eq_waitq))
+		wake_up_all(&the_lnet.ln_eq_waitq);
+	lnet_eq_wait_unlock();
+}
+
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+	int		new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+	lnet_event_t	*new_event = &eq->eq_events[new_index];
+	int		rc;
+	ENTRY;
+
+	/* must called with lnet_eq_wait_lock hold */
+	if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+		RETURN(0);
+
+	/* We've got a new event... */
+	*ev = *new_event;
+
+	CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+	       new_event, eq->eq_deq_seq, eq->eq_size);
+
+	/* ...but did it overwrite an event we've not seen yet? */
+	if (eq->eq_deq_seq == new_event->sequence) {
+		rc = 1;
+	} else {
+		/* don't complain with CERROR: some EQs are sized small
+		 * anyway; if it's important, the caller should complain */
+		CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+		       eq->eq_deq_seq, new_event->sequence);
+		rc = -EOVERFLOW;
+	}
+
+	eq->eq_deq_seq = new_event->sequence + 1;
+	RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0	  No pending event in the EQ.
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int which;
+
+	return LNetEQPoll(&eventq, 1, 0,
+			 event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1	  Indicates success.
+ * \retval -ENOENT    If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+	int which;
+
+	return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+			 event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+	int		tms = *timeout_ms;
+	int		wait;
+	wait_queue_t  wl;
+	cfs_time_t      now;
+
+	if (tms == 0)
+		return -1; /* don't want to wait and no new event */
+
+	init_waitqueue_entry_current(&wl);
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	lnet_eq_wait_unlock();
+
+	if (tms < 0) {
+		waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+	} else {
+		struct timeval tv;
+
+		now = cfs_time_current();
+		waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+				    cfs_time_seconds(tms) / 1000);
+		cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+		tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+		if (tms < 0) /* no more wait but may have new event */
+			tms = 0;
+	}
+
+	wait = tms != 0; /* might need to call here again */
+	*timeout_ms = tms;
+
+	lnet_eq_wait_lock();
+	remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+	return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0	  No pending event in the EQs after timeout.
+ * \retval 1	  Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT    If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+	   lnet_event_t *event, int *which)
+{
+	int	wait = 1;
+	int	rc;
+	int	i;
+	ENTRY;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (neq < 1)
+		RETURN(-ENOENT);
+
+	lnet_eq_wait_lock();
+
+	for (;;) {
+		for (i = 0; i < neq; i++) {
+			lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+			if (eq == NULL) {
+				lnet_eq_wait_unlock();
+				RETURN(-ENOENT);
+			}
+
+			rc = lnet_eq_dequeue_event(eq, event);
+			if (rc != 0) {
+				lnet_eq_wait_unlock();
+				*which = i;
+				RETURN(rc);
+			}
+		}
+
+		if (wait == 0)
+			break;
+
+		/*
+		 * return value of lnet_eq_wait_locked:
+		 * -1 : did nothing and it's sure no new event
+		 *  1 : sleep inside and wait until new event
+		 *  0 : don't want to wait anymore, but might have new event
+		 *      so need to call dequeue again
+		 */
+		wait = lnet_eq_wait_locked(&timeout_ms);
+		if (wait < 0) /* no new event */
+			break;
+	}
+
+	lnet_eq_wait_unlock();
+	RETURN(0);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 000000000000..ae643f26933b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+	if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+		/* first unlink attempt... */
+		lnet_me_t *me = md->md_me;
+
+		md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+		/* Disassociate from ME (if any), and unlink it if it was created
+		 * with LNET_UNLINK */
+		if (me != NULL) {
+			/* detach MD from portal */
+			lnet_ptl_detach_md(me, md);
+			if (me->me_unlink == LNET_UNLINK)
+				lnet_me_unlink(me);
+		}
+
+		/* ensure all future handle lookups fail */
+		lnet_res_lh_invalidate(&md->md_lh);
+	}
+
+	if (md->md_refcount != 0) {
+		CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+		return;
+	}
+
+	CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+	if (md->md_eq != NULL) {
+		int	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+		LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+		(*md->md_eq->eq_refs[cpt])--;
+	}
+
+	LASSERT(!list_empty(&md->md_list));
+	list_del_init(&md->md_list);
+	lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+	int	  i;
+	unsigned int niov;
+	int	  total_length = 0;
+
+	lmd->md_me = NULL;
+	lmd->md_start = umd->start;
+	lmd->md_offset = 0;
+	lmd->md_max_size = umd->max_size;
+	lmd->md_options = umd->options;
+	lmd->md_user_ptr = umd->user_ptr;
+	lmd->md_eq = NULL;
+	lmd->md_threshold = umd->threshold;
+	lmd->md_refcount = 0;
+	lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+	if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+		if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+			return -EINVAL;
+
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.iov, umd->start,
+		       niov * sizeof (lmd->md_iov.iov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the base address on trust */
+			if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+				return -EINVAL;
+
+			total_length += lmd->md_iov.iov[i].iov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) // illegal max_size
+			return -EINVAL;
+
+	} else if ((umd->options & LNET_MD_KIOV) != 0) {
+		lmd->md_niov = niov = umd->length;
+		memcpy(lmd->md_iov.kiov, umd->start,
+		       niov * sizeof (lmd->md_iov.kiov[0]));
+
+		for (i = 0; i < (int)niov; i++) {
+			/* We take the page pointer on trust */
+			if (lmd->md_iov.kiov[i].kiov_offset +
+			    lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE )
+				return -EINVAL; /* invalid length */
+
+			total_length += lmd->md_iov.kiov[i].kiov_len;
+		}
+
+		lmd->md_length = total_length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > total_length)) // illegal max_size
+			return -EINVAL;
+	} else {   /* contiguous */
+		lmd->md_length = umd->length;
+		lmd->md_niov = niov = 1;
+		lmd->md_iov.iov[0].iov_base = umd->start;
+		lmd->md_iov.iov[0].iov_len = umd->length;
+
+		if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+		    (umd->max_size < 0 ||
+		     umd->max_size > (int)umd->length)) // illegal max_size
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+	struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+	/* NB we are passed an allocated, but inactive md.
+	 * if we return success, caller may lnet_md_unlink() it.
+	 * otherwise caller may only lnet_md_free() it.
+	 */
+	/* This implementation doesn't know how to create START events or
+	 * disable END events.  Best to LASSERT our caller is compliant so
+	 * we find out quickly...  */
+	/*  TODO - reevaluate what should be here in light of
+	 * the removal of the start and end events
+	 * maybe there we shouldn't even allow LNET_EQ_NONE!)
+	 * LASSERT (eq == NULL);
+	 */
+	if (!LNetHandleIsInvalid(eq_handle)) {
+		md->md_eq = lnet_handle2eq(&eq_handle);
+
+		if (md->md_eq == NULL)
+			return -ENOENT;
+
+		(*md->md_eq->eq_refs[cpt])++;
+	}
+
+	lnet_res_lh_initialize(container, &md->md_lh);
+
+	LASSERT(list_empty(&md->md_list));
+	list_add(&md->md_list, &container->rec_active);
+
+	return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+	/* NB this doesn't copy out all the iov entries so when a
+	 * discontiguous MD is copied out, the target gets to know the
+	 * original iov pointer (in start) and the number of entries it had
+	 * and that's all.
+	 */
+	umd->start = lmd->md_start;
+	umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+		      lmd->md_length : lmd->md_niov;
+	umd->threshold = lmd->md_threshold;
+	umd->max_size = lmd->md_max_size;
+	umd->options = lmd->md_options;
+	umd->user_ptr = lmd->md_user_ptr;
+	lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+	if (umd->start == NULL && umd->length != 0) {
+		CERROR("MD start pointer can not be NULL with length %u\n",
+		       umd->length);
+		return -EINVAL;
+	}
+
+	if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+	    umd->length > LNET_MAX_IOV) {
+		CERROR("Invalid option: too many fragments %u, %d max\n",
+		       umd->length, LNET_MAX_IOV);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY  If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+	     lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	LIST_HEAD		(matches);
+	LIST_HEAD		(drops);
+	struct lnet_me		*me;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+		CERROR("Invalid option: no MD_OP set\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	rc = lnet_md_build(md, &umd, unlink);
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+
+	lnet_res_lock(cpt);
+	if (rc != 0)
+		goto failed;
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL)
+		rc = -ENOENT;
+	else if (me->me_md != NULL)
+		rc = -EBUSY;
+	else
+		rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+	if (rc != 0)
+		goto failed;
+
+	/* attach this MD to portal of ME and check if it matches any
+	 * blocked msgs on this portal */
+	lnet_ptl_attach_md(me, md, &matches, &drops);
+
+	lnet_md2handle(handle, md);
+
+	lnet_res_unlock(cpt);
+
+	lnet_drop_delayed_msg_list(&drops, "Bad match");
+	lnet_recv_delayed_msg_list(&matches);
+
+	return 0;
+
+ failed:
+	lnet_md_free_locked(md);
+
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+	lnet_libmd_t	*md;
+	int		cpt;
+	int		rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (lnet_md_validate(&umd) != 0)
+		return -EINVAL;
+
+	if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+		CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+		return -EINVAL;
+	}
+
+	md = lnet_md_alloc(&umd);
+	if (md == NULL)
+		return -ENOMEM;
+
+	rc = lnet_md_build(md, &umd, unlink);
+
+	cpt = lnet_res_lock_current();
+	if (rc != 0)
+		goto failed;
+
+	rc = lnet_md_link(md, umd.eq_handle, cpt);
+	if (rc != 0)
+		goto failed;
+
+	lnet_md2handle(handle, md);
+
+	lnet_res_unlock(cpt);
+	return 0;
+
+ failed:
+	lnet_md_free_locked(md);
+
+	lnet_res_unlock(cpt);
+	return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ *   and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ *   returns, and the unlinked event will be piggybacked on the event of
+ *   the completion of the last operation by setting the unlinked field of
+ *   the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+	lnet_event_t	ev;
+	lnet_libmd_t	*md;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL) {
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	/* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+	 * when the NAL is done, the completion event flags that the MD was
+	 * unlinked.  Otherwise, we enqueue an event now... */
+
+	if (md->md_eq != NULL &&
+	    md->md_refcount == 0) {
+		lnet_build_unlink_event(md, &ev);
+		lnet_eq_enqueue_event(md->md_eq, &ev);
+	}
+
+	lnet_md_unlink(md);
+
+	lnet_res_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 000000000000..0081075cabee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-me.c
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_match_table *mtable;
+	struct lnet_me		*me;
+	struct list_head		*head;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	if ((int)portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	mtable = lnet_mt_of_attach(portal, match_id,
+				   match_bits, ignore_bits, pos);
+	if (mtable == NULL) /* can't match portal type */
+		return -EPERM;
+
+	me = lnet_me_alloc();
+	if (me == NULL)
+		return -ENOMEM;
+
+	lnet_res_lock(mtable->mt_cpt);
+
+	me->me_portal = portal;
+	me->me_match_id = match_id;
+	me->me_match_bits = match_bits;
+	me->me_ignore_bits = ignore_bits;
+	me->me_unlink = unlink;
+	me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+			       &me->me_lh);
+	if (ignore_bits != 0)
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+	me->me_pos = head - &mtable->mt_mhash[0];
+	if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+		list_add_tail(&me->me_list, head);
+	else
+		list_add(&me->me_list, head);
+
+	lnet_me2handle(handle, me);
+
+	lnet_res_unlock(mtable->mt_cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create and a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0       On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+	     lnet_process_id_t match_id,
+	     __u64 match_bits, __u64 ignore_bits,
+	     lnet_unlink_t unlink, lnet_ins_pos_t pos,
+	     lnet_handle_me_t *handle)
+{
+	struct lnet_me		*current_me;
+	struct lnet_me		*new_me;
+	struct lnet_portal	*ptl;
+	int			cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	if (pos == LNET_INS_LOCAL)
+		return -EPERM;
+
+	new_me = lnet_me_alloc();
+	if (new_me == NULL)
+		return -ENOMEM;
+
+	cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+	lnet_res_lock(cpt);
+
+	current_me = lnet_handle2me(&current_meh);
+	if (current_me == NULL) {
+		lnet_me_free_locked(new_me);
+
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+	ptl = the_lnet.ln_portals[current_me->me_portal];
+	if (lnet_ptl_is_unique(ptl)) {
+		/* nosense to insertion on unique portal */
+		lnet_me_free_locked(new_me);
+		lnet_res_unlock(cpt);
+		return -EPERM;
+	}
+
+	new_me->me_pos = current_me->me_pos;
+	new_me->me_portal = current_me->me_portal;
+	new_me->me_match_id = match_id;
+	new_me->me_match_bits = match_bits;
+	new_me->me_ignore_bits = ignore_bits;
+	new_me->me_unlink = unlink;
+	new_me->me_md = NULL;
+
+	lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+	if (pos == LNET_INS_AFTER)
+		list_add(&new_me->me_list, &current_me->me_list);
+	else
+		list_add_tail(&new_me->me_list, &current_me->me_list);
+
+	lnet_me2handle(handle, new_me);
+
+	lnet_res_unlock(cpt);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0       On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+	lnet_me_t	*me;
+	lnet_libmd_t	*md;
+	lnet_event_t	ev;
+	int		cpt;
+
+	LASSERT(the_lnet.ln_init);
+	LASSERT(the_lnet.ln_refcount > 0);
+
+	cpt = lnet_cpt_of_cookie(meh.cookie);
+	lnet_res_lock(cpt);
+
+	me = lnet_handle2me(&meh);
+	if (me == NULL) {
+		lnet_res_unlock(cpt);
+		return -ENOENT;
+	}
+
+	md = me->me_md;
+	if (md != NULL &&
+	    md->md_eq != NULL &&
+	    md->md_refcount == 0) {
+		lnet_build_unlink_event(md, &ev);
+		lnet_eq_enqueue_event(md->md_eq, &ev);
+	}
+
+	lnet_me_unlink(me);
+
+	lnet_res_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+	list_del(&me->me_list);
+
+	if (me->me_md != NULL) {
+		lnet_libmd_t *md = me->me_md;
+
+		/* detach MD from portal of this ME */
+		lnet_ptl_detach_md(me, md);
+		lnet_md_unlink(md);
+	}
+
+	lnet_res_lh_invalidate(&me->me_lh);
+	lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+	CWARN("Match Entry %p ("LPX64")\n", me,
+	      me->me_lh.lh_cookie);
+
+	CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+	      me->me_match_bits, me->me_ignore_bits);
+
+	CWARN("\tMD\t= %p\n", me->md);
+	CWARN("\tprev\t= %p\n",
+	      list_entry(me->me_list.prev, lnet_me_t, me_list));
+	CWARN("\tnext\t= %p\n",
+	      list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 000000000000..49b0f1287a69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+		"Reserved");
+
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+	lnet_test_peer_t  *tp;
+	struct list_head	*el;
+	struct list_head	*next;
+	struct list_head	 cull;
+
+	LASSERT (the_lnet.ln_init);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	if (threshold != 0) {
+		/* Adding a new entry */
+		LIBCFS_ALLOC(tp, sizeof(*tp));
+		if (tp == NULL)
+			return -ENOMEM;
+
+		tp->tp_nid = nid;
+		tp->tp_threshold = threshold;
+
+		lnet_net_lock(0);
+		list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+		lnet_net_unlock(0);
+		return 0;
+	}
+
+	/* removing entries */
+	INIT_LIST_HEAD(&cull);
+
+	lnet_net_lock(0);
+
+	list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+		tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+		if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+		    nid == LNET_NID_ANY ||       /* removing all entries */
+		    tp->tp_nid == nid)	  /* matched this one */
+		{
+			list_del (&tp->tp_list);
+			list_add (&tp->tp_list, &cull);
+		}
+	}
+
+	lnet_net_unlock(0);
+
+	while (!list_empty (&cull)) {
+		tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+		list_del (&tp->tp_list);
+		LIBCFS_FREE(tp, sizeof (*tp));
+	}
+	return 0;
+}
+
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+	lnet_test_peer_t *tp;
+	struct list_head       *el;
+	struct list_head       *next;
+	struct list_head	cull;
+	int	       fail = 0;
+
+	INIT_LIST_HEAD (&cull);
+
+	/* NB: use lnet_net_lock(0) to serialize operations on test peers */
+	lnet_net_lock(0);
+
+	list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+		tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+		if (tp->tp_threshold == 0) {
+			/* zombie entry */
+			if (outgoing) {
+				/* only cull zombies on outgoing tests,
+				 * since we may be at interrupt priority on
+				 * incoming messages. */
+				list_del (&tp->tp_list);
+				list_add (&tp->tp_list, &cull);
+			}
+			continue;
+		}
+
+		if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+		    nid == tp->tp_nid) {	/* fail this peer */
+			fail = 1;
+
+			if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+				tp->tp_threshold--;
+				if (outgoing &&
+				    tp->tp_threshold == 0) {
+					/* see above */
+					list_del (&tp->tp_list);
+					list_add (&tp->tp_list, &cull);
+				}
+			}
+			break;
+		}
+	}
+
+	lnet_net_unlock(0);
+
+	while (!list_empty (&cull)) {
+		tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+		list_del (&tp->tp_list);
+
+		LIBCFS_FREE(tp, sizeof (*tp));
+	}
+
+	return (fail);
+}
+
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+	unsigned int nob = 0;
+
+	while (niov-- > 0)
+		nob += (iov++)->iov_len;
+
+	return (nob);
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+		   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+		   unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int  this_nob;
+
+	if (nob == 0)
+		return;
+
+	/* skip complete frags before 'doffset' */
+	LASSERT (ndiov > 0);
+	while (doffset >= diov->iov_len) {
+		doffset -= diov->iov_len;
+		diov++;
+		ndiov--;
+		LASSERT (ndiov > 0);
+	}
+
+	/* skip complete frags before 'soffset' */
+	LASSERT (nsiov > 0);
+	while (soffset >= siov->iov_len) {
+		soffset -= siov->iov_len;
+		siov++;
+		nsiov--;
+		LASSERT (nsiov > 0);
+	}
+
+	do {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+		this_nob = MIN(diov->iov_len - doffset,
+			       siov->iov_len - soffset);
+		this_nob = MIN(this_nob, nob);
+
+		memcpy ((char *)diov->iov_base + doffset,
+			(char *)siov->iov_base + soffset, this_nob);
+		nob -= this_nob;
+
+		if (diov->iov_len > doffset + this_nob) {
+			doffset += this_nob;
+		} else {
+			diov++;
+			ndiov--;
+			doffset = 0;
+		}
+
+		if (siov->iov_len > soffset + this_nob) {
+			soffset += this_nob;
+		} else {
+			siov++;
+			nsiov--;
+			soffset = 0;
+		}
+	} while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+		  int src_niov, struct iovec *src,
+		  unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    frag_len;
+	unsigned int    niov;
+
+	if (len == 0)			   /* no data => */
+		return (0);		     /* no frags */
+
+	LASSERT (src_niov > 0);
+	while (offset >= src->iov_len) {      /* skip initial frags */
+		offset -= src->iov_len;
+		src_niov--;
+		src++;
+		LASSERT (src_niov > 0);
+	}
+
+	niov = 1;
+	for (;;) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)niov <= dst_niov);
+
+		frag_len = src->iov_len - offset;
+		dst->iov_base = ((char *)src->iov_base) + offset;
+
+		if (len <= frag_len) {
+			dst->iov_len = len;
+			return (niov);
+		}
+
+		dst->iov_len = frag_len;
+
+		len -= frag_len;
+		dst++;
+		src++;
+		niov++;
+		src_niov--;
+		offset = 0;
+	}
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+	unsigned int  nob = 0;
+
+	while (niov-- > 0)
+		nob += (kiov++)->kiov_len;
+
+	return (nob);
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+		     unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+		     unsigned int nob)
+{
+	/* NB diov, siov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *daddr = NULL;
+	char	   *saddr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	LASSERT (ndiov > 0);
+	while (doffset >= diov->kiov_len) {
+		doffset -= diov->kiov_len;
+		diov++;
+		ndiov--;
+		LASSERT (ndiov > 0);
+	}
+
+	LASSERT (nsiov > 0);
+	while (soffset >= siov->kiov_len) {
+		soffset -= siov->kiov_len;
+		siov++;
+		nsiov--;
+		LASSERT (nsiov > 0);
+	}
+
+	do {
+		LASSERT (ndiov > 0);
+		LASSERT (nsiov > 0);
+		this_nob = MIN(diov->kiov_len - doffset,
+			       siov->kiov_len - soffset);
+		this_nob = MIN(this_nob, nob);
+
+		if (daddr == NULL)
+			daddr = ((char *)kmap(diov->kiov_page)) +
+				diov->kiov_offset + doffset;
+		if (saddr == NULL)
+			saddr = ((char *)kmap(siov->kiov_page)) +
+				siov->kiov_offset + soffset;
+
+		/* Vanishing risk of kmap deadlock when mapping 2 pages.
+		 * However in practice at least one of the kiovs will be mapped
+		 * kernel pages and the map/unmap will be NOOPs */
+
+		memcpy (daddr, saddr, this_nob);
+		nob -= this_nob;
+
+		if (diov->kiov_len > doffset + this_nob) {
+			daddr += this_nob;
+			doffset += this_nob;
+		} else {
+			kunmap(diov->kiov_page);
+			daddr = NULL;
+			diov++;
+			ndiov--;
+			doffset = 0;
+		}
+
+		if (siov->kiov_len > soffset + this_nob) {
+			saddr += this_nob;
+			soffset += this_nob;
+		} else {
+			kunmap(siov->kiov_page);
+			saddr = NULL;
+			siov++;
+			nsiov--;
+			soffset = 0;
+		}
+	} while (nob > 0);
+
+	if (daddr != NULL)
+		kunmap(diov->kiov_page);
+	if (saddr != NULL)
+		kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int nob)
+{
+	/* NB iov, kiov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	do {
+		LASSERT (niov > 0);
+		LASSERT (nkiov > 0);
+		this_nob = MIN(iov->iov_len - iovoffset,
+			       kiov->kiov_len - kiovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+		nob -= this_nob;
+
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+	} while (nob > 0);
+
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+		    unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+		    unsigned int nob)
+{
+	/* NB kiov, iov are READ-ONLY */
+	unsigned int    this_nob;
+	char	   *addr = NULL;
+
+	if (nob == 0)
+		return;
+
+	LASSERT (!in_interrupt ());
+
+	LASSERT (nkiov > 0);
+	while (kiovoffset >= kiov->kiov_len) {
+		kiovoffset -= kiov->kiov_len;
+		kiov++;
+		nkiov--;
+		LASSERT (nkiov > 0);
+	}
+
+	LASSERT (niov > 0);
+	while (iovoffset >= iov->iov_len) {
+		iovoffset -= iov->iov_len;
+		iov++;
+		niov--;
+		LASSERT (niov > 0);
+	}
+
+	do {
+		LASSERT (nkiov > 0);
+		LASSERT (niov > 0);
+		this_nob = MIN(kiov->kiov_len - kiovoffset,
+			       iov->iov_len - iovoffset);
+		this_nob = MIN(this_nob, nob);
+
+		if (addr == NULL)
+			addr = ((char *)kmap(kiov->kiov_page)) +
+				kiov->kiov_offset + kiovoffset;
+
+		memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+		nob -= this_nob;
+
+		if (kiov->kiov_len > kiovoffset + this_nob) {
+			addr += this_nob;
+			kiovoffset += this_nob;
+		} else {
+			kunmap(kiov->kiov_page);
+			addr = NULL;
+			kiov++;
+			nkiov--;
+			kiovoffset = 0;
+		}
+
+		if (iov->iov_len > iovoffset + this_nob) {
+			iovoffset += this_nob;
+		} else {
+			iov++;
+			niov--;
+			iovoffset = 0;
+		}
+	} while (nob > 0);
+
+	if (addr != NULL)
+		kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+		   int src_niov, lnet_kiov_t *src,
+		   unsigned int offset, unsigned int len)
+{
+	/* Initialise 'dst' to the subset of 'src' starting at 'offset',
+	 * for exactly 'len' bytes, and return the number of entries.
+	 * NB not destructive to 'src' */
+	unsigned int    frag_len;
+	unsigned int    niov;
+
+	if (len == 0)			   /* no data => */
+		return (0);		     /* no frags */
+
+	LASSERT (src_niov > 0);
+	while (offset >= src->kiov_len) {      /* skip initial frags */
+		offset -= src->kiov_len;
+		src_niov--;
+		src++;
+		LASSERT (src_niov > 0);
+	}
+
+	niov = 1;
+	for (;;) {
+		LASSERT (src_niov > 0);
+		LASSERT ((int)niov <= dst_niov);
+
+		frag_len = src->kiov_len - offset;
+		dst->kiov_page = src->kiov_page;
+		dst->kiov_offset = src->kiov_offset + offset;
+
+		if (len <= frag_len) {
+			dst->kiov_len = len;
+			LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+			return (niov);
+		}
+
+		dst->kiov_len = frag_len;
+		LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+		len -= frag_len;
+		dst++;
+		src++;
+		niov++;
+		src_niov--;
+		offset = 0;
+	}
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+	     unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	unsigned int  niov = 0;
+	struct iovec *iov = NULL;
+	lnet_kiov_t  *kiov = NULL;
+	int	   rc;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (mlen == 0 || msg != NULL);
+
+	if (msg != NULL) {
+		LASSERT(msg->msg_receiving);
+		LASSERT(!msg->msg_sending);
+		LASSERT(rlen == msg->msg_len);
+		LASSERT(mlen <= msg->msg_len);
+		LASSERT(msg->msg_offset == offset);
+		LASSERT(msg->msg_wanted == mlen);
+
+		msg->msg_receiving = 0;
+
+		if (mlen != 0) {
+			niov = msg->msg_niov;
+			iov  = msg->msg_iov;
+			kiov = msg->msg_kiov;
+
+			LASSERT (niov > 0);
+			LASSERT ((iov == NULL) != (kiov == NULL));
+		}
+	}
+
+	rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+				    niov, iov, kiov, offset, mlen, rlen);
+	if (rc < 0)
+		lnet_finalize(ni, msg, rc);
+}
+
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+	lnet_libmd_t *md = msg->msg_md;
+
+	LASSERT (msg->msg_len > 0);
+	LASSERT (!msg->msg_routing);
+	LASSERT (md != NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+
+	msg->msg_niov = md->md_niov;
+	if ((md->md_options & LNET_MD_KIOV) != 0)
+		msg->msg_kiov = md->md_iov.kiov;
+	else
+		msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+	       unsigned int offset, unsigned int len)
+{
+	msg->msg_type = type;
+	msg->msg_target = target;
+	msg->msg_len = len;
+	msg->msg_offset = offset;
+
+	if (len != 0)
+		lnet_setpayloadbuffer(msg);
+
+	memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+	msg->msg_hdr.type	   = cpu_to_le32(type);
+	msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
+	msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+	/* src_nid will be set later */
+	msg->msg_hdr.src_pid	= cpu_to_le32(the_lnet.ln_pid);
+	msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	void   *priv = msg->msg_private;
+	int     rc;
+
+	LASSERT (!in_interrupt ());
+	LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+		 (msg->msg_txcredit && msg->msg_peertxcredit));
+
+	rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+	if (rc < 0)
+		lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	rc;
+
+	LASSERT(!msg->msg_sending);
+	LASSERT(msg->msg_receiving);
+	LASSERT(!msg->msg_rx_ready_delay);
+	LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+	msg->msg_rx_ready_delay = 1;
+	rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+					  &msg->msg_private);
+	if (rc != 0) {
+		CERROR("recv from %s / send to %s aborted: "
+		       "eager_recv failed %d\n",
+		       libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+		       libcfs_id2str(msg->msg_target), rc);
+		LASSERT(rc < 0); /* required by my callers */
+	}
+
+	return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	cfs_time_t last_alive = 0;
+
+	LASSERT(lnet_peer_aliveness_enabled(lp));
+	LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+	lnet_net_unlock(lp->lp_cpt);
+	(ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+	lnet_net_lock(lp->lp_cpt);
+
+	lp->lp_last_query = cfs_time_current();
+
+	if (last_alive != 0) /* NI has updated timestamp */
+		lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+	int	alive;
+	cfs_time_t deadline;
+
+	LASSERT (lnet_peer_aliveness_enabled(lp));
+
+	/* Trust lnet_notify() if it has more recent aliveness news, but
+	 * ignore the initial assumed death (see lnet_peers_start_down()).
+	 */
+	if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+	    cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+		return 0;
+
+	deadline = cfs_time_add(lp->lp_last_alive,
+				cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+	alive = cfs_time_after(deadline, now);
+
+	/* Update obsolete lp_alive except for routers assumed to be dead
+	 * initially, because router checker would update aliveness in this
+	 * case, and moreover lp_last_alive at peer creation is assumed.
+	 */
+	if (alive && !lp->lp_alive &&
+	    !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+		lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+	return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ *     may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+	cfs_time_t now = cfs_time_current();
+
+	if (!lnet_peer_aliveness_enabled(lp))
+		return -ENODEV;
+
+	if (lnet_peer_is_alive(lp, now))
+		return 1;
+
+	/* Peer appears dead, but we should avoid frequent NI queries (at
+	 * most once per lnet_queryinterval seconds). */
+	if (lp->lp_last_query != 0) {
+		static const int lnet_queryinterval = 1;
+
+		cfs_time_t next_query =
+			   cfs_time_add(lp->lp_last_query,
+					cfs_time_seconds(lnet_queryinterval));
+
+		if (cfs_time_before(now, next_query)) {
+			if (lp->lp_alive)
+				CWARN("Unexpected aliveness of peer %s: "
+				      "%d < %d (%d/%d)\n",
+				      libcfs_nid2str(lp->lp_nid),
+				      (int)now, (int)next_query,
+				      lnet_queryinterval,
+				      lp->lp_ni->ni_peertimeout);
+			return 0;
+		}
+	}
+
+	/* query NI for latest aliveness news */
+	lnet_ni_query_locked(lp->lp_ni, lp);
+
+	if (lnet_peer_is_alive(lp, now))
+		return 1;
+
+	lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+	return 0;
+}
+
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+	/* lnet_send is going to lnet_net_unlock immediately after this,
+	 * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+	 * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+	 * appears dead, and 0 if sent or OK to send */
+	struct lnet_peer	*lp = msg->msg_txpeer;
+	struct lnet_ni		*ni = lp->lp_ni;
+	struct lnet_tx_queue	*tq;
+	int			cpt;
+
+	/* non-lnet_send() callers have checked before */
+	LASSERT(!do_send || msg->msg_tx_delayed);
+	LASSERT(!msg->msg_receiving);
+	LASSERT(msg->msg_tx_committed);
+
+	cpt = msg->msg_tx_cpt;
+	tq = ni->ni_tx_queues[cpt];
+
+	/* NB 'lp' is always the next hop */
+	if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+	    lnet_peer_alive_locked(lp) == 0) {
+		the_lnet.ln_counters[cpt]->drop_count++;
+		the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+		lnet_net_unlock(cpt);
+
+		CNETERR("Dropping message for %s: peer not alive\n",
+			libcfs_id2str(msg->msg_target));
+		if (do_send)
+			lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+		lnet_net_lock(cpt);
+		return EHOSTUNREACH;
+	}
+
+	if (!msg->msg_peertxcredit) {
+		LASSERT ((lp->lp_txcredits < 0) ==
+			 !list_empty(&lp->lp_txq));
+
+		msg->msg_peertxcredit = 1;
+		lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+		lp->lp_txcredits--;
+
+		if (lp->lp_txcredits < lp->lp_mintxcredits)
+			lp->lp_mintxcredits = lp->lp_txcredits;
+
+		if (lp->lp_txcredits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_txq);
+			return EAGAIN;
+		}
+	}
+
+	if (!msg->msg_txcredit) {
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		msg->msg_txcredit = 1;
+		tq->tq_credits--;
+
+		if (tq->tq_credits < tq->tq_credits_min)
+			tq->tq_credits_min = tq->tq_credits;
+
+		if (tq->tq_credits < 0) {
+			msg->msg_tx_delayed = 1;
+			list_add_tail(&msg->msg_list, &tq->tq_delayed);
+			return EAGAIN;
+		}
+	}
+
+	if (do_send) {
+		lnet_net_unlock(cpt);
+		lnet_ni_send(ni, msg);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+
+
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+	lnet_rtrbufpool_t	*rbp;
+	int			cpt;
+
+	LASSERT(msg->msg_rx_committed);
+
+	cpt = msg->msg_rx_cpt;
+	rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+	LASSERT(msg->msg_len <= LNET_MTU);
+	while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+		rbp++;
+		LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+	}
+
+	return rbp;
+}
+
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+	/* lnet_parse is going to lnet_net_unlock immediately after this, so it
+	 * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
+	 * return EAGAIN if msg blocked and 0 if received or OK to receive */
+	lnet_peer_t	 *lp = msg->msg_rxpeer;
+	lnet_rtrbufpool_t   *rbp;
+	lnet_rtrbuf_t       *rb;
+
+	LASSERT (msg->msg_iov == NULL);
+	LASSERT (msg->msg_kiov == NULL);
+	LASSERT (msg->msg_niov == 0);
+	LASSERT (msg->msg_routing);
+	LASSERT (msg->msg_receiving);
+	LASSERT (!msg->msg_sending);
+
+	/* non-lnet_parse callers only receive delayed messages */
+	LASSERT(!do_recv || msg->msg_rx_delayed);
+
+	if (!msg->msg_peerrtrcredit) {
+		LASSERT ((lp->lp_rtrcredits < 0) ==
+			 !list_empty(&lp->lp_rtrq));
+
+		msg->msg_peerrtrcredit = 1;
+		lp->lp_rtrcredits--;
+		if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+			lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+		if (lp->lp_rtrcredits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+			return EAGAIN;
+		}
+	}
+
+	rbp = lnet_msg2bufpool(msg);
+
+	if (!msg->msg_rtrcredit) {
+		LASSERT ((rbp->rbp_credits < 0) ==
+			 !list_empty(&rbp->rbp_msgs));
+
+		msg->msg_rtrcredit = 1;
+		rbp->rbp_credits--;
+		if (rbp->rbp_credits < rbp->rbp_mincredits)
+			rbp->rbp_mincredits = rbp->rbp_credits;
+
+		if (rbp->rbp_credits < 0) {
+			/* must have checked eager_recv before here */
+			LASSERT(msg->msg_rx_ready_delay);
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+			return EAGAIN;
+		}
+	}
+
+	LASSERT (!list_empty(&rbp->rbp_bufs));
+	rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+	list_del(&rb->rb_list);
+
+	msg->msg_niov = rbp->rbp_npages;
+	msg->msg_kiov = &rb->rb_kiov[0];
+
+	if (do_recv) {
+		int cpt = msg->msg_rx_cpt;
+
+		lnet_net_unlock(cpt);
+		lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+			     0, msg->msg_len, msg->msg_len);
+		lnet_net_lock(cpt);
+	}
+	return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*txpeer = msg->msg_txpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_txcredit) {
+		struct lnet_ni	     *ni = txpeer->lp_ni;
+		struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+		/* give back NI txcredits */
+		msg->msg_txcredit = 0;
+
+		LASSERT((tq->tq_credits < 0) ==
+			!list_empty(&tq->tq_delayed));
+
+		tq->tq_credits++;
+		if (tq->tq_credits <= 0) {
+			msg2 = list_entry(tq->tq_delayed.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer->lp_ni == ni);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peertxcredit) {
+		/* give back peer txcredits */
+		msg->msg_peertxcredit = 0;
+
+		LASSERT((txpeer->lp_txcredits < 0) ==
+			!list_empty(&txpeer->lp_txq));
+
+		txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+		LASSERT (txpeer->lp_txqnob >= 0);
+
+		txpeer->lp_txcredits++;
+		if (txpeer->lp_txcredits <= 0) {
+			msg2 = list_entry(txpeer->lp_txq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			LASSERT(msg2->msg_txpeer == txpeer);
+			LASSERT(msg2->msg_tx_delayed);
+
+			(void) lnet_post_send_locked(msg2, 1);
+		}
+	}
+
+	if (txpeer != NULL) {
+		msg->msg_txpeer = NULL;
+		lnet_peer_decref_locked(txpeer);
+	}
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+	lnet_peer_t	*rxpeer = msg->msg_rxpeer;
+	lnet_msg_t	*msg2;
+
+	if (msg->msg_rtrcredit) {
+		/* give back global router credits */
+		lnet_rtrbuf_t     *rb;
+		lnet_rtrbufpool_t *rbp;
+
+		/* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+		 * there until it gets one allocated, or aborts the wait
+		 * itself */
+		LASSERT (msg->msg_kiov != NULL);
+
+		rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+		rbp = rb->rb_pool;
+		LASSERT (rbp == lnet_msg2bufpool(msg));
+
+		msg->msg_kiov = NULL;
+		msg->msg_rtrcredit = 0;
+
+		LASSERT((rbp->rbp_credits < 0) ==
+			!list_empty(&rbp->rbp_msgs));
+		LASSERT((rbp->rbp_credits > 0) ==
+			!list_empty(&rbp->rbp_bufs));
+
+		list_add(&rb->rb_list, &rbp->rbp_bufs);
+		rbp->rbp_credits++;
+		if (rbp->rbp_credits <= 0) {
+			msg2 = list_entry(rbp->rbp_msgs.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+
+	if (msg->msg_peerrtrcredit) {
+		/* give back peer router credits */
+		msg->msg_peerrtrcredit = 0;
+
+		LASSERT((rxpeer->lp_rtrcredits < 0) ==
+			!list_empty(&rxpeer->lp_rtrq));
+
+		rxpeer->lp_rtrcredits++;
+		if (rxpeer->lp_rtrcredits <= 0) {
+			msg2 = list_entry(rxpeer->lp_rtrq.next,
+					      lnet_msg_t, msg_list);
+			list_del(&msg2->msg_list);
+
+			(void) lnet_post_routed_recv_locked(msg2, 1);
+		}
+	}
+	if (rxpeer != NULL) {
+		msg->msg_rxpeer = NULL;
+		lnet_peer_decref_locked(rxpeer);
+	}
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+	lnet_peer_t *p1 = r1->lr_gateway;
+	lnet_peer_t *p2 = r2->lr_gateway;
+
+	if (r1->lr_hops < r2->lr_hops)
+		return 1;
+
+	if (r1->lr_hops > r2->lr_hops)
+		return -1;
+
+	if (p1->lp_txqnob < p2->lp_txqnob)
+		return 1;
+
+	if (p1->lp_txqnob > p2->lp_txqnob)
+		return -1;
+
+	if (p1->lp_txcredits > p2->lp_txcredits)
+		return 1;
+
+	if (p1->lp_txcredits < p2->lp_txcredits)
+		return -1;
+
+	if (r1->lr_seq - r2->lr_seq <= 0)
+		return 1;
+
+	return -1;
+}
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*rtr;
+	lnet_route_t		*rtr_best;
+	lnet_route_t		*rtr_last;
+	struct lnet_peer	*lp_best;
+	struct lnet_peer	*lp;
+	int			rc;
+
+	/* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+	 * rtr_nid nid, otherwise find the best gateway I can use */
+
+	rnet = lnet_find_net_locked(LNET_NIDNET(target));
+	if (rnet == NULL)
+		return NULL;
+
+	lp_best = NULL;
+	rtr_best = rtr_last = NULL;
+	list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+		lp = rtr->lr_gateway;
+
+		if (!lp->lp_alive || /* gateway is down */
+		    ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+		     rtr->lr_downis != 0)) /* NI to target is down */
+			continue;
+
+		if (ni != NULL && lp->lp_ni != ni)
+			continue;
+
+		if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+			return lp;
+
+		if (lp_best == NULL) {
+			rtr_best = rtr_last = rtr;
+			lp_best = lp;
+			continue;
+		}
+
+		/* no protection on below fields, but it's harmless */
+		if (rtr_last->lr_seq - rtr->lr_seq < 0)
+			rtr_last = rtr;
+
+		rc = lnet_compare_routes(rtr, rtr_best);
+		if (rc < 0)
+			continue;
+
+		rtr_best = rtr;
+		lp_best = lp;
+	}
+
+	/* set sequence number on the best router to the latest sequence + 1
+	 * so we can round-robin all routers, it's race and inaccurate but
+	 * harmless and functional  */
+	if (rtr_best != NULL)
+		rtr_best->lr_seq = rtr_last->lr_seq + 1;
+	return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+	lnet_nid_t		dst_nid = msg->msg_target.nid;
+	struct lnet_ni		*src_ni;
+	struct lnet_ni		*local_ni;
+	struct lnet_peer	*lp;
+	int			cpt;
+	int			cpt2;
+	int			rc;
+
+	/* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+	 * but we might want to use pre-determined router for ACK/REPLY
+	 * in the future */
+	/* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+	LASSERT (msg->msg_txpeer == NULL);
+	LASSERT (!msg->msg_sending);
+	LASSERT (!msg->msg_target_is_router);
+	LASSERT (!msg->msg_receiving);
+
+	msg->msg_sending = 1;
+
+	LASSERT(!msg->msg_tx_committed);
+	cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	if (src_nid == LNET_NID_ANY) {
+		src_ni = NULL;
+	} else {
+		src_ni = lnet_nid2ni_locked(src_nid, cpt);
+		if (src_ni == NULL) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Can't send to %s: src %s is not a "
+				      "local nid\n", libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+		LASSERT (!msg->msg_routing);
+	}
+
+	/* Is this for someone on a local network? */
+	local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+	if (local_ni != NULL) {
+		if (src_ni == NULL) {
+			src_ni = local_ni;
+			src_nid = src_ni->ni_nid;
+		} else if (src_ni == local_ni) {
+			lnet_ni_decref_locked(local_ni, cpt);
+		} else {
+			lnet_ni_decref_locked(local_ni, cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("No route to %s via from %s\n",
+				      libcfs_nid2str(dst_nid),
+				      libcfs_nid2str(src_nid));
+			return -EINVAL;
+		}
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing)
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+		if (src_ni == the_lnet.ln_loni) {
+			/* No send credit hassles with LOLND */
+			lnet_net_unlock(cpt);
+			lnet_ni_send(src_ni, msg);
+
+			lnet_net_lock(cpt);
+			lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+			return 0;
+		}
+
+		rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+		/* lp has ref on src_ni; lose mine */
+		lnet_ni_decref_locked(src_ni, cpt);
+		if (rc != 0) {
+			lnet_net_unlock(cpt);
+			LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+				      libcfs_nid2str(dst_nid));
+			/* ENOMEM or shutting down */
+			return rc;
+		}
+		LASSERT (lp->lp_ni == src_ni);
+	} else {
+		/* sending to a remote network */
+		lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+		if (lp == NULL) {
+			if (src_ni != NULL)
+				lnet_ni_decref_locked(src_ni, cpt);
+			lnet_net_unlock(cpt);
+
+			LCONSOLE_WARN("No route to %s via %s "
+				      "(all routers down)\n",
+				      libcfs_id2str(msg->msg_target),
+				      libcfs_nid2str(src_nid));
+			return -EHOSTUNREACH;
+		}
+
+		/* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+		 * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+		 * pre-determined router, this can happen if router table
+		 * was changed when we release the lock */
+		if (rtr_nid != lp->lp_nid) {
+			cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+			if (cpt2 != cpt) {
+				if (src_ni != NULL)
+					lnet_ni_decref_locked(src_ni, cpt);
+				lnet_net_unlock(cpt);
+
+				rtr_nid = lp->lp_nid;
+				cpt = cpt2;
+				goto again;
+			}
+		}
+
+		CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+		       libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+		       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+		if (src_ni == NULL) {
+			src_ni = lp->lp_ni;
+			src_nid = src_ni->ni_nid;
+		} else {
+			LASSERT (src_ni == lp->lp_ni);
+			lnet_ni_decref_locked(src_ni, cpt);
+		}
+
+		lnet_peer_addref_locked(lp);
+
+		LASSERT(src_nid != LNET_NID_ANY);
+		lnet_msg_commit(msg, cpt);
+
+		if (!msg->msg_routing) {
+			/* I'm the source and now I know which NI to send on */
+			msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+		}
+
+		msg->msg_target_is_router = 1;
+		msg->msg_target.nid = lp->lp_nid;
+		msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+	}
+
+	/* 'lp' is our best choice of peer */
+
+	LASSERT (!msg->msg_peertxcredit);
+	LASSERT (!msg->msg_txcredit);
+	LASSERT (msg->msg_txpeer == NULL);
+
+	msg->msg_txpeer = lp;		   /* msg takes my ref on lp */
+
+	rc = lnet_post_send_locked(msg, 0);
+	lnet_net_unlock(cpt);
+
+	if (rc == EHOSTUNREACH)
+		return -EHOSTUNREACH;
+
+	if (rc == 0)
+		lnet_ni_send(src_ni, msg);
+
+	return 0;
+}
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+	lnet_net_lock(cpt);
+	the_lnet.ln_counters[cpt]->drop_count++;
+	the_lnet.ln_counters[cpt]->drop_length += nob;
+	lnet_net_unlock(cpt);
+
+	lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t	*hdr = &msg->msg_hdr;
+
+	if (msg->msg_wanted != 0)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+	/* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
+	 * it back into the ACK during lnet_finalize() */
+	msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+			(msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+	lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+		     msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	struct lnet_match_info	info;
+	int			rc;
+
+	/* Convert put fields to host byte order */
+	hdr->msg.put.match_bits	= le64_to_cpu(hdr->msg.put.match_bits);
+	hdr->msg.put.ptl_index	= le32_to_cpu(hdr->msg.put.ptl_index);
+	hdr->msg.put.offset	= le32_to_cpu(hdr->msg.put.offset);
+
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_opc	= LNET_MD_OP_PUT;
+	info.mi_portal	= hdr->msg.put.ptl_index;
+	info.mi_rlength	= hdr->payload_length;
+	info.mi_roffset	= hdr->msg.put.offset;
+	info.mi_mbits	= hdr->msg.put.match_bits;
+
+	msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+	rc = lnet_ptl_match_md(&info, msg);
+	switch (rc) {
+	default:
+		LBUG();
+
+	case LNET_MATCHMD_OK:
+		lnet_recv_put(ni, msg);
+		return 0;
+
+	case LNET_MATCHMD_NONE:
+		if (msg->msg_rx_delayed) /* attached on delayed list */
+			return 0;
+
+		rc = lnet_ni_eager_recv(ni, msg);
+		if (rc == 0)
+			goto again;
+		/* fall through */
+
+	case LNET_MATCHMD_DROP:
+		CNETERR("Dropping PUT from %s portal %d match "LPU64
+			" offset %d length %d: %d\n",
+			libcfs_id2str(info.mi_id), info.mi_portal,
+			info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+		return ENOENT;	/* +ve: OK but no match */
+	}
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+	struct lnet_match_info	info;
+	lnet_hdr_t		*hdr = &msg->msg_hdr;
+	lnet_handle_wire_t	reply_wmd;
+	int			rc;
+
+	/* Convert get fields to host byte order */
+	hdr->msg.get.match_bits	  = le64_to_cpu(hdr->msg.get.match_bits);
+	hdr->msg.get.ptl_index	  = le32_to_cpu(hdr->msg.get.ptl_index);
+	hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
+	hdr->msg.get.src_offset	  = le32_to_cpu(hdr->msg.get.src_offset);
+
+	info.mi_id.nid	= hdr->src_nid;
+	info.mi_id.pid	= hdr->src_pid;
+	info.mi_opc	= LNET_MD_OP_GET;
+	info.mi_portal	= hdr->msg.get.ptl_index;
+	info.mi_rlength	= hdr->msg.get.sink_length;
+	info.mi_roffset	= hdr->msg.get.src_offset;
+	info.mi_mbits	= hdr->msg.get.match_bits;
+
+	rc = lnet_ptl_match_md(&info, msg);
+	if (rc == LNET_MATCHMD_DROP) {
+		CNETERR("Dropping GET from %s portal %d match "LPU64
+			" offset %d length %d\n",
+			libcfs_id2str(info.mi_id), info.mi_portal,
+			info.mi_mbits, info.mi_roffset, info.mi_rlength);
+		return ENOENT;	/* +ve: OK but no match */
+	}
+
+	LASSERT(rc == LNET_MATCHMD_OK);
+
+	lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+	reply_wmd = hdr->msg.get.return_wmd;
+
+	lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+		       msg->msg_offset, msg->msg_wanted);
+
+	msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+	if (rdma_get) {
+		/* The LND completes the REPLY from her recv procedure */
+		lnet_ni_recv(ni, msg->msg_private, msg, 0,
+			     msg->msg_offset, msg->msg_len, msg->msg_len);
+		return 0;
+	}
+
+	lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+	msg->msg_receiving = 0;
+
+	rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		/* didn't get as far as lnet_ni_send() */
+		CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+		       libcfs_nid2str(ni->ni_nid),
+		       libcfs_id2str(info.mi_id), rc);
+
+		lnet_finalize(ni, msg, rc);
+	}
+
+	return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	void	     *private = msg->msg_private;
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int	       rlength;
+	int	       mlength;
+	int			cpt;
+
+	cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CNETERR("%s: Dropping REPLY from %s for %s "
+			"MD "LPX64"."LPX64"\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			(md == NULL) ? "invalid" : "inactive",
+			hdr->msg.reply.dst_wmd.wh_interface_cookie,
+			hdr->msg.reply.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve: OK but no match */
+	}
+
+	LASSERT (md->md_offset == 0);
+
+	rlength = hdr->payload_length;
+	mlength = MIN(rlength, (int)md->md_length);
+
+	if (mlength < rlength &&
+	    (md->md_options & LNET_MD_TRUNCATE) == 0) {
+		CNETERR("%s: Dropping REPLY from %s length %d "
+			"for MD "LPX64" would overflow (%d)\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+			rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+			mlength);
+		lnet_res_unlock(cpt);
+		return ENOENT;	  /* +ve: OK but no match */
+	}
+
+	CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+	lnet_msg_attach_md(msg, md, 0, mlength);
+
+	if (mlength != 0)
+		lnet_setpayloadbuffer(msg);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+	return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	lnet_hdr_t       *hdr = &msg->msg_hdr;
+	lnet_process_id_t src = {0};
+	lnet_libmd_t     *md;
+	int			cpt;
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	/* Convert ack fields to host byte order */
+	hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+	hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+	cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+	lnet_res_lock(cpt);
+
+	/* NB handles only looked up by creator (no flips) */
+	md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		/* Don't moan; this is expected */
+		CDEBUG(D_NET,
+		       "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+		       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+		       (md == NULL) ? "invalid" : "inactive",
+		       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		       hdr->msg.ack.dst_wmd.wh_object_cookie);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+		return ENOENT;		  /* +ve! */
+	}
+
+	CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+	       hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+	lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+	return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+	int	rc = 0;
+
+	if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+	    lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+		if (ni->ni_lnd->lnd_eager_recv == NULL) {
+			msg->msg_rx_ready_delay = 1;
+		} else {
+			lnet_net_unlock(msg->msg_rx_cpt);
+			rc = lnet_ni_eager_recv(ni, msg);
+			lnet_net_lock(msg->msg_rx_cpt);
+		}
+	}
+
+	if (rc == 0)
+		rc = lnet_post_routed_recv_locked(msg, 0);
+	return rc;
+}
+
+char *
+lnet_msgtyp2str (int type)
+{
+	switch (type) {
+	case LNET_MSG_ACK:
+		return ("ACK");
+	case LNET_MSG_PUT:
+		return ("PUT");
+	case LNET_MSG_GET:
+		return ("GET");
+	case LNET_MSG_REPLY:
+		return ("REPLY");
+	case LNET_MSG_HELLO:
+		return ("HELLO");
+	default:
+		return ("<UNKNOWN>");
+	}
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
+{
+	lnet_process_id_t src = {0};
+	lnet_process_id_t dst = {0};
+	char *type_str = lnet_msgtyp2str (hdr->type);
+
+	src.nid = hdr->src_nid;
+	src.pid = hdr->src_pid;
+
+	dst.nid = hdr->dest_nid;
+	dst.pid = hdr->dest_pid;
+
+	CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+	CWARN("    From %s\n", libcfs_id2str(src));
+	CWARN("    To   %s\n", libcfs_id2str(dst));
+
+	switch (hdr->type) {
+	default:
+		break;
+
+	case LNET_MSG_PUT:
+		CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n",
+		      hdr->msg.put.ptl_index,
+		      hdr->msg.put.ack_wmd.wh_interface_cookie,
+		      hdr->msg.put.ack_wmd.wh_object_cookie,
+		      hdr->msg.put.match_bits);
+		CWARN("    Length %d, offset %d, hdr data "LPX64"\n",
+		      hdr->payload_length, hdr->msg.put.offset,
+		      hdr->msg.put.hdr_data);
+		break;
+
+	case LNET_MSG_GET:
+		CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
+		      "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+		      hdr->msg.get.return_wmd.wh_interface_cookie,
+		      hdr->msg.get.return_wmd.wh_object_cookie,
+		      hdr->msg.get.match_bits);
+		CWARN("    Length %d, src offset %d\n",
+		      hdr->msg.get.sink_length,
+		      hdr->msg.get.src_offset);
+		break;
+
+	case LNET_MSG_ACK:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "manipulated length %d\n",
+		      hdr->msg.ack.dst_wmd.wh_interface_cookie,
+		      hdr->msg.ack.dst_wmd.wh_object_cookie,
+		      hdr->msg.ack.mlength);
+		break;
+
+	case LNET_MSG_REPLY:
+		CWARN("    dst md "LPX64"."LPX64", "
+		      "length %d\n",
+		      hdr->msg.reply.dst_wmd.wh_interface_cookie,
+		      hdr->msg.reply.dst_wmd.wh_object_cookie,
+		      hdr->payload_length);
+	}
+
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+	   void *private, int rdma_req)
+{
+	int		rc = 0;
+	int		cpt;
+	int		for_me;
+	struct lnet_msg	*msg;
+	lnet_pid_t     dest_pid;
+	lnet_nid_t     dest_nid;
+	lnet_nid_t     src_nid;
+	__u32	  payload_length;
+	__u32	  type;
+
+	LASSERT (!in_interrupt ());
+
+	type = le32_to_cpu(hdr->type);
+	src_nid = le64_to_cpu(hdr->src_nid);
+	dest_nid = le64_to_cpu(hdr->dest_nid);
+	dest_pid = le32_to_cpu(hdr->dest_pid);
+	payload_length = le32_to_cpu(hdr->payload_length);
+
+	for_me = (ni->ni_nid == dest_nid);
+	cpt = lnet_cpt_of_nid(from_nid);
+
+	switch (type) {
+	case LNET_MSG_ACK:
+	case LNET_MSG_GET:
+		if (payload_length > 0) {
+			CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type), payload_length);
+			return -EPROTO;
+		}
+		break;
+
+	case LNET_MSG_PUT:
+	case LNET_MSG_REPLY:
+		if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+			CERROR("%s, src %s: bad %s payload %d "
+			       "(%d max expected)\n",
+			       libcfs_nid2str(from_nid),
+			       libcfs_nid2str(src_nid),
+			       lnet_msgtyp2str(type),
+			       payload_length,
+			       for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+			return -EPROTO;
+		}
+		break;
+
+	default:
+		CERROR("%s, src %s: Bad message type 0x%x\n",
+		       libcfs_nid2str(from_nid),
+		       libcfs_nid2str(src_nid), type);
+		return -EPROTO;
+	}
+
+	if (the_lnet.ln_routing &&
+	    ni->ni_last_alive != cfs_time_current_sec()) {
+		lnet_ni_lock(ni);
+
+		/* NB: so far here is the only place to set NI status to "up */
+		ni->ni_last_alive = cfs_time_current_sec();
+		if (ni->ni_status != NULL &&
+		    ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+			ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+		lnet_ni_unlock(ni);
+	}
+
+	/* Regard a bad destination NID as a protocol error.  Senders should
+	 * know what they're doing; if they don't they're misconfigured, buggy
+	 * or malicious so we chop them off at the knees :) */
+
+	if (!for_me) {
+		if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+			/* should have gone direct */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(should have been sent direct)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (lnet_islocalnid(dest_nid)) {
+			/* dest is another local NI; sender should have used
+			 * this node's NID on its own network */
+			CERROR ("%s, src %s: Bad dest nid %s "
+				"(it's my nid but on a different network)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (rdma_req && type == LNET_MSG_GET) {
+			CERROR ("%s, src %s: Bad optimized GET for %s "
+				"(final destination must be me)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			return -EPROTO;
+		}
+
+		if (!the_lnet.ln_routing) {
+			CERROR ("%s, src %s: Dropping message for %s "
+				"(routing not enabled)\n",
+				libcfs_nid2str(from_nid),
+				libcfs_nid2str(src_nid),
+				libcfs_nid2str(dest_nid));
+			goto drop;
+		}
+	}
+
+	/* Message looks OK; we're not going to return an error, so we MUST
+	 * call back lnd_recv() come what may... */
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (src_nid, 0))	     /* shall we now? */
+	{
+		CERROR("%s, src %s: Dropping %s to simulate failure\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("%s, src %s: Dropping %s (out of memory)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type));
+		goto drop;
+	}
+
+	/* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+	msg->msg_type = type;
+	msg->msg_private = private;
+	msg->msg_receiving = 1;
+	msg->msg_len = msg->msg_wanted = payload_length;
+	msg->msg_offset = 0;
+	msg->msg_hdr = *hdr;
+	/* for building message event */
+	msg->msg_from = from_nid;
+	if (!for_me) {
+		msg->msg_target.pid	= dest_pid;
+		msg->msg_target.nid	= dest_nid;
+		msg->msg_routing	= 1;
+
+	} else {
+		/* convert common msg->hdr fields to host byteorder */
+		msg->msg_hdr.type	= type;
+		msg->msg_hdr.src_nid	= src_nid;
+		msg->msg_hdr.src_pid	= le32_to_cpu(msg->msg_hdr.src_pid);
+		msg->msg_hdr.dest_nid	= dest_nid;
+		msg->msg_hdr.dest_pid	= dest_pid;
+		msg->msg_hdr.payload_length = payload_length;
+	}
+
+	lnet_net_lock(cpt);
+	rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+	if (rc != 0) {
+		lnet_net_unlock(cpt);
+		CERROR("%s, src %s: Dropping %s "
+		       "(error %d looking up sender)\n",
+		       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+		       lnet_msgtyp2str(type), rc);
+		lnet_msg_free(msg);
+		goto drop;
+	}
+
+	lnet_msg_commit(msg, cpt);
+
+	if (!for_me) {
+		rc = lnet_parse_forward_locked(ni, msg);
+		lnet_net_unlock(cpt);
+
+		if (rc < 0)
+			goto free_drop;
+		if (rc == 0) {
+			lnet_ni_recv(ni, msg->msg_private, msg, 0,
+				     0, payload_length, payload_length);
+		}
+		return 0;
+	}
+
+	lnet_net_unlock(cpt);
+
+	switch (type) {
+	case LNET_MSG_ACK:
+		rc = lnet_parse_ack(ni, msg);
+		break;
+	case LNET_MSG_PUT:
+		rc = lnet_parse_put(ni, msg);
+		break;
+	case LNET_MSG_GET:
+		rc = lnet_parse_get(ni, msg, rdma_req);
+		break;
+	case LNET_MSG_REPLY:
+		rc = lnet_parse_reply(ni, msg);
+		break;
+	default:
+		LASSERT(0);
+		rc = -EPROTO;
+		goto free_drop;  /* prevent an unused label if !kernel */
+	}
+
+	if (rc == 0)
+		return 0;
+
+	LASSERT (rc == ENOENT);
+
+ free_drop:
+	LASSERT(msg->msg_md == NULL);
+	lnet_finalize(ni, msg, rc);
+
+ drop:
+	lnet_drop_message(ni, cpt, private, payload_length);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+	while (!list_empty(head)) {
+		lnet_process_id_t	id = {0};
+		lnet_msg_t		*msg;
+
+		msg = list_entry(head->next, lnet_msg_t, msg_list);
+		list_del(&msg->msg_list);
+
+		id.nid = msg->msg_hdr.src_nid;
+		id.pid = msg->msg_hdr.src_pid;
+
+		LASSERT(msg->msg_md == NULL);
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+		      " offset %d length %d: %s\n",
+		      libcfs_id2str(id),
+		      msg->msg_hdr.msg.put.ptl_index,
+		      msg->msg_hdr.msg.put.match_bits,
+		      msg->msg_hdr.msg.put.offset,
+		      msg->msg_hdr.payload_length, reason);
+
+		/* NB I can't drop msg's ref on msg_rxpeer until after I've
+		 * called lnet_drop_message(), so I just hang onto msg as well
+		 * until that's done */
+
+		lnet_drop_message(msg->msg_rxpeer->lp_ni,
+				  msg->msg_rxpeer->lp_cpt,
+				  msg->msg_private, msg->msg_len);
+		/*
+		 * NB: message will not generate event because w/o attached MD,
+		 * but we still should give error code so lnet_msg_decommit()
+		 * can skip counters operations and other checks.
+		 */
+		lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+	}
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		lnet_msg_t	  *msg;
+		lnet_process_id_t  id;
+
+		msg = list_entry(head->next, lnet_msg_t, msg_list);
+		list_del(&msg->msg_list);
+
+		/* md won't disappear under me, since each msg
+		 * holds a ref on it */
+
+		id.nid = msg->msg_hdr.src_nid;
+		id.pid = msg->msg_hdr.src_pid;
+
+		LASSERT(msg->msg_rx_delayed);
+		LASSERT(msg->msg_md != NULL);
+		LASSERT(msg->msg_rxpeer != NULL);
+		LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+		CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+		       "match "LPU64" offset %d length %d.\n",
+			libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+			msg->msg_hdr.msg.put.match_bits,
+			msg->msg_hdr.msg.put.offset,
+			msg->msg_hdr.payload_length);
+
+		lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+	}
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset,
+	__u64 hdr_data)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping PUT to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+	msg->msg_vmflush = !!memory_pressure_get();
+
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("Source MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+	msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+	msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+	/* NB handles only looked up by creator (no flips) */
+	if (ack == LNET_ACK_REQ) {
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			the_lnet.ln_interface_cookie;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			md->md_lh.lh_cookie;
+	} else {
+		msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+		msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+			LNET_WIRE_HANDLE_COOKIE_NONE;
+	}
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc != 0) {
+		CNETERR( "Error sending PUT to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+	/* The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+	 * returns a msg for the LND to pass to lnet_finalize() when the sink
+	 * data has been received.
+	 *
+	 * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+	 * lnet_finalize() is called on it, so the LND must call this first */
+
+	struct lnet_msg		*msg = lnet_msg_alloc();
+	struct lnet_libmd	*getmd = getmsg->msg_md;
+	lnet_process_id_t	peer_id = getmsg->msg_target;
+	int			cpt;
+
+	LASSERT(!getmsg->msg_target_is_router);
+	LASSERT(!getmsg->msg_routing);
+
+	cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+	lnet_res_lock(cpt);
+
+	LASSERT (getmd->md_refcount > 0);
+
+	if (msg == NULL) {
+		CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+		goto drop;
+	}
+
+	if (getmd->md_threshold == 0) {
+		CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+			libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+			getmd);
+		lnet_res_unlock(cpt);
+		goto drop;
+	}
+
+	LASSERT(getmd->md_offset == 0);
+
+	CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+	       libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+	/* setup information for lnet_build_msg_event */
+	msg->msg_from = peer_id.nid;
+	msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+	msg->msg_hdr.src_nid = peer_id.nid;
+	msg->msg_hdr.payload_length = getmd->md_length;
+	msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+	lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+	lnet_res_unlock(cpt);
+
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+
+	lnet_net_lock(cpt);
+	lnet_msg_commit(msg, cpt);
+	lnet_net_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+	return msg;
+
+ drop:
+	cpt = lnet_cpt_of_nid(peer_id.nid);
+
+	lnet_net_lock(cpt);
+	the_lnet.ln_counters[cpt]->drop_count++;
+	the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+	lnet_net_unlock(cpt);
+
+	if (msg != NULL)
+		lnet_msg_free(msg);
+
+	return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+	/* Set the REPLY length, now the RDMA that elides the REPLY message has
+	 * completed and I know it. */
+	LASSERT (reply != NULL);
+	LASSERT (reply->msg_type == LNET_MSG_GET);
+	LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+	/* NB I trusted my peer to RDMA.  If she tells me she's written beyond
+	 * the end of my buffer, I might as well be dead. */
+	LASSERT (len <= reply->msg_ev.mlength);
+
+	reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval  0      Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO    Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+	lnet_process_id_t target, unsigned int portal,
+	__u64 match_bits, unsigned int offset)
+{
+	struct lnet_msg		*msg;
+	struct lnet_libmd	*md;
+	int			cpt;
+	int			rc;
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+	    fail_peer (target.nid, 1))	  /* shall we now? */
+	{
+		CERROR("Dropping GET to %s: simulated failure\n",
+		       libcfs_id2str(target));
+		return -EIO;
+	}
+
+	msg = lnet_msg_alloc();
+	if (msg == NULL) {
+		CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+		       libcfs_id2str(target));
+		return -ENOMEM;
+	}
+
+	cpt = lnet_cpt_of_cookie(mdh.cookie);
+	lnet_res_lock(cpt);
+
+	md = lnet_handle2md(&mdh);
+	if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+		CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+		       match_bits, portal, libcfs_id2str(target),
+		       md == NULL ? -1 : md->md_threshold);
+		if (md != NULL && md->md_me != NULL)
+			CERROR("REPLY MD also attached to portal %d\n",
+			       md->md_me->me_portal);
+
+		lnet_res_unlock(cpt);
+
+		lnet_msg_free(msg);
+
+		return -ENOENT;
+	}
+
+	CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+	lnet_msg_attach_md(msg, md, 0, 0);
+
+	lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+	msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+	msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+	msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+	msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+	/* NB handles only looked up by creator (no flips) */
+	msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+		the_lnet.ln_interface_cookie;
+	msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+		md->md_lh.lh_cookie;
+
+	lnet_res_unlock(cpt);
+
+	lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+	rc = lnet_send(self, msg, LNET_NID_ANY);
+	if (rc < 0) {
+		CNETERR( "Error sending GET to %s: %d\n",
+		       libcfs_id2str(target), rc);
+		lnet_finalize (NULL, msg, rc);
+	}
+
+	/* completion will be signalled by an event */
+	return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+	struct list_head		*e;
+	struct lnet_ni		*ni;
+	lnet_remotenet_t	*rnet;
+	__u32			dstnet = LNET_NIDNET(dstnid);
+	int			hops;
+	int			cpt;
+	__u32			order = 2;
+	struct list_head		*rn_list;
+
+	/* if !local_nid_dist_zero, I don't return a distance of 0 ever
+	 * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+	 * keep order 0 free for 0@lo and order 1 free for a local NID
+	 * match */
+
+	LASSERT (the_lnet.ln_init);
+	LASSERT (the_lnet.ln_refcount > 0);
+
+	cpt = lnet_net_lock_current();
+
+	list_for_each (e, &the_lnet.ln_nis) {
+		ni = list_entry(e, lnet_ni_t, ni_list);
+
+		if (ni->ni_nid == dstnid) {
+			if (srcnidp != NULL)
+				*srcnidp = dstnid;
+			if (orderp != NULL) {
+				if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+					*orderp = 0;
+				else
+					*orderp = 1;
+			}
+			lnet_net_unlock(cpt);
+
+			return local_nid_dist_zero ? 0 : 1;
+		}
+
+		if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+			if (srcnidp != NULL)
+				*srcnidp = ni->ni_nid;
+			if (orderp != NULL)
+				*orderp = order;
+			lnet_net_unlock(cpt);
+			return 1;
+		}
+
+		order++;
+	}
+
+	rn_list = lnet_net2rnethash(dstnet);
+	list_for_each(e, rn_list) {
+		rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+		if (rnet->lrn_net == dstnet) {
+			lnet_route_t *route;
+			lnet_route_t *shortest = NULL;
+
+			LASSERT (!list_empty(&rnet->lrn_routes));
+
+			list_for_each_entry(route, &rnet->lrn_routes,
+						lr_list) {
+				if (shortest == NULL ||
+				    route->lr_hops < shortest->lr_hops)
+					shortest = route;
+			}
+
+			LASSERT (shortest != NULL);
+			hops = shortest->lr_hops;
+			if (srcnidp != NULL)
+				*srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+			if (orderp != NULL)
+				*orderp = order;
+			lnet_net_unlock(cpt);
+			return hops + 1;
+		}
+		order++;
+	}
+
+	lnet_net_unlock(cpt);
+	return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when user process is not running in
+ * LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+	return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 000000000000..8f3a50bd5f69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev)
+{
+	ENTRY;
+
+	memset(ev, 0, sizeof(*ev));
+
+	ev->status   = 0;
+	ev->unlinked = 1;
+	ev->type     = LNET_EVENT_UNLINK;
+	lnet_md_deconstruct(md, &ev->md);
+	lnet_md2handle(&ev->md_handle, md);
+	EXIT;
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+	lnet_hdr_t	*hdr = &msg->msg_hdr;
+	lnet_event_t	*ev  = &msg->msg_ev;
+
+	LASSERT(!msg->msg_routing);
+
+	ev->type = ev_type;
+
+	if (ev_type == LNET_EVENT_SEND) {
+		/* event for active message */
+		ev->target.nid    = le64_to_cpu(hdr->dest_nid);
+		ev->target.pid    = le32_to_cpu(hdr->dest_pid);
+		ev->initiator.nid = LNET_NID_ANY;
+		ev->initiator.pid = the_lnet.ln_pid;
+		ev->sender	  = LNET_NID_ANY;
+
+	} else {
+		/* event for passive message */
+		ev->target.pid    = hdr->dest_pid;
+		ev->target.nid    = hdr->dest_nid;
+		ev->initiator.pid = hdr->src_pid;
+		ev->initiator.nid = hdr->src_nid;
+		ev->rlength       = hdr->payload_length;
+		ev->sender	  = msg->msg_from;
+		ev->mlength	  = msg->msg_wanted;
+		ev->offset	  = msg->msg_offset;
+	}
+
+	switch (ev_type) {
+	default:
+		LBUG();
+
+	case LNET_EVENT_PUT: /* passive PUT */
+		ev->pt_index   = hdr->msg.put.ptl_index;
+		ev->match_bits = hdr->msg.put.match_bits;
+		ev->hdr_data   = hdr->msg.put.hdr_data;
+		return;
+
+	case LNET_EVENT_GET: /* passive GET */
+		ev->pt_index   = hdr->msg.get.ptl_index;
+		ev->match_bits = hdr->msg.get.match_bits;
+		ev->hdr_data   = 0;
+		return;
+
+	case LNET_EVENT_ACK: /* ACK */
+		ev->match_bits = hdr->msg.ack.match_bits;
+		ev->mlength    = hdr->msg.ack.mlength;
+		return;
+
+	case LNET_EVENT_REPLY: /* REPLY */
+		return;
+
+	case LNET_EVENT_SEND: /* active message */
+		if (msg->msg_type == LNET_MSG_PUT) {
+			ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+			ev->offset     = le32_to_cpu(hdr->msg.put.offset);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->payload_length);
+			ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);
+
+		} else {
+			LASSERT(msg->msg_type == LNET_MSG_GET);
+			ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
+			ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+			ev->mlength    =
+			ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
+			ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
+			ev->hdr_data   = 0;
+		}
+		return;
+	}
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+	struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+	lnet_counters_t		  *counters  = the_lnet.ln_counters[cpt];
+
+	/* routed message can be committed for both receiving and sending */
+	LASSERT(!msg->msg_tx_committed);
+
+	if (msg->msg_sending) {
+		LASSERT(!msg->msg_receiving);
+
+		msg->msg_tx_cpt = cpt;
+		msg->msg_tx_committed = 1;
+		if (msg->msg_rx_committed) { /* routed message REPLY */
+			LASSERT(msg->msg_onactivelist);
+			return;
+		}
+	} else {
+		LASSERT(!msg->msg_sending);
+		msg->msg_rx_cpt = cpt;
+		msg->msg_rx_committed = 1;
+	}
+
+	LASSERT(!msg->msg_onactivelist);
+	msg->msg_onactivelist = 1;
+	list_add(&msg->msg_activelist, &container->msc_active);
+
+	counters->msgs_alloc++;
+	if (counters->msgs_alloc > counters->msgs_max)
+		counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+	lnet_counters_t	*counters;
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(msg->msg_tx_committed);
+	if (status != 0)
+		goto out;
+
+	counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+	switch (ev->type) {
+	default: /* routed message */
+		LASSERT(msg->msg_routing);
+		LASSERT(msg->msg_rx_committed);
+		LASSERT(ev->type == 0);
+
+		counters->route_length += msg->msg_len;
+		counters->route_count++;
+		goto out;
+
+	case LNET_EVENT_PUT:
+		/* should have been decommitted */
+		LASSERT(!msg->msg_rx_committed);
+		/* overwritten while sending ACK */
+		LASSERT(msg->msg_type == LNET_MSG_ACK);
+		msg->msg_type = LNET_MSG_PUT; /* fix type */
+		break;
+
+	case LNET_EVENT_SEND:
+		LASSERT(!msg->msg_rx_committed);
+		if (msg->msg_type == LNET_MSG_PUT)
+			counters->send_length += msg->msg_len;
+		break;
+
+	case LNET_EVENT_GET:
+		LASSERT(msg->msg_rx_committed);
+		/* overwritten while sending reply, we should never be
+		 * here for optimized GET */
+		LASSERT(msg->msg_type == LNET_MSG_REPLY);
+		msg->msg_type = LNET_MSG_GET; /* fix type */
+		break;
+	}
+
+	counters->send_count++;
+ out:
+	lnet_return_tx_credits_locked(msg);
+	msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+	lnet_counters_t	*counters;
+	lnet_event_t	*ev = &msg->msg_ev;
+
+	LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+	LASSERT(msg->msg_rx_committed);
+
+	if (status != 0)
+		goto out;
+
+	counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+	switch (ev->type) {
+	default:
+		LASSERT(ev->type == 0);
+		LASSERT(msg->msg_routing);
+		goto out;
+
+	case LNET_EVENT_ACK:
+		LASSERT(msg->msg_type == LNET_MSG_ACK);
+		break;
+
+	case LNET_EVENT_GET:
+		/* type is "REPLY" if it's an optimized GET on passive side,
+		 * because optimized GET will never be committed for sending,
+		 * so message type wouldn't be changed back to "GET" by
+		 * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+		LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+			msg->msg_type == LNET_MSG_GET);
+		counters->send_length += msg->msg_wanted;
+		break;
+
+	case LNET_EVENT_PUT:
+		LASSERT(msg->msg_type == LNET_MSG_PUT);
+		break;
+
+	case LNET_EVENT_REPLY:
+		/* type is "GET" if it's an optimized GET on active side,
+		 * see details in lnet_create_reply_msg() */
+		LASSERT(msg->msg_type == LNET_MSG_GET ||
+			msg->msg_type == LNET_MSG_REPLY);
+		break;
+	}
+
+	counters->recv_count++;
+	if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+		counters->recv_length += msg->msg_wanted;
+
+ out:
+	lnet_return_rx_credits_locked(msg);
+	msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+	int	cpt2 = cpt;
+
+	LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+	LASSERT(msg->msg_onactivelist);
+
+	if (msg->msg_tx_committed) { /* always decommit for sending first */
+		LASSERT(cpt == msg->msg_tx_cpt);
+		lnet_msg_decommit_tx(msg, status);
+	}
+
+	if (msg->msg_rx_committed) {
+		/* forwarding msg committed for both receiving and sending */
+		if (cpt != msg->msg_rx_cpt) {
+			lnet_net_unlock(cpt);
+			cpt2 = msg->msg_rx_cpt;
+			lnet_net_lock(cpt2);
+		}
+		lnet_msg_decommit_rx(msg, status);
+	}
+
+	list_del(&msg->msg_activelist);
+	msg->msg_onactivelist = 0;
+
+	the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+	if (cpt2 != cpt) {
+		lnet_net_unlock(cpt2);
+		lnet_net_lock(cpt);
+	}
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+		   unsigned int offset, unsigned int mlen)
+{
+	/* NB: @offset and @len are only useful for receiving */
+	/* Here, we attach the MD on lnet_msg and mark it busy and
+	 * decrementing its threshold. Come what may, the lnet_msg "owns"
+	 * the MD until a call to lnet_msg_detach_md or lnet_finalize()
+	 * signals completion. */
+	LASSERT(!msg->msg_routing);
+
+	msg->msg_md = md;
+	if (msg->msg_receiving) { /* commited for receiving */
+		msg->msg_offset = offset;
+		msg->msg_wanted = mlen;
+	}
+
+	md->md_refcount++;
+	if (md->md_threshold != LNET_MD_THRESH_INF) {
+		LASSERT(md->md_threshold > 0);
+		md->md_threshold--;
+	}
+
+	/* build umd in event */
+	lnet_md2handle(&msg->msg_ev.md_handle, md);
+	lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+	lnet_libmd_t	*md = msg->msg_md;
+	int		unlink;
+
+	/* Now it's safe to drop my caller's ref */
+	md->md_refcount--;
+	LASSERT(md->md_refcount >= 0);
+
+	unlink = lnet_md_unlinkable(md);
+	if (md->md_eq != NULL) {
+		msg->msg_ev.status   = status;
+		msg->msg_ev.unlinked = unlink;
+		lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+	}
+
+	if (unlink)
+		lnet_md_unlink(md);
+
+	msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+	lnet_handle_wire_t ack_wmd;
+	int		rc;
+	int		status = msg->msg_ev.status;
+
+	LASSERT (msg->msg_onactivelist);
+
+	if (status == 0 && msg->msg_ack) {
+		/* Only send an ACK if the PUT completed successfully */
+
+		lnet_msg_decommit(msg, cpt, 0);
+
+		msg->msg_ack = 0;
+		lnet_net_unlock(cpt);
+
+		LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+		LASSERT(!msg->msg_routing);
+
+		ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+		lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+		msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+		msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+		msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+		/* NB: we probably want to use NID of msg::msg_from as 3rd
+		 * parameter (router NID) if it's routed message */
+		rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: message is committed for sending, we should return
+		 * on success because LND will finalize this message later.
+		 *
+		 * Also, there is possibility that message is commited for
+		 * sending and also failed before delivering to LND,
+		 * i.e: ENOMEM, in that case we can't fall through either
+		 * because CPT for sending can be different with CPT for
+		 * receiving, so we should return back to lnet_finalize()
+		 * to make sure we are locking the correct partition.
+		 */
+		return rc;
+
+	} else if (status == 0 &&	/* OK so far */
+		   (msg->msg_routing && !msg->msg_sending)) {
+		/* not forwarded */
+		LASSERT(!msg->msg_receiving);	/* called back recv already */
+		lnet_net_unlock(cpt);
+
+		rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+		lnet_net_lock(cpt);
+		/*
+		 * NB: message is committed for sending, we should return
+		 * on success because LND will finalize this message later.
+		 *
+		 * Also, there is possibility that message is commited for
+		 * sending and also failed before delivering to LND,
+		 * i.e: ENOMEM, in that case we can't fall through either:
+		 * - The rule is message must decommit for sending first if
+		 *   the it's committed for both sending and receiving
+		 * - CPT for sending can be different with CPT for receiving,
+		 *   so we should return back to lnet_finalize() to make
+		 *   sure we are locking the correct partition.
+		 */
+		return rc;
+	}
+
+	lnet_msg_decommit(msg, cpt, status);
+	lnet_msg_free_locked(msg);
+	return 0;
+}
+
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+	struct lnet_msg_container	*container;
+	int				my_slot;
+	int				cpt;
+	int				rc;
+	int				i;
+
+	LASSERT (!in_interrupt ());
+
+	if (msg == NULL)
+		return;
+#if 0
+	CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+	       lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+	       msg->msg_target_is_router ? "t" : "",
+	       msg->msg_routing ? "X" : "",
+	       msg->msg_ack ? "A" : "",
+	       msg->msg_sending ? "S" : "",
+	       msg->msg_receiving ? "R" : "",
+	       msg->msg_delayed ? "d" : "",
+	       msg->msg_txcredit ? "C" : "",
+	       msg->msg_peertxcredit ? "c" : "",
+	       msg->msg_rtrcredit ? "F" : "",
+	       msg->msg_peerrtrcredit ? "f" : "",
+	       msg->msg_onactivelist ? "!" : "",
+	       msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+	       msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+	msg->msg_ev.status = status;
+
+	if (msg->msg_md != NULL) {
+		cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+		lnet_res_lock(cpt);
+		lnet_msg_detach_md(msg, status);
+		lnet_res_unlock(cpt);
+	}
+
+ again:
+	rc = 0;
+	if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+		/* not commited to network yet */
+		LASSERT(!msg->msg_onactivelist);
+		lnet_msg_free(msg);
+		return;
+	}
+
+	/*
+	 * NB: routed message can be commited for both receiving and sending,
+	 * we should finalize in LIFO order and keep counters correct.
+	 * (finalize sending first then finalize receiving)
+	 */
+	cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+	lnet_net_lock(cpt);
+
+	container = the_lnet.ln_msg_containers[cpt];
+	list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+	/* Recursion breaker.  Don't complete the message here if I am (or
+	 * enough other threads are) already completing messages */
+
+	my_slot = -1;
+	for (i = 0; i < container->msc_nfinalizers; i++) {
+		if (container->msc_finalizers[i] == current)
+			break;
+
+		if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+			my_slot = i;
+	}
+
+	if (i < container->msc_nfinalizers || my_slot < 0) {
+		lnet_net_unlock(cpt);
+		return;
+	}
+
+	container->msc_finalizers[my_slot] = current;
+
+	while (!list_empty(&container->msc_finalizing)) {
+		msg = list_entry(container->msc_finalizing.next,
+				     lnet_msg_t, msg_list);
+
+		list_del(&msg->msg_list);
+
+		/* NB drops and regains the lnet lock if it actually does
+		 * anything, so my finalizing friends can chomp along too */
+		rc = lnet_complete_msg_locked(msg, cpt);
+		if (rc != 0)
+			break;
+	}
+
+	container->msc_finalizers[my_slot] = NULL;
+	lnet_net_unlock(cpt);
+
+	if (rc != 0)
+		goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+	int     count = 0;
+
+	if (container->msc_init == 0)
+		return;
+
+	while (!list_empty(&container->msc_active)) {
+		lnet_msg_t *msg = list_entry(container->msc_active.next,
+						 lnet_msg_t, msg_activelist);
+
+		LASSERT(msg->msg_onactivelist);
+		msg->msg_onactivelist = 0;
+		list_del(&msg->msg_activelist);
+		lnet_msg_free(msg);
+		count++;
+	}
+
+	if (count > 0)
+		CERROR("%d active msg on exit\n", count);
+
+	if (container->msc_finalizers != NULL) {
+		LIBCFS_FREE(container->msc_finalizers,
+			    container->msc_nfinalizers *
+			    sizeof(*container->msc_finalizers));
+		container->msc_finalizers = NULL;
+	}
+#ifdef LNET_USE_LIB_FREELIST
+	lnet_freelist_fini(&container->msc_freelist);
+#endif
+	container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+	int	rc;
+
+	container->msc_init = 1;
+
+	INIT_LIST_HEAD(&container->msc_active);
+	INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+	memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+	rc = lnet_freelist_init(&container->msc_freelist,
+				LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+	if (rc != 0) {
+		CERROR("Failed to init freelist for message container\n");
+		lnet_msg_container_cleanup(container);
+		return rc;
+	}
+#else
+	rc = 0;
+#endif
+	/* number of CPUs */
+	container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+	LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+			 container->msc_nfinalizers *
+			 sizeof(*container->msc_finalizers));
+
+	if (container->msc_finalizers == NULL) {
+		CERROR("Failed to allocate message finalizers\n");
+		lnet_msg_container_cleanup(container);
+		return -ENOMEM;
+	}
+
+	return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+	struct lnet_msg_container *container;
+	int     i;
+
+	if (the_lnet.ln_msg_containers == NULL)
+		return;
+
+	cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+		lnet_msg_container_cleanup(container);
+
+	cfs_percpt_free(the_lnet.ln_msg_containers);
+	the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+	struct lnet_msg_container *container;
+	int	rc;
+	int	i;
+
+	the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+						      sizeof(*container));
+
+	if (the_lnet.ln_msg_containers == NULL) {
+		CERROR("Failed to allocate cpu-partition data for network\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+		rc = lnet_msg_container_setup(container, i);
+		if (rc != 0) {
+			lnet_msg_containers_destroy();
+			return rc;
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 000000000000..9b9e7d3139b0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+int	portal_rotor	= LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+		"redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+		    __u64 mbits, __u64 ignore_bits)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[index];
+	int			unique;
+
+	unique = ignore_bits == 0 &&
+		 match_id.nid != LNET_NID_ANY &&
+		 match_id.pid != LNET_PID_ANY;
+
+	LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+	/* prefer to check w/o any lock */
+	if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+		goto match;
+
+	/* unset, new portal */
+	lnet_ptl_lock(ptl);
+	/* check again with lock */
+	if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+		lnet_ptl_unlock(ptl);
+		goto match;
+	}
+
+	/* still not set */
+	if (unique)
+		lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+	else
+		lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+	lnet_ptl_unlock(ptl);
+
+	return 1;
+
+ match:
+	if ((lnet_ptl_is_unique(ptl) && !unique) ||
+	    (lnet_ptl_is_wildcard(ptl) && unique))
+		return 0;
+	return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	mtable->mt_enabled = 1;
+
+	ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+	for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+		LASSERT(ptl->ptl_mt_maps[i] != cpt);
+		if (ptl->ptl_mt_maps[i] < cpt)
+			break;
+
+		/* swap to order */
+		ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+		ptl->ptl_mt_maps[i] = cpt;
+	}
+
+	ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+	struct lnet_match_table	*mtable = ptl->ptl_mtables[cpt];
+	int			i;
+
+	/* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	if (LNET_CPT_NUMBER == 1)
+		return; /* never disable the only match-table */
+
+	mtable->mt_enabled = 0;
+
+	LASSERT(ptl->ptl_mt_nmaps > 0 &&
+		ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+	/* remove it from mt_maps */
+	ptl->ptl_mt_nmaps--;
+	for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+		if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+			ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+	}
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+		  struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	/* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+	 * lnet_match_blocked_msg() relies on this to avoid races */
+	unsigned int	offset;
+	unsigned int	mlength;
+	lnet_me_t	*me = md->md_me;
+
+	/* MD exhausted */
+	if (lnet_md_exhausted(md))
+		return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+	/* mismatched MD op */
+	if ((md->md_options & info->mi_opc) == 0)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME nid/pid? */
+	if (me->me_match_id.nid != LNET_NID_ANY &&
+	    me->me_match_id.nid != info->mi_id.nid)
+		return LNET_MATCHMD_NONE;
+
+	if (me->me_match_id.pid != LNET_PID_ANY &&
+	    me->me_match_id.pid != info->mi_id.pid)
+		return LNET_MATCHMD_NONE;
+
+	/* mismatched ME matchbits? */
+	if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+		return LNET_MATCHMD_NONE;
+
+	/* Hurrah! This _is_ a match; check it out... */
+
+	if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+		offset = md->md_offset;
+	else
+		offset = info->mi_roffset;
+
+	if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+		mlength = md->md_max_size;
+		LASSERT(md->md_offset + mlength <= md->md_length);
+	} else {
+		mlength = md->md_length - offset;
+	}
+
+	if (info->mi_rlength <= mlength) {	/* fits in allowed space */
+		mlength = info->mi_rlength;
+	} else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+		/* this packet _really_ is too big */
+		CERROR("Matching packet from %s, match "LPU64
+		       " length %d too big: %d left, %d allowed\n",
+		       libcfs_id2str(info->mi_id), info->mi_mbits,
+		       info->mi_rlength, md->md_length - offset, mlength);
+
+		return LNET_MATCHMD_DROP;
+	}
+
+	/* Commit to this ME/MD */
+	CDEBUG(D_NET, "Incoming %s index %x from %s of "
+	       "length %d/%d into md "LPX64" [%d] + %d\n",
+	       (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+	       info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+	       info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+	lnet_msg_attach_md(msg, md, offset, mlength);
+	md->md_offset = offset + mlength;
+
+	if (!lnet_md_exhausted(md))
+		return LNET_MATCHMD_OK;
+
+	/* Auto-unlink NOW, so the ME gets unlinked if required.
+	 * We bumped md->md_refcount above so the MD just gets flagged
+	 * for unlink when it is finalized. */
+	if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+		lnet_md_unlink(md);
+
+	return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+	if (LNET_CPT_NUMBER == 1)
+		return ptl->ptl_mtables[0]; /* the only one */
+
+	/* if it's a unique portal, return match-table hashed by NID */
+	return lnet_ptl_is_unique(ptl) ?
+	       ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+		  __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+	struct lnet_portal	*ptl;
+	struct lnet_match_table	*mtable;
+
+	/* NB: called w/o lock */
+	LASSERT(index < the_lnet.ln_nportals);
+
+	if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+		return NULL;
+
+	ptl = the_lnet.ln_portals[index];
+
+	mtable = lnet_match2mt(ptl, id, mbits);
+	if (mtable != NULL) /* unique portal or only one match-table */
+		return mtable;
+
+	/* it's a wildcard portal */
+	switch (pos) {
+	default:
+		return NULL;
+	case LNET_INS_BEFORE:
+	case LNET_INS_AFTER:
+		/* posted by no affinity thread, always hash to specific
+		 * match-table to avoid buffer stealing which is heavy */
+		return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+	case LNET_INS_LOCAL:
+		/* posted by cpu-affinity thread */
+		return ptl->ptl_mtables[lnet_cpt_current()];
+	}
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	int			nmaps;
+	int			rotor;
+	int			routed;
+	int			cpt;
+
+	/* NB: called w/o lock */
+	LASSERT(info->mi_portal < the_lnet.ln_nportals);
+	ptl = the_lnet.ln_portals[info->mi_portal];
+
+	LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+	mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+	if (mtable != NULL)
+		return mtable;
+
+	/* it's a wildcard portal */
+	routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+		 LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+	if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+	    (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+		cpt = lnet_cpt_current();
+		if (ptl->ptl_mtables[cpt]->mt_enabled)
+			return ptl->ptl_mtables[cpt];
+	}
+
+	rotor = ptl->ptl_rotor++; /* get round-robin factor */
+	if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+		cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+	else
+		cpt = rotor % LNET_CPT_NUMBER;
+
+	if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+		/* is there any active entry for this portal? */
+		nmaps = ptl->ptl_mt_nmaps;
+		/* map to an active mtable to avoid heavy "stealing" */
+		if (nmaps != 0) {
+			/* NB: there is possibility that ptl_mt_maps is being
+			 * changed because we are not under protection of
+			 * lnet_ptl_lock, but it shouldn't hurt anything */
+			cpt = ptl->ptl_mt_maps[rotor % nmaps];
+		}
+	}
+
+	return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+	__u64	*bmap;
+	int	i;
+
+	if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		return 0;
+
+	if (pos < 0) { /* check all bits */
+		for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+			if (mtable->mt_exhausted[i] != (__u64)(-1))
+				return 0;
+		}
+		return 1;
+	}
+
+	LASSERT(pos <= LNET_MT_HASH_IGNORE);
+	/* mtable::mt_mhash[pos] is marked as exhausted or not */
+	bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+	pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+	return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+	__u64	*bmap;
+
+	LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+	LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+	/* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+	bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+	pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+	if (!exhausted)
+		*bmap &= ~(1ULL << pos);
+	else
+		*bmap |= 1ULL << pos;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+		   lnet_process_id_t id, __u64 mbits)
+{
+	struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+	if (lnet_ptl_is_wildcard(ptl)) {
+		return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+	} else {
+		unsigned long hash = mbits + id.nid + id.pid;
+
+		LASSERT(lnet_ptl_is_unique(ptl));
+		hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+		return &mtable->mt_mhash[hash];
+	}
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+		 struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct list_head		*head;
+	lnet_me_t		*me;
+	lnet_me_t		*tmp;
+	int			exhausted = 0;
+	int			rc;
+
+	/* any ME with ignore bits? */
+	if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+		head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+	else
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+	/* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+	if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+		exhausted = LNET_MATCHMD_EXHAUSTED;
+
+	list_for_each_entry_safe(me, tmp, head, me_list) {
+		/* ME attached but MD not attached yet */
+		if (me->me_md == NULL)
+			continue;
+
+		LASSERT(me == me->me_md->md_me);
+
+		rc = lnet_try_match_md(me->me_md, info, msg);
+		if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+			exhausted = 0; /* mlist is not empty */
+
+		if ((rc & LNET_MATCHMD_FINISH) != 0) {
+			/* don't return EXHAUSTED bit because we don't know
+			 * whether the mlist is empty or not */
+			return rc & ~LNET_MATCHMD_EXHAUSTED;
+		}
+	}
+
+	if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+		lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+		if (!lnet_mt_test_exhausted(mtable, -1))
+			exhausted = 0;
+	}
+
+	if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+		head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+		goto again; /* re-check MEs w/o ignore-bits */
+	}
+
+	if (info->mi_opc == LNET_MD_OP_GET ||
+	    !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+		return LNET_MATCHMD_DROP | exhausted;
+
+	return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+	int	rc;
+
+	/* message arrived before any buffer posting on this portal,
+	 * simply delay or drop this message */
+	if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+		return 0;
+
+	lnet_ptl_lock(ptl);
+	/* check it again with hold of lock */
+	if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+		lnet_ptl_unlock(ptl);
+		return 0;
+	}
+
+	if (lnet_ptl_is_lazy(ptl)) {
+		if (msg->msg_rx_ready_delay) {
+			msg->msg_rx_delayed = 1;
+			list_add_tail(&msg->msg_list,
+					  &ptl->ptl_msg_delayed);
+		}
+		rc = LNET_MATCHMD_NONE;
+	} else {
+		rc = LNET_MATCHMD_DROP;
+	}
+
+	lnet_ptl_unlock(ptl);
+	return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+		     struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	int	first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+	int	rc = 0;
+	int	i;
+
+	/* steal buffer from other CPTs, and delay it if nothing to steal,
+	 * this function is more expensive than a regular match, but we
+	 * don't expect it can happen a lot */
+	LASSERT(lnet_ptl_is_wildcard(ptl));
+
+	for (i = 0; i < LNET_CPT_NUMBER; i++) {
+		struct lnet_match_table *mtable;
+		int			cpt;
+
+		cpt = (first + i) % LNET_CPT_NUMBER;
+		mtable = ptl->ptl_mtables[cpt];
+		if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+			continue;
+
+		lnet_res_lock(cpt);
+		lnet_ptl_lock(ptl);
+
+		if (i == 0) { /* the first try, attach on stealing list */
+			list_add_tail(&msg->msg_list,
+					  &ptl->ptl_msg_stealing);
+		}
+
+		if (!list_empty(&msg->msg_list)) { /* on stealing list */
+			rc = lnet_mt_match_md(mtable, info, msg);
+
+			if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+			    mtable->mt_enabled)
+				lnet_ptl_disable_mt(ptl, cpt);
+
+			if ((rc & LNET_MATCHMD_FINISH) != 0)
+				list_del_init(&msg->msg_list);
+
+		} else {
+			/* could be matched by lnet_ptl_attach_md()
+			 * which is called by another thread */
+			rc = msg->msg_md == NULL ?
+			     LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+		}
+
+		if (!list_empty(&msg->msg_list) && /* not matched yet */
+		    (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+		     ptl->ptl_mt_nmaps == 0 ||   /* no active CPT */
+		     (ptl->ptl_mt_nmaps == 1 &&  /* the only active CPT */
+		      ptl->ptl_mt_maps[0] == cpt))) {
+			/* nothing to steal, delay or drop */
+			list_del_init(&msg->msg_list);
+
+			if (lnet_ptl_is_lazy(ptl)) {
+				msg->msg_rx_delayed = 1;
+				list_add_tail(&msg->msg_list,
+						  &ptl->ptl_msg_delayed);
+				rc = LNET_MATCHMD_NONE;
+			} else {
+				rc = LNET_MATCHMD_DROP;
+			}
+		}
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(cpt);
+
+		if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+			break;
+	}
+
+	return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+	struct lnet_match_table	*mtable;
+	struct lnet_portal	*ptl;
+	int			rc;
+
+	CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+	       "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+	       info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+	if (info->mi_portal >= the_lnet.ln_nportals) {
+		CERROR("Invalid portal %d not in [0-%d]\n",
+		       info->mi_portal, the_lnet.ln_nportals);
+		return LNET_MATCHMD_DROP;
+	}
+
+	ptl = the_lnet.ln_portals[info->mi_portal];
+	rc = lnet_ptl_match_early(ptl, msg);
+	if (rc != 0) /* matched or delayed early message */
+		return rc;
+
+	mtable = lnet_mt_of_match(info, msg);
+	lnet_res_lock(mtable->mt_cpt);
+
+	if (the_lnet.ln_shutdown) {
+		rc = LNET_MATCHMD_DROP;
+		goto out1;
+	}
+
+	rc = lnet_mt_match_md(mtable, info, msg);
+	if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+		lnet_ptl_lock(ptl);
+		lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+		lnet_ptl_unlock(ptl);
+	}
+
+	if ((rc & LNET_MATCHMD_FINISH) != 0)	/* matched or dropping */
+		goto out1;
+
+	if (!msg->msg_rx_ready_delay)
+		goto out1;
+
+	LASSERT(lnet_ptl_is_lazy(ptl));
+	LASSERT(!msg->msg_rx_delayed);
+
+	/* NB: we don't expect "delay" can happen a lot */
+	if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+		lnet_ptl_lock(ptl);
+
+		msg->msg_rx_delayed = 1;
+		list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(mtable->mt_cpt);
+
+	} else  {
+		lnet_res_unlock(mtable->mt_cpt);
+		rc = lnet_ptl_match_delay(ptl, info, msg);
+	}
+
+	if (msg->msg_rx_delayed) {
+		CDEBUG(D_NET,
+		       "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+		       info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+		       libcfs_id2str(info->mi_id), info->mi_portal,
+		       info->mi_mbits, info->mi_roffset, info->mi_rlength);
+	}
+	goto out0;
+ out1:
+	lnet_res_unlock(mtable->mt_cpt);
+ out0:
+	/* EXHAUSTED bit is only meaningful for internal functions */
+	return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+	LASSERT(me->me_md == md && md->md_me == me);
+
+	me->me_md = NULL;
+	md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+		   struct list_head *matches, struct list_head *drops)
+{
+	struct lnet_portal	*ptl = the_lnet.ln_portals[me->me_portal];
+	struct lnet_match_table	*mtable;
+	struct list_head		*head;
+	lnet_msg_t		*tmp;
+	lnet_msg_t		*msg;
+	int			exhausted = 0;
+	int			cpt;
+
+	LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+	me->me_md = md;
+	md->md_me = me;
+
+	cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+	mtable = ptl->ptl_mtables[cpt];
+
+	if (list_empty(&ptl->ptl_msg_stealing) &&
+	    list_empty(&ptl->ptl_msg_delayed) &&
+	    !lnet_mt_test_exhausted(mtable, me->me_pos))
+		return;
+
+	lnet_ptl_lock(ptl);
+	head = &ptl->ptl_msg_stealing;
+ again:
+	list_for_each_entry_safe(msg, tmp, head, msg_list) {
+		struct lnet_match_info	info;
+		lnet_hdr_t		*hdr;
+		int			rc;
+
+		LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+		hdr   = &msg->msg_hdr;
+		info.mi_id.nid	= hdr->src_nid;
+		info.mi_id.pid	= hdr->src_pid;
+		info.mi_opc	= LNET_MD_OP_PUT;
+		info.mi_portal	= hdr->msg.put.ptl_index;
+		info.mi_rlength	= hdr->payload_length;
+		info.mi_roffset	= hdr->msg.put.offset;
+		info.mi_mbits	= hdr->msg.put.match_bits;
+
+		rc = lnet_try_match_md(md, &info, msg);
+
+		exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+		if ((rc & LNET_MATCHMD_NONE) != 0) {
+			if (exhausted)
+				break;
+			continue;
+		}
+
+		/* Hurrah! This _is_ a match */
+		LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+		list_del_init(&msg->msg_list);
+
+		if (head == &ptl->ptl_msg_stealing) {
+			if (exhausted)
+				break;
+			/* stealing thread will handle the message */
+			continue;
+		}
+
+		if ((rc & LNET_MATCHMD_OK) != 0) {
+			list_add_tail(&msg->msg_list, matches);
+
+			CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+			       "match "LPU64" offset %d length %d.\n",
+			       libcfs_id2str(info.mi_id),
+			       info.mi_portal, info.mi_mbits,
+			       info.mi_roffset, info.mi_rlength);
+		} else {
+			list_add_tail(&msg->msg_list, drops);
+		}
+
+		if (exhausted)
+			break;
+	}
+
+	if (!exhausted && head == &ptl->ptl_msg_stealing) {
+		head = &ptl->ptl_msg_delayed;
+		goto again;
+	}
+
+	if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+		lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+		if (!mtable->mt_enabled)
+			lnet_ptl_enable_mt(ptl, cpt);
+	}
+
+	lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+	struct lnet_match_table	*mtable;
+	int			i;
+
+	if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+		return;
+
+	LASSERT(list_empty(&ptl->ptl_msg_delayed));
+	LASSERT(list_empty(&ptl->ptl_msg_stealing));
+	cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+		struct list_head	*mhash;
+		lnet_me_t	*me;
+		int		j;
+
+		if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+			continue;
+
+		mhash = mtable->mt_mhash;
+		/* cleanup ME */
+		for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+			while (!list_empty(&mhash[j])) {
+				me = list_entry(mhash[j].next,
+						    lnet_me_t, me_list);
+				CERROR("Active ME %p on exit\n", me);
+				list_del(&me->me_list);
+				lnet_me_free(me);
+			}
+		}
+		/* the extra entry is for MEs with ignore bits */
+		LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+	}
+
+	cfs_percpt_free(ptl->ptl_mtables);
+	ptl->ptl_mtables = NULL;
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+	struct lnet_match_table	*mtable;
+	struct list_head		*mhash;
+	int			i;
+	int			j;
+
+	ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct lnet_match_table));
+	if (ptl->ptl_mtables == NULL) {
+		CERROR("Failed to create match table for portal %d\n", index);
+		return -ENOMEM;
+	}
+
+	ptl->ptl_index = index;
+	INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+	INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+	spin_lock_init(&ptl->ptl_lock);
+	cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+		/* the extra entry is for MEs with ignore bits */
+		LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+				 sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+		if (mhash == NULL) {
+			CERROR("Failed to create match hash for portal %d\n",
+			       index);
+			goto failed;
+		}
+
+		memset(&mtable->mt_exhausted[0], -1,
+		       sizeof(mtable->mt_exhausted[0]) *
+		       LNET_MT_EXHAUSTED_BMAP);
+		mtable->mt_mhash = mhash;
+		for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+			INIT_LIST_HEAD(&mhash[j]);
+
+		mtable->mt_portal = index;
+		mtable->mt_cpt = i;
+	}
+
+	return 0;
+ failed:
+	lnet_ptl_cleanup(ptl);
+	return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+	int	i;
+
+	if (the_lnet.ln_portals == NULL)
+		return;
+
+	for (i = 0; i < the_lnet.ln_nportals; i++)
+		lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+	cfs_array_free(the_lnet.ln_portals);
+	the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+	int	size;
+	int	i;
+
+	size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+	the_lnet.ln_nportals = MAX_PORTALS;
+	the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+	if (the_lnet.ln_portals == NULL) {
+		CERROR("Failed to allocate portals table\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < the_lnet.ln_nportals; i++) {
+		if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+			lnet_portals_destroy();
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+	struct lnet_portal *ptl;
+
+	if (portal < 0 || portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+	ptl = the_lnet.ln_portals[portal];
+
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(ptl);
+
+	lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+	lnet_ptl_unlock(ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0       On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+	struct lnet_portal	*ptl;
+	LIST_HEAD		(zombies);
+
+	if (portal < 0 || portal >= the_lnet.ln_nportals)
+		return -EINVAL;
+
+	ptl = the_lnet.ln_portals[portal];
+
+	lnet_res_lock(LNET_LOCK_EX);
+	lnet_ptl_lock(ptl);
+
+	if (!lnet_ptl_is_lazy(ptl)) {
+		lnet_ptl_unlock(ptl);
+		lnet_res_unlock(LNET_LOCK_EX);
+		return 0;
+	}
+
+	if (the_lnet.ln_shutdown)
+		CWARN("Active lazy portal %d on exit\n", portal);
+	else
+		CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+	/* grab all the blocked messages atomically */
+	list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+	lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+	lnet_ptl_unlock(ptl);
+	lnet_res_unlock(LNET_LOCK_EX);
+
+	lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+	return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 000000000000..670dae34107c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lo.c
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+	LASSERT (!lntmsg->msg_routing);
+	LASSERT (!lntmsg->msg_target_is_router);
+
+	return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+	    int delayed, unsigned int niov,
+	    struct iovec *iov, lnet_kiov_t *kiov,
+	    unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+	lnet_msg_t *sendmsg = private;
+
+	if (lntmsg != NULL) {		   /* not discarding */
+		if (sendmsg->msg_iov != NULL) {
+			if (iov != NULL)
+				lnet_copy_iov2iov(niov, iov, offset,
+						  sendmsg->msg_niov,
+						  sendmsg->msg_iov,
+						  sendmsg->msg_offset, mlen);
+			else
+				lnet_copy_iov2kiov(niov, kiov, offset,
+						   sendmsg->msg_niov,
+						   sendmsg->msg_iov,
+						   sendmsg->msg_offset, mlen);
+		} else {
+			if (iov != NULL)
+				lnet_copy_kiov2iov(niov, iov, offset,
+						   sendmsg->msg_niov,
+						   sendmsg->msg_kiov,
+						   sendmsg->msg_offset, mlen);
+			else
+				lnet_copy_kiov2kiov(niov, kiov, offset,
+						    sendmsg->msg_niov,
+						    sendmsg->msg_kiov,
+						    sendmsg->msg_offset, mlen);
+		}
+
+		lnet_finalize(ni, lntmsg, 0);
+	}
+
+	lnet_finalize(ni, sendmsg, 0);
+	return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+	CDEBUG (D_NET, "shutdown\n");
+	LASSERT (lolnd_instanced);
+
+	lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+	LASSERT (ni->ni_lnd == &the_lolnd);
+	LASSERT (!lolnd_instanced);
+	lolnd_instanced = 1;
+
+	return (0);
+}
+
+lnd_t the_lolnd = {
+	/* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+	/* .lnd_refcount   = */ 0,
+	/* .lnd_type       = */ LOLND,
+	/* .lnd_startup    = */ lolnd_startup,
+	/* .lnd_shutdown   = */ lolnd_shutdown,
+	/* .lnt_ctl	= */ NULL,
+	/* .lnd_send       = */ lolnd_send,
+	/* .lnd_recv       = */ lolnd_recv,
+	/* .lnd_eager_recv = */ NULL,
+	/* .lnd_notify     = */ NULL,
+	/* .lnd_accept     = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 000000000000..c8323854580a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/module.c
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+		"configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
+int
+lnet_configure (void *arg)
+{
+	/* 'arg' only there so I can be passed to cfs_create_thread() */
+	int    rc = 0;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+	if (!the_lnet.ln_niinit_self) {
+		rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+		if (rc >= 0) {
+			the_lnet.ln_niinit_self = 1;
+			rc = 0;
+		}
+	}
+
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+	return rc;
+}
+
+int
+lnet_unconfigure (void)
+{
+	int   refcount;
+
+	LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+	if (the_lnet.ln_niinit_self) {
+		the_lnet.ln_niinit_self = 0;
+		LNetNIFini();
+	}
+
+	LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+	refcount = the_lnet.ln_refcount;
+	LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+	LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+	return (refcount == 0) ? 0 : -EBUSY;
+}
+
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	int   rc;
+
+	switch (cmd) {
+	case IOC_LIBCFS_CONFIGURE:
+		return lnet_configure(NULL);
+
+	case IOC_LIBCFS_UNCONFIGURE:
+		return lnet_unconfigure();
+
+	default:
+		/* Passing LNET_PID_ANY only gives me a ref if the net is up
+		 * already; I'll need it to ensure the net can't go down while
+		 * I'm called into it */
+		rc = LNetNIInit(LNET_PID_ANY);
+		if (rc >= 0) {
+			rc = LNetCtl(cmd, data);
+			LNetNIFini();
+		}
+		return rc;
+	}
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{
+	int		  rc;
+	ENTRY;
+
+	mutex_init(&lnet_config_mutex);
+
+	rc = LNetInit();
+	if (rc != 0) {
+		CERROR("LNetInit: error %d\n", rc);
+		RETURN(rc);
+	}
+
+	rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+	LASSERT (rc == 0);
+
+	if (config_on_load) {
+		/* Have to schedule a separate thread to avoid deadlocking
+		 * in modload */
+		(void) kthread_run(lnet_configure, NULL, "lnet_initd");
+	}
+
+	RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+	int rc;
+
+	rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+	LASSERT (rc == 0);
+
+	LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 000000000000..286977691393
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+int
+lnet_peer_tables_create(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			i;
+	int			j;
+
+	the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+						   sizeof(*ptable));
+	if (the_lnet.ln_peer_tables == NULL) {
+		CERROR("Failed to allocate cpu-partition peer tables\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+		LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+				 LNET_PEER_HASH_SIZE * sizeof(*hash));
+		if (hash == NULL) {
+			CERROR("Failed to create peer hash table\n");
+			lnet_peer_tables_destroy();
+			return -ENOMEM;
+		}
+
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+			INIT_LIST_HEAD(&hash[j]);
+		ptable->pt_hash = hash; /* sign of initialization */
+	}
+
+	return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+	struct lnet_peer_table	*ptable;
+	struct list_head		*hash;
+	int			i;
+	int			j;
+
+	if (the_lnet.ln_peer_tables == NULL)
+		return;
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		hash = ptable->pt_hash;
+		if (hash == NULL) /* not intialized */
+			break;
+
+		LASSERT(list_empty(&ptable->pt_deathrow));
+
+		ptable->pt_hash = NULL;
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+			LASSERT(list_empty(&hash[j]));
+
+		LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+	}
+
+	cfs_percpt_free(the_lnet.ln_peer_tables);
+	the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+	struct lnet_peer_table	*ptable;
+	int			i;
+	int			j;
+
+	LASSERT(the_lnet.ln_shutdown);	/* i.e. no new peers */
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		lnet_net_lock(i);
+
+		for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+			struct list_head *peers = &ptable->pt_hash[j];
+
+			while (!list_empty(peers)) {
+				lnet_peer_t *lp = list_entry(peers->next,
+								 lnet_peer_t,
+								 lp_hashlist);
+				list_del_init(&lp->lp_hashlist);
+				/* lose hash table's ref */
+				lnet_peer_decref_locked(lp);
+			}
+		}
+
+		lnet_net_unlock(i);
+	}
+
+	cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+		LIST_HEAD	(deathrow);
+		lnet_peer_t	*lp;
+
+		lnet_net_lock(i);
+
+		for (j = 3; ptable->pt_number != 0; j++) {
+			lnet_net_unlock(i);
+
+			if ((j & (j - 1)) == 0) {
+				CDEBUG(D_WARNING,
+				       "Waiting for %d peers on peer table\n",
+				       ptable->pt_number);
+			}
+			cfs_pause(cfs_time_seconds(1) / 2);
+			lnet_net_lock(i);
+		}
+		list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+		lnet_net_unlock(i);
+
+		while (!list_empty(&deathrow)) {
+			lp = list_entry(deathrow.next,
+					    lnet_peer_t, lp_hashlist);
+			list_del(&lp->lp_hashlist);
+			LIBCFS_FREE(lp, sizeof(*lp));
+		}
+	}
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+	struct lnet_peer_table *ptable;
+
+	LASSERT(lp->lp_refcount == 0);
+	LASSERT(lp->lp_rtr_refcount == 0);
+	LASSERT(list_empty(&lp->lp_txq));
+	LASSERT(list_empty(&lp->lp_hashlist));
+	LASSERT(lp->lp_txqnob == 0);
+
+	ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+	LASSERT(ptable->pt_number > 0);
+	ptable->pt_number--;
+
+	lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+	lp->lp_ni = NULL;
+
+	list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+	struct list_head	*peers;
+	lnet_peer_t	*lp;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+	list_for_each_entry(lp, peers, lp_hashlist) {
+		if (lp->lp_nid == nid) {
+			lnet_peer_addref_locked(lp);
+			return lp;
+		}
+	}
+
+	return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+	struct lnet_peer_table	*ptable;
+	lnet_peer_t		*lp = NULL;
+	lnet_peer_t		*lp2;
+	int			cpt2;
+	int			rc = 0;
+
+	*lpp = NULL;
+	if (the_lnet.ln_shutdown) /* it's shutting down */
+		return -ESHUTDOWN;
+
+	/* cpt can be LNET_LOCK_EX if it's called from router functions */
+	cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+	ptable = the_lnet.ln_peer_tables[cpt2];
+	lp = lnet_find_peer_locked(ptable, nid);
+	if (lp != NULL) {
+		*lpp = lp;
+		return 0;
+	}
+
+	if (!list_empty(&ptable->pt_deathrow)) {
+		lp = list_entry(ptable->pt_deathrow.next,
+				    lnet_peer_t, lp_hashlist);
+		list_del(&lp->lp_hashlist);
+	}
+
+	/*
+	 * take extra refcount in case another thread has shutdown LNet
+	 * and destroyed locks and peer-table before I finish the allocation
+	 */
+	ptable->pt_number++;
+	lnet_net_unlock(cpt);
+
+	if (lp != NULL)
+		memset(lp, 0, sizeof(*lp));
+	else
+		LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+	if (lp == NULL) {
+		rc = -ENOMEM;
+		lnet_net_lock(cpt);
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&lp->lp_txq);
+	INIT_LIST_HEAD(&lp->lp_rtrq);
+	INIT_LIST_HEAD(&lp->lp_routes);
+
+	lp->lp_notify = 0;
+	lp->lp_notifylnd = 0;
+	lp->lp_notifying = 0;
+	lp->lp_alive_count = 0;
+	lp->lp_timestamp = 0;
+	lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+	lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+	lp->lp_last_query = 0; /* haven't asked NI yet */
+	lp->lp_ping_timestamp = 0;
+	lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+	lp->lp_nid = nid;
+	lp->lp_cpt = cpt2;
+	lp->lp_refcount = 2;	/* 1 for caller; 1 for hash */
+	lp->lp_rtr_refcount = 0;
+
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	lp2 = lnet_find_peer_locked(ptable, nid);
+	if (lp2 != NULL) {
+		*lpp = lp2;
+		goto out;
+	}
+
+	lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+	if (lp->lp_ni == NULL) {
+		rc = -EHOSTUNREACH;
+		goto out;
+	}
+
+	lp->lp_txcredits    =
+	lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+	lp->lp_rtrcredits    =
+	lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+	list_add_tail(&lp->lp_hashlist,
+			  &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+	ptable->pt_version++;
+	*lpp = lp;
+
+	return 0;
+out:
+	if (lp != NULL)
+		list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+	ptable->pt_number--;
+	return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+	char		*aliveness = "NA";
+	lnet_peer_t	*lp;
+	int		rc;
+	int		cpt;
+
+	cpt = lnet_cpt_of_nid(nid);
+	lnet_net_lock(cpt);
+
+	rc = lnet_nid2peer_locked(&lp, nid, cpt);
+	if (rc != 0) {
+		lnet_net_unlock(cpt);
+		CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+		return;
+	}
+
+	if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+		aliveness = lp->lp_alive ? "up" : "down";
+
+	CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+	       libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+	       aliveness, lp->lp_ni->ni_peertxcredits,
+	       lp->lp_rtrcredits, lp->lp_minrtrcredits,
+	       lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+	lnet_peer_decref_locked(lp);
+
+	lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 000000000000..c5ff97aaacc3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -0,0 +1,1693 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN	512	/* min value for each CPT */
+#define LNET_NRB_TINY		(LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN	4096	/* min value for each CPT */
+#define LNET_NRB_SMALL		(LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
+#define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+		"Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+		"# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+		"# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+		"# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+		"# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+		"Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	/* NI option overrides LNet default */
+	if (ni->ni_peerrtrcredits > 0)
+		return ni->ni_peerrtrcredits;
+	if (peer_buffer_credits > 0)
+		return peer_buffer_credits;
+
+	/* As an approximation, allow this peer the same number of router
+	 * buffers as it is allowed outstanding sends */
+	return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+	return 0;
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+		"Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+		"Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+		"Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+		"Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+		"Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+	return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+	if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+		CDEBUG(D_NET, "Out of date\n");
+		return;
+	}
+
+	lp->lp_timestamp = when;		/* update timestamp */
+	lp->lp_ping_deadline = 0;	       /* disable ping timeout */
+
+	if (lp->lp_alive_count != 0 &&	  /* got old news */
+	    (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+		CDEBUG(D_NET, "Old news\n");
+		return;
+	}
+
+	/* Flag that notification is outstanding */
+
+	lp->lp_alive_count++;
+	lp->lp_alive = !(!alive);	       /* 1 bit! */
+	lp->lp_notify = 1;
+	lp->lp_notifylnd |= notifylnd;
+	if (lp->lp_alive)
+		lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+	CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+	int	alive;
+	int	notifylnd;
+
+	/* Notify only in 1 thread at any time to ensure ordered notification.
+	 * NB individual events can be missed; the only guarantee is that you
+	 * always get the most recent news */
+
+	if (lp->lp_notifying)
+		return;
+
+	lp->lp_notifying = 1;
+
+	while (lp->lp_notify) {
+		alive     = lp->lp_alive;
+		notifylnd = lp->lp_notifylnd;
+
+		lp->lp_notifylnd = 0;
+		lp->lp_notify    = 0;
+
+		if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+			lnet_net_unlock(lp->lp_cpt);
+
+			/* A new notification could happen now; I'll handle it
+			 * when control returns to me */
+
+			(ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+			lnet_net_lock(lp->lp_cpt);
+		}
+	}
+
+	lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount >= 0);
+
+	/* lnet_net_lock must be exclusively locked */
+	lp->lp_rtr_refcount++;
+	if (lp->lp_rtr_refcount == 1) {
+		struct list_head *pos;
+
+		/* a simple insertion sort */
+		list_for_each_prev(pos, &the_lnet.ln_routers) {
+			lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+							  lp_rtr_list);
+
+			if (rtr->lp_nid < lp->lp_nid)
+				break;
+		}
+
+		list_add(&lp->lp_rtr_list, pos);
+		/* addref for the_lnet.ln_routers */
+		lnet_peer_addref_locked(lp);
+		the_lnet.ln_routers_version++;
+	}
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+	LASSERT(lp->lp_refcount > 0);
+	LASSERT(lp->lp_rtr_refcount > 0);
+
+	/* lnet_net_lock must be exclusively locked */
+	lp->lp_rtr_refcount--;
+	if (lp->lp_rtr_refcount == 0) {
+		LASSERT(list_empty(&lp->lp_routes));
+
+		if (lp->lp_rcd != NULL) {
+			list_add(&lp->lp_rcd->rcd_list,
+				     &the_lnet.ln_rcd_deathrow);
+			lp->lp_rcd = NULL;
+		}
+
+		list_del(&lp->lp_rtr_list);
+		/* decref for the_lnet.ln_routers */
+		lnet_peer_decref_locked(lp);
+		the_lnet.ln_routers_version++;
+	}
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+	lnet_remotenet_t	*rnet;
+	struct list_head		*tmp;
+	struct list_head		*rn_list;
+
+	LASSERT(!the_lnet.ln_shutdown);
+
+	rn_list = lnet_net2rnethash(net);
+	list_for_each(tmp, rn_list) {
+		rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+		if (rnet->lrn_net == net)
+			return rnet;
+	}
+	return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+	static int seeded = 0;
+	int lnd_type, seed[2];
+	struct timeval tv;
+	lnet_ni_t *ni;
+	struct list_head *tmp;
+
+	if (seeded)
+		return;
+
+	cfs_get_random_bytes(seed, sizeof(seed));
+
+	/* Nodes with small feet have little entropy
+	 * the NID for this node gives the most entropy in the low bits */
+	list_for_each(tmp, &the_lnet.ln_nis) {
+		ni = list_entry(tmp, lnet_ni_t, ni_list);
+		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+		if (lnd_type != LOLND)
+			seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+	}
+
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+	seeded = 1;
+	return;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+	unsigned int      len = 0;
+	unsigned int      offset = 0;
+	struct list_head       *e;
+
+	lnet_shuffle_seed();
+
+	list_for_each (e, &rnet->lrn_routes) {
+		len++;
+	}
+
+	/* len+1 positions to add a new entry, also prevents division by 0 */
+	offset = cfs_rand() % (len + 1);
+	list_for_each (e, &rnet->lrn_routes) {
+		if (offset == 0)
+			break;
+		offset--;
+	}
+	list_add(&route->lr_list, e);
+	list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+	the_lnet.ln_remote_nets_version++;
+	lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+	struct list_head	  *e;
+	lnet_remotenet_t    *rnet;
+	lnet_remotenet_t    *rnet2;
+	lnet_route_t	*route;
+	lnet_ni_t	   *ni;
+	int		  add_route;
+	int		  rc;
+
+	CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+	       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+	if (gateway == LNET_NID_ANY ||
+	    LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+	    net == LNET_NIDNET(LNET_NID_ANY) ||
+	    LNET_NETTYP(net) == LOLND ||
+	    LNET_NIDNET(gateway) == net ||
+	    hops < 1 || hops > 255)
+		return (-EINVAL);
+
+	if (lnet_islocalnet(net))	       /* it's a local network */
+		return 0;		       /* ignore the route entry */
+
+	/* Assume net, route, all new */
+	LIBCFS_ALLOC(route, sizeof(*route));
+	LIBCFS_ALLOC(rnet, sizeof(*rnet));
+	if (route == NULL || rnet == NULL) {
+		CERROR("Out of memory creating route %s %d %s\n",
+		       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+		if (route != NULL)
+			LIBCFS_FREE(route, sizeof(*route));
+		if (rnet != NULL)
+			LIBCFS_FREE(rnet, sizeof(*rnet));
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&rnet->lrn_routes);
+	rnet->lrn_net = net;
+	route->lr_hops = hops;
+	route->lr_net = net;
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+	if (rc != 0) {
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		LIBCFS_FREE(route, sizeof(*route));
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+		if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+			return 0;	/* ignore the route entry */
+		} else {
+			CERROR("Error %d creating route %s %d %s\n", rc,
+			       libcfs_net2str(net), hops,
+			       libcfs_nid2str(gateway));
+		}
+		return rc;
+	}
+
+	LASSERT (!the_lnet.ln_shutdown);
+
+	rnet2 = lnet_find_net_locked(net);
+	if (rnet2 == NULL) {
+		/* new network */
+		list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+		rnet2 = rnet;
+	}
+
+	/* Search for a duplicate route (it's a NOOP if it is) */
+	add_route = 1;
+	list_for_each (e, &rnet2->lrn_routes) {
+		lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+		if (route2->lr_gateway == route->lr_gateway) {
+			add_route = 0;
+			break;
+		}
+
+		/* our lookups must be true */
+		LASSERT (route2->lr_gateway->lp_nid != gateway);
+	}
+
+	if (add_route) {
+		lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+		lnet_add_route_to_rnet(rnet2, route);
+
+		ni = route->lr_gateway->lp_ni;
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		/* XXX Assume alive */
+		if (ni->ni_lnd->lnd_notify != NULL)
+			(ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	/* -1 for notify or !add_route */
+	lnet_peer_decref_locked(route->lr_gateway);
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	if (!add_route)
+		LIBCFS_FREE(route, sizeof(*route));
+
+	if (rnet != rnet2)
+		LIBCFS_FREE(rnet, sizeof(*rnet));
+
+	return 0;
+}
+
+int
+lnet_check_routes(void)
+{
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	lnet_route_t		*route2;
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			cpt;
+	struct list_head		*rn_list;
+	int			i;
+
+	cpt = lnet_net_lock_current();
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+		list_for_each(e1, rn_list) {
+			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+			route2 = NULL;
+			list_for_each(e2, &rnet->lrn_routes) {
+				lnet_nid_t	nid1;
+				lnet_nid_t	nid2;
+				int		net;
+
+				route = list_entry(e2, lnet_route_t,
+						       lr_list);
+
+				if (route2 == NULL) {
+					route2 = route;
+					continue;
+				}
+
+				if (route->lr_gateway->lp_ni ==
+				    route2->lr_gateway->lp_ni)
+					continue;
+
+				nid1 = route->lr_gateway->lp_nid;
+				nid2 = route2->lr_gateway->lp_nid;
+				net = rnet->lrn_net;
+
+				lnet_net_unlock(cpt);
+
+				CERROR("Routes to %s via %s and %s not "
+				       "supported\n",
+				       libcfs_net2str(net),
+				       libcfs_nid2str(nid1),
+				       libcfs_nid2str(nid2));
+				return -EINVAL;
+			}
+		}
+	}
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+	struct lnet_peer	*gateway;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	struct list_head		*e1;
+	struct list_head		*e2;
+	int			rc = -ENOENT;
+	struct list_head		*rn_list;
+	int			idx = 0;
+
+	CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+	       libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+	/* NB Caller may specify either all routes via the given gateway
+	 * or a specific route entry actual NIDs) */
+
+	lnet_net_lock(LNET_LOCK_EX);
+	if (net == LNET_NIDNET(LNET_NID_ANY))
+		rn_list = &the_lnet.ln_remote_nets_hash[0];
+	else
+		rn_list = lnet_net2rnethash(net);
+
+ again:
+	list_for_each(e1, rn_list) {
+		rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+		if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+			net == rnet->lrn_net))
+			continue;
+
+		list_for_each(e2, &rnet->lrn_routes) {
+			route = list_entry(e2, lnet_route_t, lr_list);
+
+			gateway = route->lr_gateway;
+			if (!(gw_nid == LNET_NID_ANY ||
+			      gw_nid == gateway->lp_nid))
+				continue;
+
+			list_del(&route->lr_list);
+			list_del(&route->lr_gwlist);
+			the_lnet.ln_remote_nets_version++;
+
+			if (list_empty(&rnet->lrn_routes))
+				list_del(&rnet->lrn_list);
+			else
+				rnet = NULL;
+
+			lnet_rtr_decref_locked(gateway);
+			lnet_peer_decref_locked(gateway);
+
+			lnet_net_unlock(LNET_LOCK_EX);
+
+			LIBCFS_FREE(route, sizeof(*route));
+
+			if (rnet != NULL)
+				LIBCFS_FREE(rnet, sizeof(*rnet));
+
+			rc = 0;
+			lnet_net_lock(LNET_LOCK_EX);
+			goto again;
+		}
+	}
+
+	if (net == LNET_NIDNET(LNET_NID_ANY) &&
+	    ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+		rn_list = &the_lnet.ln_remote_nets_hash[idx];
+		goto again;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{
+	lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+	       lnet_nid_t *gateway, __u32 *alive)
+{
+	struct list_head		*e1;
+	struct list_head		*e2;
+	lnet_remotenet_t	*rnet;
+	lnet_route_t		*route;
+	int			cpt;
+	int			i;
+	struct list_head		*rn_list;
+
+	cpt = lnet_net_lock_current();
+
+	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+		rn_list = &the_lnet.ln_remote_nets_hash[i];
+		list_for_each(e1, rn_list) {
+			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+			list_for_each(e2, &rnet->lrn_routes) {
+				route = list_entry(e2, lnet_route_t,
+						       lr_list);
+
+				if (idx-- == 0) {
+					*net     = rnet->lrn_net;
+					*hops    = route->lr_hops;
+					*gateway = route->lr_gateway->lp_nid;
+					*alive   = route->lr_gateway->lp_alive;
+					lnet_net_unlock(cpt);
+					return 0;
+				}
+			}
+		}
+	}
+
+	lnet_net_unlock(cpt);
+	return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+	int	       i;
+	lnet_ni_status_t *stat;
+
+	__swab32s(&info->pi_magic);
+	__swab32s(&info->pi_features);
+	__swab32s(&info->pi_pid);
+	__swab32s(&info->pi_nnis);
+	for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+		stat = &info->pi_ni[i];
+		__swab64s(&stat->ns_nid);
+		__swab32s(&stat->ns_status);
+	}
+	return;
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+	lnet_ping_info_t	*info = rcd->rcd_pinginfo;
+	struct lnet_peer	*gw   = rcd->rcd_gateway;
+	lnet_route_t		*rtr;
+
+	if (!gw->lp_alive)
+		return;
+
+	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+		lnet_swap_pinginfo(info);
+
+	/* NB always racing with network! */
+	if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+		CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+		       libcfs_nid2str(gw->lp_nid), info->pi_magic);
+		gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+		return;
+	}
+
+	gw->lp_ping_feats = info->pi_features;
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+		CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+		       libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+		return; /* nothing I can understand */
+	}
+
+	if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+		return; /* can't carry NI status info */
+
+	list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+		int	ptl_status = LNET_NI_STATUS_INVALID;
+		int	down = 0;
+		int	up = 0;
+		int	i;
+
+		for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+			lnet_ni_status_t *stat = &info->pi_ni[i];
+			lnet_nid_t	 nid = stat->ns_nid;
+
+			if (nid == LNET_NID_ANY) {
+				CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+				       libcfs_nid2str(gw->lp_nid));
+				gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+				return;
+			}
+
+			if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+				continue;
+
+			if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+				if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+					down++;
+				else if (ptl_status != LNET_NI_STATUS_UP)
+					ptl_status = LNET_NI_STATUS_DOWN;
+				continue;
+			}
+
+			if (stat->ns_status == LNET_NI_STATUS_UP) {
+				if (LNET_NIDNET(nid) == rtr->lr_net) {
+					up = 1;
+					break;
+				}
+				/* ptl NIs are considered down only when
+				 * they're all down */
+				if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+					ptl_status = LNET_NI_STATUS_UP;
+				continue;
+			}
+
+			CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+			       libcfs_nid2str(gw->lp_nid), stat->ns_status);
+			gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+			return;
+		}
+
+		if (up) { /* ignore downed NIs if NI for dest network is up */
+			rtr->lr_downis = 0;
+			continue;
+		}
+		rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+	}
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+	lnet_rc_data_t		*rcd = event->md.user_ptr;
+	struct lnet_peer	*lp;
+
+	LASSERT(rcd != NULL);
+
+	if (event->unlinked) {
+		LNetInvalidateHandle(&rcd->rcd_mdh);
+		return;
+	}
+
+	LASSERT(event->type == LNET_EVENT_SEND ||
+		event->type == LNET_EVENT_REPLY);
+
+	lp = rcd->rcd_gateway;
+	LASSERT(lp != NULL);
+
+	 /* NB: it's called with holding lnet_res_lock, we have a few
+	  * places need to hold both locks at the same time, please take
+	  * care of lock ordering */
+	lnet_net_lock(lp->lp_cpt);
+	if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+		/* ignore if no longer a router or rcd is replaced */
+		goto out;
+	}
+
+	if (event->type == LNET_EVENT_SEND) {
+		lp->lp_ping_notsent = 0;
+		if (event->status == 0)
+			goto out;
+	}
+
+	/* LNET_EVENT_REPLY */
+	/* A successful REPLY means the router is up.  If _any_ comms
+	 * to the router fail I assume it's down (this will happen if
+	 * we ping alive routers to try to detect router death before
+	 * apps get burned). */
+
+	lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+	/* The router checker will wake up very shortly and do the
+	 * actual notification.
+	 * XXX If 'lp' stops being a router before then, it will still
+	 * have the notification pending!!! */
+
+	if (avoid_asym_router_failure && event->status == 0)
+		lnet_parse_rc_info(rcd);
+
+ out:
+	lnet_net_unlock(lp->lp_cpt);
+}
+
+void
+lnet_wait_known_routerstate(void)
+{
+	lnet_peer_t	 *rtr;
+	struct list_head	  *entry;
+	int		  all_known;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	for (;;) {
+		int	cpt = lnet_net_lock_current();
+
+		all_known = 1;
+		list_for_each (entry, &the_lnet.ln_routers) {
+			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+			if (rtr->lp_alive_count == 0) {
+				all_known = 0;
+				break;
+			}
+		}
+
+		lnet_net_unlock(cpt);
+
+		if (all_known)
+			return;
+
+		cfs_pause(cfs_time_seconds(1));
+	}
+}
+
+void
+lnet_update_ni_status_locked(void)
+{
+	lnet_ni_t	*ni;
+	long		now;
+	int		timeout;
+
+	LASSERT(the_lnet.ln_routing);
+
+	timeout = router_ping_timeout +
+		  MAX(live_router_check_interval, dead_router_check_interval);
+
+	now = cfs_time_current_sec();
+	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+		if (ni->ni_lnd->lnd_type == LOLND)
+			continue;
+
+		if (now < ni->ni_last_alive + timeout)
+			continue;
+
+		lnet_ni_lock(ni);
+		/* re-check with lock */
+		if (now < ni->ni_last_alive + timeout) {
+			lnet_ni_unlock(ni);
+			continue;
+		}
+
+		LASSERT(ni->ni_status != NULL);
+
+		if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+			CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+			       libcfs_nid2str(ni->ni_nid), timeout);
+			/* NB: so far, this is the only place to set
+			 * NI status to "down" */
+			ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+		}
+		lnet_ni_unlock(ni);
+	}
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+	LASSERT(list_empty(&rcd->rcd_list));
+	/* detached from network */
+	LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+	if (rcd->rcd_gateway != NULL) {
+		int cpt = rcd->rcd_gateway->lp_cpt;
+
+		lnet_net_lock(cpt);
+		lnet_peer_decref_locked(rcd->rcd_gateway);
+		lnet_net_unlock(cpt);
+	}
+
+	if (rcd->rcd_pinginfo != NULL)
+		LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+	LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+	lnet_rc_data_t		*rcd = NULL;
+	lnet_ping_info_t	*pi;
+	int			rc;
+	int			i;
+
+	lnet_net_unlock(gateway->lp_cpt);
+
+	LIBCFS_ALLOC(rcd, sizeof(*rcd));
+	if (rcd == NULL)
+		goto out;
+
+	LNetInvalidateHandle(&rcd->rcd_mdh);
+	INIT_LIST_HEAD(&rcd->rcd_list);
+
+	LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+	if (pi == NULL)
+		goto out;
+
+	memset(pi, 0, LNET_PINGINFO_SIZE);
+	for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+		pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+		pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+	}
+	rcd->rcd_pinginfo = pi;
+
+	LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+	rc = LNetMDBind((lnet_md_t){.start     = pi,
+				    .user_ptr  = rcd,
+				    .length    = LNET_PINGINFO_SIZE,
+				    .threshold = LNET_MD_THRESH_INF,
+				    .options   = LNET_MD_TRUNCATE,
+				    .eq_handle = the_lnet.ln_rc_eqh},
+			LNET_UNLINK,
+			&rcd->rcd_mdh);
+	if (rc < 0) {
+		CERROR("Can't bind MD: %d\n", rc);
+		goto out;
+	}
+	LASSERT(rc == 0);
+
+	lnet_net_lock(gateway->lp_cpt);
+	/* router table changed or someone has created rcd for this gateway */
+	if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+		lnet_net_unlock(gateway->lp_cpt);
+		goto out;
+	}
+
+	lnet_peer_addref_locked(gateway);
+	rcd->rcd_gateway = gateway;
+	gateway->lp_rcd = rcd;
+	gateway->lp_ping_notsent = 0;
+
+	return rcd;
+
+ out:
+	if (rcd != NULL) {
+		if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+			rc = LNetMDUnlink(rcd->rcd_mdh);
+			LASSERT(rc == 0);
+		}
+		lnet_destroy_rc_data(rcd);
+	}
+
+	lnet_net_lock(gateway->lp_cpt);
+	return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+	int secs;
+
+	secs = rtr->lp_alive ? live_router_check_interval :
+			       dead_router_check_interval;
+	if (secs < 0)
+		secs = 0;
+
+	return secs;
+}
+
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+	lnet_rc_data_t *rcd = NULL;
+	cfs_time_t      now = cfs_time_current();
+	int	     secs;
+
+	lnet_peer_addref_locked(rtr);
+
+	if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+	    cfs_time_after(now, rtr->lp_ping_deadline))
+		lnet_notify_locked(rtr, 1, 0, now);
+
+	/* Run any outstanding notifications */
+	lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+	if (!lnet_isrouter(rtr) ||
+	    the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router table changed or router checker is shutting down */
+		lnet_peer_decref_locked(rtr);
+		return;
+	}
+
+	rcd = rtr->lp_rcd != NULL ?
+	      rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+	if (rcd == NULL)
+		return;
+
+	secs = lnet_router_check_interval(rtr);
+
+	CDEBUG(D_NET,
+	       "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+	       "alive_count %d lp_ping_timestamp %lu\n",
+	       libcfs_nid2str(rtr->lp_nid), secs,
+	       rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+	       rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+	if (secs != 0 && !rtr->lp_ping_notsent &&
+	    cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+					     cfs_time_seconds(secs)))) {
+		int	       rc;
+		lnet_process_id_t id;
+		lnet_handle_md_t  mdh;
+
+		id.nid = rtr->lp_nid;
+		id.pid = LUSTRE_SRV_LNET_PID;
+		CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+		rtr->lp_ping_notsent   = 1;
+		rtr->lp_ping_timestamp = now;
+
+		mdh = rcd->rcd_mdh;
+
+		if (rtr->lp_ping_deadline == 0) {
+			rtr->lp_ping_deadline =
+				cfs_time_shift(router_ping_timeout);
+		}
+
+		lnet_net_unlock(rtr->lp_cpt);
+
+		rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+			     LNET_PROTO_PING_MATCHBITS, 0);
+
+		lnet_net_lock(rtr->lp_cpt);
+		if (rc != 0)
+			rtr->lp_ping_notsent = 0; /* no event pending */
+	}
+
+	lnet_peer_decref_locked(rtr);
+	return;
+}
+
+int
+lnet_router_checker_start(void)
+{
+	int	  rc;
+	int	  eqsz;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+	if (check_routers_before_use &&
+	    dead_router_check_interval <= 0) {
+		LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
+				   " set if 'check_routers_before_use' is set"
+				   "\n");
+		return -EINVAL;
+	}
+
+	if (!the_lnet.ln_routing &&
+	    live_router_check_interval <= 0 &&
+	    dead_router_check_interval <= 0)
+		return 0;
+
+	sema_init(&the_lnet.ln_rc_signal, 0);
+	/* EQ size doesn't matter; the callback is guaranteed to get every
+	 * event */
+	eqsz = 0;
+	rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+			 &the_lnet.ln_rc_eqh);
+	if (rc != 0) {
+		CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+		return -ENOMEM;
+	}
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+	rc = PTR_ERR(kthread_run(lnet_router_checker,
+				 NULL, "router_checker"));
+	if (IS_ERR_VALUE(rc)) {
+		CERROR("Can't start router checker thread: %d\n", rc);
+		/* block until event callback signals exit */
+		down(&the_lnet.ln_rc_signal);
+		rc = LNetEQFree(the_lnet.ln_rc_eqh);
+		LASSERT(rc == 0);
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		return -ENOMEM;
+	}
+
+	if (check_routers_before_use) {
+		/* Note that a helpful side-effect of pinging all known routers
+		 * at startup is that it makes them drop stale connections they
+		 * may have to a previous instance of me. */
+		lnet_wait_known_routerstate();
+	}
+
+	return 0;
+}
+
+void
+lnet_router_checker_stop (void)
+{
+	int rc;
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+		return;
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+	the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+	/* block until event callback signals exit */
+	down(&the_lnet.ln_rc_signal);
+	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+	rc = LNetEQFree(the_lnet.ln_rc_eqh);
+	LASSERT (rc == 0);
+	return;
+}
+
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+	lnet_rc_data_t		*rcd;
+	lnet_rc_data_t		*tmp;
+	lnet_peer_t		*lp;
+	struct list_head		head;
+	int			i = 2;
+
+	if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+		   list_empty(&the_lnet.ln_rcd_deathrow) &&
+		   list_empty(&the_lnet.ln_rcd_zombie)))
+		return;
+
+	INIT_LIST_HEAD(&head);
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+		/* router checker is stopping, prune all */
+		list_for_each_entry(lp, &the_lnet.ln_routers,
+					lp_rtr_list) {
+			if (lp->lp_rcd == NULL)
+				continue;
+
+			LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+			list_add(&lp->lp_rcd->rcd_list,
+				     &the_lnet.ln_rcd_deathrow);
+			lp->lp_rcd = NULL;
+		}
+	}
+
+	/* unlink all RCDs on deathrow list */
+	list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+	if (!list_empty(&head)) {
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		list_for_each_entry(rcd, &head, rcd_list)
+			LNetMDUnlink(rcd->rcd_mdh);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+	/* release all zombie RCDs */
+	while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+		list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+					     rcd_list) {
+			if (LNetHandleIsInvalid(rcd->rcd_mdh))
+				list_move(&rcd->rcd_list, &head);
+		}
+
+		wait_unlink = wait_unlink &&
+			      !list_empty(&the_lnet.ln_rcd_zombie);
+
+		lnet_net_unlock(LNET_LOCK_EX);
+
+		while (!list_empty(&head)) {
+			rcd = list_entry(head.next,
+					     lnet_rc_data_t, rcd_list);
+			list_del_init(&rcd->rcd_list);
+			lnet_destroy_rc_data(rcd);
+		}
+
+		if (!wait_unlink)
+			return;
+
+		i++;
+		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+		       "Waiting for rc buffers to unlink\n");
+		cfs_pause(cfs_time_seconds(1) / 4);
+
+		lnet_net_lock(LNET_LOCK_EX);
+	}
+
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if  defined(LNET_ROUTER)
+
+static int
+lnet_router_checker(void *arg)
+{
+	lnet_peer_t       *rtr;
+	struct list_head	*entry;
+
+	cfs_block_allsigs();
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+		__u64	version;
+		int	cpt;
+		int	cpt2;
+
+		cpt = lnet_net_lock_current();
+rescan:
+		version = the_lnet.ln_routers_version;
+
+		list_for_each(entry, &the_lnet.ln_routers) {
+			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+			cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+			if (cpt != cpt2) {
+				lnet_net_unlock(cpt);
+				cpt = cpt2;
+				lnet_net_lock(cpt);
+				/* the routers list has changed */
+				if (version != the_lnet.ln_routers_version)
+					goto rescan;
+			}
+
+			lnet_ping_router_locked(rtr);
+
+			/* NB dropped lock */
+			if (version != the_lnet.ln_routers_version) {
+				/* the routers list has changed */
+				goto rescan;
+			}
+		}
+
+		if (the_lnet.ln_routing)
+			lnet_update_ni_status_locked();
+
+		lnet_net_unlock(cpt);
+
+		lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+		/* Call cfs_pause() here always adds 1 to load average
+		 * because kernel counts # active tasks as nr_running
+		 * + nr_uninterruptible. */
+		schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+						   cfs_time_seconds(1));
+	}
+
+	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+	lnet_prune_rc_data(1); /* wait for UNLINK */
+
+	the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+	up(&the_lnet.ln_rc_signal);
+	/* The unlink event callback will signal final completion */
+	return 0;
+}
+
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+	int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+	while (--npages >= 0)
+		__free_page(rb->rb_kiov[npages].kiov_page);
+
+	LIBCFS_FREE(rb, sz);
+}
+
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+	int	    npages = rbp->rbp_npages;
+	int	    sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+	struct page   *page;
+	lnet_rtrbuf_t *rb;
+	int	    i;
+
+	LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+	if (rb == NULL)
+		return NULL;
+
+	rb->rb_pool = rbp;
+
+	for (i = 0; i < npages; i++) {
+		page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt,
+					  __GFP_ZERO | GFP_IOFS);
+		if (page == NULL) {
+			while (--i >= 0)
+				__free_page(rb->rb_kiov[i].kiov_page);
+
+			LIBCFS_FREE(rb, sz);
+			return NULL;
+		}
+
+		rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+		rb->rb_kiov[i].kiov_offset = 0;
+		rb->rb_kiov[i].kiov_page = page;
+	}
+
+	return rb;
+}
+
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+	int		npages = rbp->rbp_npages;
+	int		nbuffers = 0;
+	lnet_rtrbuf_t	*rb;
+
+	if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+		return;
+
+	LASSERT (list_empty(&rbp->rbp_msgs));
+	LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+	while (!list_empty(&rbp->rbp_bufs)) {
+		LASSERT (rbp->rbp_credits > 0);
+
+		rb = list_entry(rbp->rbp_bufs.next,
+				    lnet_rtrbuf_t, rb_list);
+		list_del(&rb->rb_list);
+		lnet_destroy_rtrbuf(rb, npages);
+		nbuffers++;
+	}
+
+	LASSERT (rbp->rbp_nbuffers == nbuffers);
+	LASSERT (rbp->rbp_credits == nbuffers);
+
+	rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+	lnet_rtrbuf_t *rb;
+	int	    i;
+
+	if (rbp->rbp_nbuffers != 0) {
+		LASSERT (rbp->rbp_nbuffers == nbufs);
+		return 0;
+	}
+
+	for (i = 0; i < nbufs; i++) {
+		rb = lnet_new_rtrbuf(rbp, cpt);
+
+		if (rb == NULL) {
+			CERROR("Failed to allocate %d router bufs of %d pages\n",
+			       nbufs, rbp->rbp_npages);
+			return -ENOMEM;
+		}
+
+		rbp->rbp_nbuffers++;
+		rbp->rbp_credits++;
+		rbp->rbp_mincredits++;
+		list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+		/* No allocation "under fire" */
+		/* Otherwise we'd need code to schedule blocked msgs etc */
+		LASSERT (!the_lnet.ln_routing);
+	}
+
+	LASSERT (rbp->rbp_credits == nbufs);
+	return 0;
+}
+
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+	INIT_LIST_HEAD(&rbp->rbp_msgs);
+	INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+	rbp->rbp_npages = npages;
+	rbp->rbp_credits = 0;
+	rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+	lnet_rtrbufpool_t *rtrp;
+	int		  i;
+
+	if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+		return;
+
+	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+		lnet_rtrpool_free_bufs(&rtrp[0]);
+		lnet_rtrpool_free_bufs(&rtrp[1]);
+		lnet_rtrpool_free_bufs(&rtrp[2]);
+	}
+
+	cfs_percpt_free(the_lnet.ln_rtrpools);
+	the_lnet.ln_rtrpools = NULL;
+}
+
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+	int	nrbs = LNET_NRB_TINY;
+
+	if (tiny_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "tiny_router_buffers=%d invalid when "
+				   "routing enabled\n", tiny_router_buffers);
+		return -1;
+	}
+
+	if (tiny_router_buffers > 0)
+		nrbs = tiny_router_buffers;
+
+	nrbs /= LNET_CPT_NUMBER;
+	return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+	int	nrbs = LNET_NRB_SMALL;
+
+	if (small_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "small_router_buffers=%d invalid when "
+				   "routing enabled\n", small_router_buffers);
+		return -1;
+	}
+
+	if (small_router_buffers > 0)
+		nrbs = small_router_buffers;
+
+	nrbs /= LNET_CPT_NUMBER;
+	return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+	int	nrbs = LNET_NRB_LARGE;
+
+	if (large_router_buffers < 0) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "large_router_buffers=%d invalid when "
+				   "routing enabled\n", large_router_buffers);
+		return -1;
+	}
+
+	if (large_router_buffers > 0)
+		nrbs = large_router_buffers;
+
+	nrbs /= LNET_CPT_NUMBER;
+	return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+	lnet_rtrbufpool_t *rtrp;
+	int	large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int	small_pages = 1;
+	int	nrb_tiny;
+	int	nrb_small;
+	int	nrb_large;
+	int	rc;
+	int	i;
+
+	if (!strcmp(forwarding, "")) {
+		/* not set either way */
+		if (!im_a_router)
+			return 0;
+	} else if (!strcmp(forwarding, "disabled")) {
+		/* explicitly disabled */
+		return 0;
+	} else if (!strcmp(forwarding, "enabled")) {
+		/* explicitly enabled */
+	} else {
+		LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+				   "'enabled' or 'disabled'\n");
+		return -EINVAL;
+	}
+
+	nrb_tiny = lnet_nrb_tiny_calculate(0);
+	if (nrb_tiny < 0)
+		return -EINVAL;
+
+	nrb_small = lnet_nrb_small_calculate(small_pages);
+	if (nrb_small < 0)
+		return -EINVAL;
+
+	nrb_large = lnet_nrb_large_calculate(large_pages);
+	if (nrb_large < 0)
+		return -EINVAL;
+
+	the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+						LNET_NRBPOOLS *
+						sizeof(lnet_rtrbufpool_t));
+	if (the_lnet.ln_rtrpools == NULL) {
+		LCONSOLE_ERROR_MSG(0x10c,
+				   "Failed to initialize router buffe pool\n");
+		return -ENOMEM;
+	}
+
+	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+		lnet_rtrpool_init(&rtrp[0], 0);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[1], small_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+		if (rc != 0)
+			goto failed;
+
+		lnet_rtrpool_init(&rtrp[2], large_pages);
+		rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+		if (rc != 0)
+			goto failed;
+	}
+
+	lnet_net_lock(LNET_LOCK_EX);
+	the_lnet.ln_routing = 1;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	return 0;
+
+ failed:
+	lnet_rtrpools_free();
+	return rc;
+}
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	struct lnet_peer	*lp = NULL;
+	cfs_time_t		now = cfs_time_current();
+	int			cpt = lnet_cpt_of_nid(nid);
+
+	LASSERT (!in_interrupt ());
+
+	CDEBUG (D_NET, "%s notifying %s: %s\n",
+		(ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		libcfs_nid2str(nid),
+		alive ? "up" : "down");
+
+	if (ni != NULL &&
+	    LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+		CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+			libcfs_nid2str(nid), alive ? "birth" : "death",
+			libcfs_nid2str(ni->ni_nid));
+		return -EINVAL;
+	}
+
+	/* can't do predictions... */
+	if (cfs_time_after(when, now)) {
+		CWARN ("Ignoring prediction from %s of %s %s "
+		       "%ld seconds in the future\n",
+		       (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+		       libcfs_nid2str(nid), alive ? "up" : "down",
+		       cfs_duration_sec(cfs_time_sub(when, now)));
+		return -EINVAL;
+	}
+
+	if (ni != NULL && !alive &&	     /* LND telling me she's down */
+	    !auto_down) {		       /* auto-down disabled */
+		CDEBUG(D_NET, "Auto-down disabled\n");
+		return 0;
+	}
+
+	lnet_net_lock(cpt);
+
+	if (the_lnet.ln_shutdown) {
+		lnet_net_unlock(cpt);
+		return -ESHUTDOWN;
+	}
+
+	lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+	if (lp == NULL) {
+		/* nid not found */
+		lnet_net_unlock(cpt);
+		CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+		return 0;
+	}
+
+	/* We can't fully trust LND on reporting exact peer last_alive
+	 * if he notifies us about dead peer. For example ksocklnd can
+	 * call us with when == _time_when_the_node_was_booted_ if
+	 * no connections were successfully established */
+	if (ni != NULL && !alive && when < lp->lp_last_alive)
+		when = lp->lp_last_alive;
+
+	lnet_notify_locked(lp, ni == NULL, alive, when);
+
+	lnet_ni_notify_locked(ni, lp);
+
+	lnet_peer_decref_locked(lp);
+
+	lnet_net_unlock(cpt);
+	return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables (void)
+{
+	return;
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+	return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker (void)
+{
+	static time_t last = 0;
+	static int    running = 0;
+
+	time_t	    now = cfs_time_current_sec();
+	int	       interval = now - last;
+	int	       rc;
+	__u64	     version;
+	lnet_peer_t      *rtr;
+
+	/* It's no use to call me again within a sec - all intervals and
+	 * timeouts are measured in seconds */
+	if (last != 0 && interval < 2)
+		return;
+
+	if (last != 0 &&
+	    interval > MAX(live_router_check_interval,
+			   dead_router_check_interval))
+		CNETERR("Checker(%d/%d) not called for %d seconds\n",
+			live_router_check_interval, dead_router_check_interval,
+			interval);
+
+	LASSERT(LNET_CPT_NUMBER == 1);
+
+	lnet_net_lock(0);
+	LASSERT(!running); /* recursion check */
+	running = 1;
+	lnet_net_unlock(0);
+
+	last = now;
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+		lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+	/* consume all pending events */
+	while (1) {
+		int	  i;
+		lnet_event_t ev;
+
+		/* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+		 * recursion breaker in LNetEQPoll would fail */
+		rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+		if (rc == 0)   /* no event pending */
+			break;
+
+		/* NB a lost SENT prevents me from pinging a router again */
+		if (rc == -EOVERFLOW) {
+			CERROR("Dropped an event!!!\n");
+			abort();
+		}
+
+		LASSERT (rc == 1);
+
+		lnet_router_checker_event(&ev);
+	}
+
+	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+		lnet_prune_rc_data(1); /* release rcd */
+		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+		running = 0;
+		return;
+	}
+
+	LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+	lnet_net_lock(0);
+
+	version = the_lnet.ln_routers_version;
+	list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+		lnet_ping_router_locked(rtr);
+		LASSERT (version == the_lnet.ln_routers_version);
+	}
+
+	lnet_net_unlock(0);
+
+	running = 0; /* lock only needed for the recursion check */
+	return;
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables (void)
+{
+	char *s;
+
+	s = getenv("LNET_ROUTER_PING_TIMEOUT");
+	if (s != NULL) router_ping_timeout = atoi(s);
+
+	s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+	if (s != NULL) live_router_check_interval = atoi(s);
+
+	s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+	if (s != NULL) dead_router_check_interval = atoi(s);
+
+	/* This replaces old lnd_notify mechanism */
+	check_routers_before_use = 1;
+	if (dead_router_check_interval <= 0)
+		dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+	return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 000000000000..3084b0c75983
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if  defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET	 (0x100)
+enum {
+	PSDEV_LNET_STATS = 100,
+	PSDEV_LNET_ROUTES,
+	PSDEV_LNET_ROUTERS,
+	PSDEV_LNET_PEERS,
+	PSDEV_LNET_BUFFERS,
+	PSDEV_LNET_NIS,
+	PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS		(sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS	(LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS	MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS	LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS	(LNET_LOFFT_BITS -       \
+				 LNET_PROC_CPT_BITS -    \
+				 LNET_PROC_VER_BITS -    \
+				 LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS	(LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS	(LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK	((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK	((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK	((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK	((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos)				\
+	(int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos)				\
+	(int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off)		\
+	(((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) |   \
+	((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) |   \
+	((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+	((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v)	((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+static int __proc_lnet_stats(void *data, int write,
+			     loff_t pos, void *buffer, int nob)
+{
+	int	      rc;
+	lnet_counters_t *ctrs;
+	int	      len;
+	char	    *tmpstr;
+	const int	tmpsiz = 256; /* 7 %u and 4 LPU64 */
+
+	if (write) {
+		lnet_counters_reset();
+		return 0;
+	}
+
+	/* read */
+
+	LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+	if (ctrs == NULL)
+		return -ENOMEM;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL) {
+		LIBCFS_FREE(ctrs, sizeof(*ctrs));
+		return -ENOMEM;
+	}
+
+	lnet_counters_get(ctrs);
+
+	len = snprintf(tmpstr, tmpsiz,
+		       "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+		       LPU64" "LPU64,
+		       ctrs->msgs_alloc, ctrs->msgs_max,
+		       ctrs->errors,
+		       ctrs->send_count, ctrs->recv_count,
+		       ctrs->route_count, ctrs->drop_count,
+		       ctrs->send_length, ctrs->recv_length,
+		       ctrs->route_length, ctrs->drop_length);
+
+	if (pos >= min_t(int, len, strlen(tmpstr)))
+		rc = 0;
+	else
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, "\n");
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+	LIBCFS_FREE(ctrs, sizeof(*ctrs));
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+	const int	tmpsiz = 256;
+	char		*tmpstr;
+	char		*s;
+	int		rc = 0;
+	int		len;
+	int		ver;
+	int		off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	CLASSERT(sizeof(loff_t) >= 4);
+
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+			      the_lnet.ln_routing ? "enabled" : "disabled");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+			      "net", "hops", "state", "router");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_remote_nets_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*n;
+		struct list_head		*r;
+		lnet_route_t		*route = NULL;
+		lnet_remotenet_t	*rnet  = NULL;
+		int			skip  = off - 1;
+		struct list_head		*rn_list;
+		int			i;
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+			lnet_net_unlock(0);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+		     i++) {
+			rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+			n = rn_list->next;
+
+			while (n != rn_list && route == NULL) {
+				rnet = list_entry(n, lnet_remotenet_t,
+						      lrn_list);
+
+				r = rnet->lrn_routes.next;
+
+				while (r != &rnet->lrn_routes) {
+					lnet_route_t *re =
+						list_entry(r, lnet_route_t,
+							       lr_list);
+					if (skip == 0) {
+						route = re;
+						break;
+					}
+
+					skip--;
+					r = r->next;
+				}
+
+				n = n->next;
+			}
+		}
+
+		if (route != NULL) {
+			__u32	net   = rnet->lrn_net;
+			unsigned int hops  = route->lr_hops;
+			lnet_nid_t   nid   = route->lr_gateway->lp_nid;
+			int	  alive = route->lr_gateway->lp_alive;
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-8s %4u %7s %s\n",
+				      libcfs_net2str(net), hops,
+				      alive ? "up" : "down",
+				      libcfs_nid2str(nid));
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	const int  tmpsiz = 256;
+	int	len;
+	int	ver;
+	int	off;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	off = LNET_PROC_HOFF_GET(*ppos);
+	ver = LNET_PROC_VER_GET(*ppos);
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+			      "ref", "rtr_ref", "alive_cnt", "state",
+			      "last_ping", "ping_sent", "deadline",
+			      "down_ni", "router");
+		LASSERT(tmpstr + tmpsiz - s > 0);
+
+		lnet_net_lock(0);
+		ver = (unsigned int)the_lnet.ln_routers_version;
+		lnet_net_unlock(0);
+		*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+	} else {
+		struct list_head		*r;
+		struct lnet_peer	*peer = NULL;
+		int			skip = off - 1;
+
+		lnet_net_lock(0);
+
+		if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+			lnet_net_unlock(0);
+
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		r = the_lnet.ln_routers.next;
+
+		while (r != &the_lnet.ln_routers) {
+			lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+							 lp_rtr_list);
+
+			if (skip == 0) {
+				peer = lp;
+				break;
+			}
+
+			skip--;
+			r = r->next;
+		}
+
+		if (peer != NULL) {
+			lnet_nid_t nid = peer->lp_nid;
+			cfs_time_t now = cfs_time_current();
+			cfs_time_t deadline = peer->lp_ping_deadline;
+			int nrefs     = peer->lp_refcount;
+			int nrtrrefs  = peer->lp_rtr_refcount;
+			int alive_cnt = peer->lp_alive_count;
+			int alive     = peer->lp_alive;
+			int pingsent  = !peer->lp_ping_notsent;
+			int last_ping = cfs_duration_sec(cfs_time_sub(now,
+						     peer->lp_ping_timestamp));
+			int down_ni   = 0;
+			lnet_route_t *rtr;
+
+			if ((peer->lp_ping_feats &
+			     LNET_PING_FEAT_NI_STATUS) != 0) {
+				list_for_each_entry(rtr, &peer->lp_routes,
+							lr_gwlist) {
+					/* downis on any route should be the
+					 * number of downis on the gateway */
+					if (rtr->lr_downis != 0) {
+						down_ni = rtr->lr_downis;
+						break;
+					}
+				}
+			}
+
+			if (deadline == 0)
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent, "NA", down_ni,
+					      libcfs_nid2str(nid));
+			else
+				s += snprintf(s, tmpstr + tmpsiz - s,
+					      "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+					      nrefs, nrtrrefs, alive_cnt,
+					      alive ? "up" : "down", last_ping,
+					      pingsent,
+					      cfs_duration_sec(cfs_time_sub(deadline, now)),
+					      down_ni, libcfs_nid2str(nid));
+			LASSERT (tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else {
+			off += 1;
+			*ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+		}
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+	const int		tmpsiz  = 256;
+	struct lnet_peer_table	*ptable;
+	char			*tmpstr;
+	char			*s;
+	int			cpt  = LNET_PROC_CPT_GET(*ppos);
+	int			ver  = LNET_PROC_VER_GET(*ppos);
+	int			hash = LNET_PROC_HASH_GET(*ppos);
+	int			hoff = LNET_PROC_HOFF_GET(*ppos);
+	int			rc = 0;
+	int			len;
+
+	CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+	LASSERT(!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	if (cpt >= LNET_CPT_NUMBER) {
+		*lenp = 0;
+		return 0;
+	}
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+			      "nid", "refs", "state", "last", "max",
+			      "rtr", "min", "tx", "min", "queue");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+
+		hoff++;
+	} else {
+		struct lnet_peer	*peer;
+		struct list_head		*p;
+		int			skip;
+ again:
+		p = NULL;
+		peer = NULL;
+		skip = hoff - 1;
+
+		lnet_net_lock(cpt);
+		ptable = the_lnet.ln_peer_tables[cpt];
+		if (hoff == 1)
+			ver = LNET_PROC_VERSION(ptable->pt_version);
+
+		if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+			lnet_net_unlock(cpt);
+			LIBCFS_FREE(tmpstr, tmpsiz);
+			return -ESTALE;
+		}
+
+		while (hash < LNET_PEER_HASH_SIZE) {
+			if (p == NULL)
+				p = ptable->pt_hash[hash].next;
+
+			while (p != &ptable->pt_hash[hash]) {
+				lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+								 lp_hashlist);
+				if (skip == 0) {
+					peer = lp;
+
+					/* minor optimization: start from idx+1
+					 * on next iteration if we've just
+					 * drained lp_hashlist */
+					if (lp->lp_hashlist.next ==
+					    &ptable->pt_hash[hash]) {
+						hoff = 1;
+						hash++;
+					} else {
+						hoff++;
+					}
+
+					break;
+				}
+
+				skip--;
+				p = lp->lp_hashlist.next;
+			}
+
+			if (peer != NULL)
+				break;
+
+			p = NULL;
+			hoff = 1;
+			hash++;
+		}
+
+		if (peer != NULL) {
+			lnet_nid_t nid       = peer->lp_nid;
+			int	nrefs     = peer->lp_refcount;
+			int	lastalive = -1;
+			char      *aliveness = "NA";
+			int	maxcr     = peer->lp_ni->ni_peertxcredits;
+			int	txcr      = peer->lp_txcredits;
+			int	mintxcr   = peer->lp_mintxcredits;
+			int	rtrcr     = peer->lp_rtrcredits;
+			int	minrtrcr  = peer->lp_minrtrcredits;
+			int	txqnob    = peer->lp_txqnob;
+
+			if (lnet_isrouter(peer) ||
+			    lnet_peer_aliveness_enabled(peer))
+				aliveness = peer->lp_alive ? "up" : "down";
+
+			if (lnet_peer_aliveness_enabled(peer)) {
+				cfs_time_t     now = cfs_time_current();
+				cfs_duration_t delta;
+
+				delta = cfs_time_sub(now, peer->lp_last_alive);
+				lastalive = cfs_duration_sec(delta);
+
+				/* No need to mess up peers contents with
+				 * arbitrarily long integers - it suffices to
+				 * know that lastalive is more than 10000s old
+				 */
+				if (lastalive >= 10000)
+					lastalive = 9999;
+			}
+
+			lnet_net_unlock(cpt);
+
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+				      libcfs_nid2str(nid), nrefs, aliveness,
+				      lastalive, maxcr, rtrcr, minrtrcr, txcr,
+				      mintxcr, txqnob);
+			LASSERT (tmpstr + tmpsiz - s > 0);
+
+		} else { /* peer is NULL */
+			lnet_net_unlock(cpt);
+		}
+
+		if (hash == LNET_PEER_HASH_SIZE) {
+			cpt++;
+			hash = 0;
+			hoff = 1;
+			if (peer == NULL && cpt < LNET_CPT_NUMBER)
+				goto again;
+		}
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+			       loff_t pos, void *buffer, int nob)
+{
+	char	    *s;
+	char	    *tmpstr;
+	int		tmpsiz;
+	int		idx;
+	int		len;
+	int		rc;
+	int		i;
+
+	LASSERT(!write);
+
+	/* (4 %d) * 4 * LNET_CPT_NUMBER */
+	tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	s += snprintf(s, tmpstr + tmpsiz - s,
+		      "%5s %5s %7s %7s\n",
+		      "pages", "count", "credits", "min");
+	LASSERT (tmpstr + tmpsiz - s > 0);
+
+	if (the_lnet.ln_rtrpools == NULL)
+		goto out; /* I'm not a router */
+
+	for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+		lnet_rtrbufpool_t *rbp;
+
+		lnet_net_lock(LNET_LOCK_EX);
+		cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+			s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%5d %5d %7d %7d\n",
+				      rbp[idx].rbp_npages,
+				      rbp[idx].rbp_nbuffers,
+				      rbp[idx].rbp_credits,
+				      rbp[idx].rbp_mincredits);
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+		lnet_net_unlock(LNET_LOCK_EX);
+	}
+
+ out:
+	len = s - tmpstr;
+
+	if (pos >= min_t(int, len, strlen(tmpstr)))
+		rc = 0;
+	else
+		rc = cfs_trace_copyout_string(buffer, nob,
+					      tmpstr + pos, NULL);
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+	return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+	int	tmpsiz = 128 * LNET_CPT_NUMBER;
+	int	rc = 0;
+	char      *tmpstr;
+	char      *s;
+	int	len;
+
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	LASSERT (!write);
+
+	if (*lenp == 0)
+		return 0;
+
+	LIBCFS_ALLOC(tmpstr, tmpsiz);
+	if (tmpstr == NULL)
+		return -ENOMEM;
+
+	s = tmpstr; /* points to current position in tmpstr[] */
+
+	if (*ppos == 0) {
+		s += snprintf(s, tmpstr + tmpsiz - s,
+			      "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+			      "nid", "status", "alive", "refs", "peer",
+			      "rtr", "max", "tx", "min");
+		LASSERT (tmpstr + tmpsiz - s > 0);
+	} else {
+		struct list_head	*n;
+		lnet_ni_t	 *ni   = NULL;
+		int		skip = *ppos - 1;
+
+		lnet_net_lock(0);
+
+		n = the_lnet.ln_nis.next;
+
+		while (n != &the_lnet.ln_nis) {
+			lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+			if (skip == 0) {
+				ni = a_ni;
+				break;
+			}
+
+			skip--;
+			n = n->next;
+		}
+
+		if (ni != NULL) {
+			struct lnet_tx_queue	*tq;
+			char	*stat;
+			long	now = cfs_time_current_sec();
+			int	last_alive = -1;
+			int	i;
+			int	j;
+
+			if (the_lnet.ln_routing)
+				last_alive = now - ni->ni_last_alive;
+
+			/* @lo forever alive */
+			if (ni->ni_lnd->lnd_type == LOLND)
+				last_alive = 0;
+
+			lnet_ni_lock(ni);
+			LASSERT(ni->ni_status != NULL);
+			stat = (ni->ni_status->ns_status ==
+				LNET_NI_STATUS_UP) ? "up" : "down";
+			lnet_ni_unlock(ni);
+
+			/* we actually output credits information for
+			 * TX queue of each partition */
+			cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+				for (j = 0; ni->ni_cpts != NULL &&
+				     j < ni->ni_ncpts; j++) {
+					if (i == ni->ni_cpts[j])
+						break;
+				}
+
+				if (j == ni->ni_ncpts)
+					continue;
+
+				if (i != 0)
+					lnet_net_lock(i);
+
+				s += snprintf(s, tmpstr + tmpsiz - s,
+				      "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+				      libcfs_nid2str(ni->ni_nid), stat,
+				      last_alive, *ni->ni_refs[i],
+				      ni->ni_peertxcredits,
+				      ni->ni_peerrtrcredits,
+				      tq->tq_credits_max,
+				      tq->tq_credits, tq->tq_credits_min);
+				if (i != 0)
+					lnet_net_unlock(i);
+			}
+			LASSERT(tmpstr + tmpsiz - s > 0);
+		}
+
+		lnet_net_unlock(0);
+	}
+
+	len = s - tmpstr;     /* how many bytes was written */
+
+	if (len > *lenp) {    /* linux-supplied buffer is too small */
+		rc = -EINVAL;
+	} else if (len > 0) { /* wrote something */
+		if (copy_to_user(buffer, tmpstr, len))
+			rc = -EFAULT;
+		else
+			*ppos += 1;
+	}
+
+	LIBCFS_FREE(tmpstr, tmpsiz);
+
+	if (rc == 0)
+		*lenp = len;
+
+	return rc;
+}
+
+struct lnet_portal_rotors {
+	int	     pr_value;
+	const char      *pr_name;
+	const char	*pr_desc;
+};
+
+static struct lnet_portal_rotors	portal_rotors[] = {
+	{
+		.pr_value = LNET_PTL_ROTOR_OFF,
+		.pr_name  = "OFF",
+		.pr_desc  = "Turn off message rotor for wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_ON,
+		.pr_name  = "ON",
+		.pr_desc  = "round-robin dispatch all PUT messages for "
+			    "wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_RR_RT,
+		.pr_name  = "RR_RT",
+		.pr_desc  = "round-robin dispatch routed PUT message for "
+			    "wildcard portals"
+	},
+	{
+		.pr_value = LNET_PTL_ROTOR_HASH_RT,
+		.pr_name  = "HASH_RT",
+		.pr_desc  = "dispatch routed PUT message by hashing source "
+			    "NID for wildcard portals"
+	},
+	{
+		.pr_value = -1,
+		.pr_name  = NULL,
+		.pr_desc  = NULL
+	},
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+				    loff_t pos, void *buffer, int nob)
+{
+	const int	buf_len	= 128;
+	char		*buf;
+	char		*tmp;
+	int		rc;
+	int		i;
+
+	LIBCFS_ALLOC(buf, buf_len);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	if (!write) {
+		lnet_res_lock(0);
+
+		for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+			if (portal_rotors[i].pr_value == portal_rotor)
+				break;
+		}
+
+		LASSERT(portal_rotors[i].pr_value == portal_rotor);
+		lnet_res_unlock(0);
+
+		rc = snprintf(buf, buf_len,
+			      "{\n\tportals: all\n"
+			      "\trotor: %s\n\tdescription: %s\n}",
+			      portal_rotors[i].pr_name,
+			      portal_rotors[i].pr_desc);
+
+		if (pos >= min_t(int, rc, buf_len)) {
+			rc = 0;
+		} else {
+			rc = cfs_trace_copyout_string(buffer, nob,
+					buf + pos, "\n");
+		}
+		goto out;
+	}
+
+	rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+	if (rc < 0)
+		goto out;
+
+	tmp = cfs_trimwhite(buf);
+
+	rc = -EINVAL;
+	lnet_res_lock(0);
+	for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+		if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+				    strlen(portal_rotors[i].pr_name)) == 0) {
+			portal_rotor = portal_rotors[i].pr_value;
+			rc = 0;
+			break;
+		}
+	}
+	lnet_res_unlock(0);
+out:
+	LIBCFS_FREE(buf, buf_len);
+	return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+static ctl_table_t lnet_table[] = {
+	/*
+	 * NB No .strategy entries have been provided since sysctl(8) prefers
+	 * to go via /proc for portability.
+	 */
+	{
+		INIT_CTL_NAME(PSDEV_LNET_STATS)
+		.procname = "stats",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_stats,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+		.procname = "routes",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routes,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+		.procname = "routers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_routers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "peers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_peers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PEERS)
+		.procname = "buffers",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_buffers,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_NIS)
+		.procname = "nis",
+		.mode     = 0444,
+		.proc_handler = &proc_lnet_nis,
+	},
+	{
+		INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+		.procname = "portal_rotor",
+		.mode     = 0644,
+		.proc_handler = &proc_lnet_portal_rotor,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+static ctl_table_t top_table[] = {
+	{
+		INIT_CTL_NAME(CTL_LNET)
+		.procname = "lnet",
+		.mode     = 0555,
+		.data     = NULL,
+		.maxlen   = 0,
+		.child    = lnet_table,
+	},
+	{
+		INIT_CTL_NAME(0)
+	}
+};
+
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header == NULL)
+		lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+	if (lnet_table_header != NULL)
+		unregister_sysctl_table(lnet_table_header);
+
+	lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile
new file mode 100644
index 000000000000..1e40aeea2962
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o
+
+lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \
+		   module.o ping_test.o brw_test.o
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
new file mode 100644
index 000000000000..3bb6fbe23f78
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/brw_test.c
@@ -0,0 +1,499 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/brw_test.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+static int brw_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems");
+
+static int brw_inject_errors;
+CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644,
+		"# data errors to inject randomly, zero by default");
+
+static void
+brw_client_fini (sfw_test_instance_t *tsi)
+{
+	srpc_bulk_t     *bulk;
+	sfw_test_unit_t *tsu;
+
+	LASSERT (tsi->tsi_is_client);
+
+	list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+		bulk = tsu->tsu_private;
+		if (bulk == NULL) continue;
+
+		srpc_free_bulk(bulk);
+		tsu->tsu_private = NULL;
+	}
+}
+
+int
+brw_client_init (sfw_test_instance_t *tsi)
+{
+	sfw_session_t	 *sn = tsi->tsi_batch->bat_session;
+	int		  flags;
+	int		  npg;
+	int		  len;
+	int		  opc;
+	srpc_bulk_t	 *bulk;
+	sfw_test_unit_t	 *tsu;
+
+	LASSERT(sn != NULL);
+	LASSERT(tsi->tsi_is_client);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+		test_bulk_req_t  *breq = &tsi->tsi_u.bulk_v0;
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		npg   = breq->blk_npg;
+		/* NB: this is not going to work for variable page size,
+		 * but we have to keep it for compatibility */
+		len   = npg * PAGE_CACHE_SIZE;
+
+	} else {
+		test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		len   = breq->blk_len;
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	if (npg > LNET_MAX_IOV || npg <= 0)
+		return -EINVAL;
+
+	if (opc != LST_BRW_READ && opc != LST_BRW_WRITE)
+		return -EINVAL;
+
+	if (flags != LST_BRW_CHECK_NONE &&
+	    flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE)
+		return -EINVAL;
+
+	list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) {
+		bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid),
+				       npg, len, opc == LST_BRW_READ);
+		if (bulk == NULL) {
+			brw_client_fini(tsi);
+			return -ENOMEM;
+		}
+
+		tsu->tsu_private = bulk;
+	}
+
+	return 0;
+}
+
+#define BRW_POISON      0xbeefbeefbeefbeefULL
+#define BRW_MAGIC       0xeeb0eeb1eeb2eeb3ULL
+#define BRW_MSIZE       sizeof(__u64)
+
+int
+brw_inject_one_error (void)
+{
+	struct timeval tv;
+
+	if (brw_inject_errors <= 0) return 0;
+
+	do_gettimeofday(&tv);
+
+	if ((tv.tv_usec & 1) == 0) return 0;
+
+	return brw_inject_errors--;
+}
+
+void
+brw_fill_page (struct page *pg, int pattern, __u64 magic)
+{
+	char *addr = page_address(pg);
+	int   i;
+
+	LASSERT (addr != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE) return;
+
+	if (magic == BRW_MAGIC)
+		magic += brw_inject_one_error();
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		memcpy(addr, &magic, BRW_MSIZE);
+		addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+		memcpy(addr, &magic, BRW_MSIZE);
+		return;
+	}
+
+	if (pattern == LST_BRW_CHECK_FULL) {
+		for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+			memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
+		return;
+	}
+
+	LBUG ();
+	return;
+}
+
+int
+brw_check_page (struct page *pg, int pattern, __u64 magic)
+{
+	char  *addr = page_address(pg);
+	__u64  data = 0; /* make compiler happy */
+	int    i;
+
+	LASSERT (addr != NULL);
+
+	if (pattern == LST_BRW_CHECK_NONE)
+		return 0;
+
+	if (pattern == LST_BRW_CHECK_SIMPLE) {
+		data = *((__u64 *) addr);
+		if (data != magic) goto bad_data;
+
+		addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+		data = *((__u64 *) addr);
+		if (data != magic) goto bad_data;
+
+		return 0;
+	}
+
+	if (pattern == LST_BRW_CHECK_FULL) {
+		for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+			data = *(((__u64 *) addr) + i);
+			if (data != magic) goto bad_data;
+		}
+
+		return 0;
+	}
+
+	LBUG ();
+
+bad_data:
+	CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n",
+		pg, data, magic);
+	return 1;
+}
+
+void
+brw_fill_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int	 i;
+	struct page *pg;
+
+	for (i = 0; i < bk->bk_niov; i++) {
+		pg = bk->bk_iovs[i].kiov_page;
+		brw_fill_page(pg, pattern, magic);
+	}
+}
+
+int
+brw_check_bulk (srpc_bulk_t *bk, int pattern, __u64 magic)
+{
+	int	 i;
+	struct page *pg;
+
+	for (i = 0; i < bk->bk_niov; i++) {
+		pg = bk->bk_iovs[i].kiov_page;
+		if (brw_check_page(pg, pattern, magic) != 0) {
+			CERROR ("Bulk page %p (%d/%d) is corrupted!\n",
+				pg, i, bk->bk_niov);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int
+brw_client_prep_rpc (sfw_test_unit_t *tsu,
+		     lnet_process_id_t dest, srpc_client_rpc_t **rpcpp)
+{
+	srpc_bulk_t	 *bulk = tsu->tsu_private;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t	    *sn = tsi->tsi_batch->bat_session;
+	srpc_client_rpc_t   *rpc;
+	srpc_brw_reqst_t    *req;
+	int		     flags;
+	int		     npg;
+	int		     len;
+	int		     opc;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT(bulk != NULL);
+
+	if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+		test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0;
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		npg   = breq->blk_npg;
+		len   = npg * PAGE_CACHE_SIZE;
+
+	} else {
+		test_bulk_req_v1_t  *breq = &tsi->tsi_u.bulk_v1;
+
+		/* I should never get this step if it's unknown feature
+		 * because make_session will reject unknown feature */
+		LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+		opc   = breq->blk_opc;
+		flags = breq->blk_flags;
+		len   = breq->blk_len;
+		npg   = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
+	if (rc != 0)
+		return rc;
+
+	memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg]));
+	if (opc == LST_BRW_WRITE)
+		brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC);
+	else
+		brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON);
+
+	req = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+	req->brw_flags = flags;
+	req->brw_rw    = opc;
+	req->brw_len   = len;
+
+	*rpcpp = rpc;
+	return 0;
+}
+
+static void
+brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	__u64		magic = BRW_MAGIC;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+	srpc_msg_t	  *msg = &rpc->crpc_replymsg;
+	srpc_brw_reply_t    *reply = &msg->msg_body.brw_reply;
+	srpc_brw_reqst_t    *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst;
+
+	LASSERT (sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		CERROR ("BRW RPC to %s failed with %d\n",
+			libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);
+		if (!tsi->tsi_stopping) /* rpc could have been aborted */
+			atomic_inc(&sn->sn_brw_errors);
+		goto out;
+	}
+
+	if (msg->msg_magic != SRPC_MSG_MAGIC) {
+		__swab64s(&magic);
+		__swab32s(&reply->brw_status);
+	}
+
+	CDEBUG (reply->brw_status ? D_WARNING : D_NET,
+		"BRW RPC to %s finished with brw_status: %d\n",
+		libcfs_id2str(rpc->crpc_dest), reply->brw_status);
+
+	if (reply->brw_status != 0) {
+		atomic_inc(&sn->sn_brw_errors);
+		rpc->crpc_status = -(int)reply->brw_status;
+		goto out;
+	}
+
+	if (reqst->brw_rw == LST_BRW_WRITE) goto out;
+
+	if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) {
+		CERROR ("Bulk data from %s is corrupted!\n",
+			libcfs_id2str(rpc->crpc_dest));
+		atomic_inc(&sn->sn_brw_errors);
+		rpc->crpc_status = -EBADMSG;
+	}
+
+out:
+	return;
+}
+
+void
+brw_server_rpc_done (srpc_server_rpc_t *rpc)
+{
+	srpc_bulk_t *blk = rpc->srpc_bulk;
+
+	if (blk == NULL) return;
+
+	if (rpc->srpc_status != 0)
+		CERROR ("Bulk transfer %s %s has failed: %d\n",
+			blk->bk_sink ? "from" : "to",
+			libcfs_id2str(rpc->srpc_peer), rpc->srpc_status);
+	else
+		CDEBUG (D_NET, "Transfered %d pages bulk data %s %s\n",
+			blk->bk_niov, blk->bk_sink ? "from" : "to",
+			libcfs_id2str(rpc->srpc_peer));
+
+	sfw_free_pages(rpc);
+}
+
+int
+brw_bulk_ready (srpc_server_rpc_t *rpc, int status)
+{
+	__u64	     magic = BRW_MAGIC;
+	srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply;
+	srpc_brw_reqst_t *reqst;
+	srpc_msg_t       *reqstmsg;
+
+	LASSERT (rpc->srpc_bulk != NULL);
+	LASSERT (rpc->srpc_reqstbuf != NULL);
+
+	reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	reqst = &reqstmsg->msg_body.brw_reqst;
+
+	if (status != 0) {
+		CERROR ("BRW bulk %s failed for RPC from %s: %d\n",
+			reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE",
+			libcfs_id2str(rpc->srpc_peer), status);
+		return -EIO;
+	}
+
+	if (reqst->brw_rw == LST_BRW_READ)
+		return 0;
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC)
+		__swab64s(&magic);
+
+	if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) {
+		CERROR ("Bulk data from %s is corrupted!\n",
+			libcfs_id2str(rpc->srpc_peer));
+		reply->brw_status = EBADMSG;
+	}
+
+	return 0;
+}
+
+int
+brw_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t       *replymsg = &rpc->srpc_replymsg;
+	srpc_msg_t       *reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply;
+	srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst;
+	int		  npg;
+	int	       rc;
+
+	LASSERT (sv->sv_id == SRPC_SERVICE_BRW);
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+		__swab32s(&reqst->brw_rw);
+		__swab32s(&reqst->brw_len);
+		__swab32s(&reqst->brw_flags);
+		__swab64s(&reqst->brw_rpyid);
+		__swab64s(&reqst->brw_bulkid);
+	}
+	LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id));
+
+	reply->brw_status = 0;
+	rpc->srpc_done = brw_server_rpc_done;
+
+	if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) ||
+	    (reqst->brw_flags != LST_BRW_CHECK_NONE &&
+	     reqst->brw_flags != LST_BRW_CHECK_FULL &&
+	     reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		reply->brw_status = EPROTO;
+		return 0;
+	}
+
+	if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+		/* compat with old version */
+		if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) {
+			reply->brw_status = EINVAL;
+			return 0;
+		}
+		npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+
+	} else {
+		npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+	if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) {
+		reply->brw_status = EINVAL;
+		return 0;
+	}
+
+	rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg,
+			     reqst->brw_len,
+			     reqst->brw_rw == LST_BRW_WRITE);
+	if (rc != 0)
+		return rc;
+
+	if (reqst->brw_rw == LST_BRW_READ)
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC);
+	else
+		brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON);
+
+	return 0;
+}
+
+sfw_test_client_ops_t brw_test_client;
+void brw_init_test_client(void)
+{
+	brw_test_client.tso_init       = brw_client_init;
+	brw_test_client.tso_fini       = brw_client_fini;
+	brw_test_client.tso_prep_rpc   = brw_client_prep_rpc;
+	brw_test_client.tso_done_rpc   = brw_client_done_rpc;
+};
+
+srpc_service_t brw_test_service;
+void brw_init_test_service(void)
+{
+
+	brw_test_service.sv_id	 = SRPC_SERVICE_BRW;
+	brw_test_service.sv_name       = "brw_test";
+	brw_test_service.sv_handler    = brw_server_handle;
+	brw_test_service.sv_bulk_ready = brw_bulk_ready;
+	brw_test_service.sv_wi_total   = brw_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
new file mode 100644
index 000000000000..bce3d3bde6b2
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conctl.c
@@ -0,0 +1,931 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * IOC handle in kernel
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lnetst.h>
+#include "console.h"
+
+int
+lst_session_new_ioctl(lstio_session_new_args_t *args)
+{
+	char      *name;
+	int	rc;
+
+	if (args->lstio_ses_idp   == NULL || /* address for output sid */
+	    args->lstio_ses_key   == 0 || /* no key is specified */
+	    args->lstio_ses_namep == NULL || /* session name */
+	    args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_ses_namep,
+			       args->lstio_ses_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_ses_nmlen] = 0;
+
+	rc = lstcon_session_new(name,
+				args->lstio_ses_key,
+				args->lstio_ses_feats,
+				args->lstio_ses_force,
+				args->lstio_ses_timeout,
+				args->lstio_ses_idp);
+
+	LIBCFS_FREE(name, args->lstio_ses_nmlen + 1);
+	return rc;
+}
+
+int
+lst_session_end_ioctl(lstio_session_end_args_t *args)
+{
+	if (args->lstio_ses_key != console_session.ses_key)
+		return -EACCES;
+
+	return lstcon_session_end();
+}
+
+int
+lst_session_info_ioctl(lstio_session_info_args_t *args)
+{
+	/* no checking of key */
+
+	if (args->lstio_ses_idp   == NULL || /* address for ouput sid */
+	    args->lstio_ses_keyp  == NULL || /* address for ouput key */
+	    args->lstio_ses_featp  == NULL || /* address for ouput features */
+	    args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */
+	    args->lstio_ses_namep == NULL || /* address for ouput name */
+	    args->lstio_ses_nmlen <= 0 ||
+	    args->lstio_ses_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_session_info(args->lstio_ses_idp,
+				   args->lstio_ses_keyp,
+				   args->lstio_ses_featp,
+				   args->lstio_ses_ndinfo,
+				   args->lstio_ses_namep,
+				   args->lstio_ses_nmlen);
+}
+
+int
+lst_debug_ioctl(lstio_debug_args_t *args)
+{
+	char   *name   = NULL;
+	int     client = 1;
+	int     rc;
+
+	if (args->lstio_dbg_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_dbg_resultp == NULL)
+		return -EINVAL;
+
+	if (args->lstio_dbg_namep != NULL && /* name of batch/group */
+	    (args->lstio_dbg_nmlen <= 0 ||
+	     args->lstio_dbg_nmlen > LST_NAME_SIZE))
+		return -EINVAL;
+
+	if (args->lstio_dbg_namep != NULL) {
+		LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1);
+		if (name == NULL)
+			return -ENOMEM;
+
+		if (copy_from_user(name, args->lstio_dbg_namep,
+				       args->lstio_dbg_nmlen)) {
+			LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+			return -EFAULT;
+		}
+
+		name[args->lstio_dbg_nmlen] = 0;
+	}
+
+	rc = -EINVAL;
+
+	switch (args->lstio_dbg_type) {
+	case LST_OPC_SESSION:
+		rc = lstcon_session_debug(args->lstio_dbg_timeout,
+					  args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_BATCHSRV:
+		client = 0;
+	case LST_OPC_BATCHCLI:
+		if (name == NULL)
+			goto out;
+
+		rc = lstcon_batch_debug(args->lstio_dbg_timeout,
+					name, client, args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_GROUP:
+		if (name == NULL)
+			goto out;
+
+		rc = lstcon_group_debug(args->lstio_dbg_timeout,
+					name, args->lstio_dbg_resultp);
+		break;
+
+	case LST_OPC_NODES:
+		if (args->lstio_dbg_count <= 0 ||
+		    args->lstio_dbg_idsp == NULL)
+			goto out;
+
+		rc = lstcon_nodes_debug(args->lstio_dbg_timeout,
+					args->lstio_dbg_count,
+					args->lstio_dbg_idsp,
+					args->lstio_dbg_resultp);
+		break;
+
+	default:
+		break;
+	}
+
+out:
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_group_add_ioctl(lstio_group_add_args_t *args)
+{
+	char	   *name;
+	int	     rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_grp_namep,
+			       args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_add(name);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_group_del_ioctl(lstio_group_del_args_t *args)
+{
+	int     rc;
+	char   *name;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_grp_namep,
+			       args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_del(name);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_group_update_ioctl(lstio_group_update_args_t *args)
+{
+	int     rc;
+	char   *name;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			   args->lstio_grp_namep,
+			   args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	switch (args->lstio_grp_opc) {
+	case LST_GROUP_CLEAN:
+		rc = lstcon_group_clean(name, args->lstio_grp_args);
+		break;
+
+	case LST_GROUP_REFRESH:
+		rc = lstcon_group_refresh(name, args->lstio_grp_resultp);
+		break;
+
+	case LST_GROUP_RMND:
+		if (args->lstio_grp_count  <= 0 ||
+		    args->lstio_grp_idsp == NULL) {
+			rc = -EINVAL;
+			break;
+		}
+		rc = lstcon_nodes_remove(name, args->lstio_grp_count,
+					 args->lstio_grp_idsp,
+					 args->lstio_grp_resultp);
+		break;
+
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_nodes_add_ioctl(lstio_group_nodes_args_t *args)
+{
+	unsigned feats;
+	int     rc;
+	char   *name;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_idsp == NULL || /* array of ids */
+	    args->lstio_grp_count <= 0 ||
+	    args->lstio_grp_resultp == NULL ||
+	    args->lstio_grp_featp == NULL ||
+	    args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_grp_namep,
+			       args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_nodes_add(name, args->lstio_grp_count,
+			      args->lstio_grp_idsp, &feats,
+			      args->lstio_grp_resultp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+	if (rc == 0 &&
+	    copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) {
+		return -EINVAL;
+	}
+
+	return rc;
+}
+
+int
+lst_group_list_ioctl(lstio_group_list_args_t *args)
+{
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_idx   < 0 ||
+	    args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_group_list(args->lstio_grp_idx,
+			      args->lstio_grp_nmlen,
+			      args->lstio_grp_namep);
+}
+
+int
+lst_group_info_ioctl(lstio_group_info_args_t *args)
+{
+	char	   *name;
+	int	     ndent;
+	int	     index;
+	int	     rc;
+
+	if (args->lstio_grp_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_grp_namep == NULL ||
+	    args->lstio_grp_nmlen <= 0 ||
+	    args->lstio_grp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_grp_entp  == NULL && /* output: group entry */
+	    args->lstio_grp_dentsp == NULL)  /* output: node entry */
+		return -EINVAL;
+
+	if (args->lstio_grp_dentsp != NULL) { /* have node entry */
+		if (args->lstio_grp_idxp == NULL || /* node index */
+		    args->lstio_grp_ndentp == NULL) /* # of node entry */
+			return -EINVAL;
+
+		if (copy_from_user(&ndent, args->lstio_grp_ndentp,
+				       sizeof(ndent)) ||
+		    copy_from_user(&index, args->lstio_grp_idxp,
+				       sizeof(index)))
+			return -EFAULT;
+
+		if (ndent <= 0 || index < 0)
+			return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_grp_namep,
+			       args->lstio_grp_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_grp_nmlen] = 0;
+
+	rc = lstcon_group_info(name, args->lstio_grp_entp,
+			       &index, &ndent, args->lstio_grp_dentsp);
+
+	LIBCFS_FREE(name, args->lstio_grp_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	if (args->lstio_grp_dentsp != NULL &&
+	    (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) ||
+	     copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent))))
+		rc = -EFAULT;
+
+	return 0;
+}
+
+int
+lst_batch_add_ioctl(lstio_batch_add_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_namep == NULL ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_bat_namep,
+			       args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_add(name);
+
+	LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_batch_run_ioctl(lstio_batch_run_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_namep == NULL ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_bat_namep,
+			       args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_run(name, args->lstio_bat_timeout,
+			      args->lstio_bat_resultp);
+
+	LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_batch_stop_ioctl(lstio_batch_stop_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_resultp == NULL ||
+	    args->lstio_bat_namep == NULL ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_bat_namep,
+			       args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_stop(name, args->lstio_bat_force,
+			       args->lstio_bat_resultp);
+
+	LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_batch_query_ioctl(lstio_batch_query_args_t *args)
+{
+	char   *name;
+	int     rc;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_resultp == NULL ||
+	    args->lstio_bat_namep == NULL ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_bat_testidx < 0)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_bat_namep,
+			       args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_test_batch_query(name,
+				     args->lstio_bat_testidx,
+				     args->lstio_bat_client,
+				     args->lstio_bat_timeout,
+				     args->lstio_bat_resultp);
+
+	LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+	return rc;
+}
+
+int
+lst_batch_list_ioctl(lstio_batch_list_args_t *args)
+{
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_idx   < 0 ||
+	    args->lstio_bat_namep == NULL ||
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	return lstcon_batch_list(args->lstio_bat_idx,
+			      args->lstio_bat_nmlen,
+			      args->lstio_bat_namep);
+}
+
+int
+lst_batch_info_ioctl(lstio_batch_info_args_t *args)
+{
+	char	   *name;
+	int	     rc;
+	int	     index;
+	int	     ndent;
+
+	if (args->lstio_bat_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_bat_namep == NULL || /* batch name */
+	    args->lstio_bat_nmlen <= 0 ||
+	    args->lstio_bat_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_bat_entp == NULL && /* output: batch entry */
+	    args->lstio_bat_dentsp == NULL) /* output: node entry */
+		return -EINVAL;
+
+	if (args->lstio_bat_dentsp != NULL) { /* have node entry */
+		if (args->lstio_bat_idxp == NULL || /* node index */
+		    args->lstio_bat_ndentp == NULL) /* # of node entry */
+			return -EINVAL;
+
+		if (copy_from_user(&index, args->lstio_bat_idxp,
+				       sizeof(index)) ||
+		    copy_from_user(&ndent, args->lstio_bat_ndentp,
+				       sizeof(ndent)))
+			return -EFAULT;
+
+		if (ndent <= 0 || index < 0)
+			return -EINVAL;
+	}
+
+	LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name,
+			       args->lstio_bat_namep, args->lstio_bat_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+		return -EFAULT;
+	}
+
+	name[args->lstio_bat_nmlen] = 0;
+
+	rc = lstcon_batch_info(name,
+			    args->lstio_bat_entp, args->lstio_bat_server,
+			    args->lstio_bat_testidx, &index, &ndent,
+			    args->lstio_bat_dentsp);
+
+	LIBCFS_FREE(name, args->lstio_bat_nmlen + 1);
+
+	if (rc != 0)
+		return rc;
+
+	if (args->lstio_bat_dentsp != NULL &&
+	    (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) ||
+	     copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent))))
+		rc = -EFAULT;
+
+	return rc;
+}
+
+int
+lst_stat_query_ioctl(lstio_stat_args_t *args)
+{
+	int	     rc;
+	char	   *name;
+
+	/* TODO: not finished */
+	if (args->lstio_sta_key != console_session.ses_key)
+		return -EACCES;
+
+	if (args->lstio_sta_resultp == NULL ||
+	    (args->lstio_sta_namep  == NULL &&
+	     args->lstio_sta_idsp   == NULL) ||
+	    args->lstio_sta_nmlen <= 0 ||
+	    args->lstio_sta_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_sta_idsp != NULL &&
+	    args->lstio_sta_count <= 0)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1);
+	if (name == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(name, args->lstio_sta_namep,
+			       args->lstio_sta_nmlen)) {
+		LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+		return -EFAULT;
+	}
+
+	if (args->lstio_sta_idsp == NULL) {
+		rc = lstcon_group_stat(name, args->lstio_sta_timeout,
+				       args->lstio_sta_resultp);
+	} else {
+		rc = lstcon_nodes_stat(args->lstio_sta_count,
+				       args->lstio_sta_idsp,
+				       args->lstio_sta_timeout,
+				       args->lstio_sta_resultp);
+	}
+
+	LIBCFS_FREE(name, args->lstio_sta_nmlen + 1);
+
+	return rc;
+}
+
+int lst_test_add_ioctl(lstio_test_args_t *args)
+{
+	char	   *name;
+	char	   *srcgrp = NULL;
+	char	   *dstgrp = NULL;
+	void	   *param = NULL;
+	int	     ret = 0;
+	int	     rc = -ENOMEM;
+
+	if (args->lstio_tes_resultp == NULL ||
+	    args->lstio_tes_retp == NULL ||
+	    args->lstio_tes_bat_name == NULL || /* no specified batch */
+	    args->lstio_tes_bat_nmlen <= 0 ||
+	    args->lstio_tes_bat_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_sgrp_name == NULL || /* no source group */
+	    args->lstio_tes_sgrp_nmlen <= 0 ||
+	    args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE ||
+	    args->lstio_tes_dgrp_name == NULL || /* no target group */
+	    args->lstio_tes_dgrp_nmlen <= 0 ||
+	    args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE)
+		return -EINVAL;
+
+	if (args->lstio_tes_loop == 0 || /* negative is infinite */
+	    args->lstio_tes_concur <= 0 ||
+	    args->lstio_tes_dist <= 0 ||
+	    args->lstio_tes_span <= 0)
+		return -EINVAL;
+
+	/* have parameter, check if parameter length is valid */
+	if (args->lstio_tes_param != NULL &&
+	    (args->lstio_tes_param_len <= 0 ||
+	     args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+		return -EINVAL;
+
+	LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1);
+	if (name == NULL)
+		return rc;
+
+	LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+	if (srcgrp == NULL)
+		goto out;
+
+	LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+	 if (dstgrp == NULL)
+		goto out;
+
+	if (args->lstio_tes_param != NULL) {
+		LIBCFS_ALLOC(param, args->lstio_tes_param_len);
+		if (param == NULL)
+			goto out;
+	}
+
+	rc = -EFAULT;
+	if (copy_from_user(name,
+			      args->lstio_tes_bat_name,
+			      args->lstio_tes_bat_nmlen) ||
+	    copy_from_user(srcgrp,
+			      args->lstio_tes_sgrp_name,
+			      args->lstio_tes_sgrp_nmlen) ||
+	    copy_from_user(dstgrp,
+			      args->lstio_tes_dgrp_name,
+			      args->lstio_tes_dgrp_nmlen) ||
+	    copy_from_user(param, args->lstio_tes_param,
+			      args->lstio_tes_param_len))
+		goto out;
+
+	rc = lstcon_test_add(name,
+			    args->lstio_tes_type,
+			    args->lstio_tes_loop,
+			    args->lstio_tes_concur,
+			    args->lstio_tes_dist, args->lstio_tes_span,
+			    srcgrp, dstgrp, param, args->lstio_tes_param_len,
+			    &ret, args->lstio_tes_resultp);
+
+	if (ret != 0)
+		rc = (copy_to_user(args->lstio_tes_retp, &ret,
+				       sizeof(ret))) ? -EFAULT : 0;
+out:
+	if (name != NULL)
+		LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1);
+
+	if (srcgrp != NULL)
+		LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1);
+
+	if (dstgrp != NULL)
+		LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1);
+
+	if (param != NULL)
+		LIBCFS_FREE(param, args->lstio_tes_param_len);
+
+	return rc;
+}
+
+int
+lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+	char   *buf;
+	int     opc = data->ioc_u32[0];
+	int     rc;
+
+	if (cmd != IOC_LIBCFS_LNETST)
+		return -EINVAL;
+
+	if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+		return -EINVAL;
+
+	LIBCFS_ALLOC(buf, data->ioc_plen1);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	/* copy in parameter */
+	if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) {
+		LIBCFS_FREE(buf, data->ioc_plen1);
+		return -EFAULT;
+	}
+
+	mutex_lock(&console_session.ses_mutex);
+
+	console_session.ses_laststamp = cfs_time_current_sec();
+
+	if (console_session.ses_shutdown) {
+		rc = -ESHUTDOWN;
+		goto out;
+	}
+
+	if (console_session.ses_expired)
+		lstcon_session_end();
+
+	if (opc != LSTIO_SESSION_NEW &&
+	    console_session.ses_state == LST_SESSION_NONE) {
+		CDEBUG(D_NET, "LST no active session\n");
+		rc = -ESRCH;
+		goto out;
+	}
+
+	memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t));
+
+	switch (opc) {
+		case LSTIO_SESSION_NEW:
+			rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf);
+			break;
+		case LSTIO_SESSION_END:
+			rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf);
+			break;
+		case LSTIO_SESSION_INFO:
+			rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf);
+			break;
+		case LSTIO_DEBUG:
+			rc = lst_debug_ioctl((lstio_debug_args_t *)buf);
+			break;
+		case LSTIO_GROUP_ADD:
+			rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf);
+			break;
+		case LSTIO_GROUP_DEL:
+			rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf);
+			break;
+		case LSTIO_GROUP_UPDATE:
+			rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf);
+			break;
+		case LSTIO_NODES_ADD:
+			rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf);
+			break;
+		case LSTIO_GROUP_LIST:
+			rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf);
+			break;
+		case LSTIO_GROUP_INFO:
+			rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf);
+			break;
+		case LSTIO_BATCH_ADD:
+			rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf);
+			break;
+		case LSTIO_BATCH_START:
+			rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf);
+			break;
+		case LSTIO_BATCH_STOP:
+			rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf);
+			break;
+		case LSTIO_BATCH_QUERY:
+			rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf);
+			break;
+		case LSTIO_BATCH_LIST:
+			rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf);
+			break;
+		case LSTIO_BATCH_INFO:
+			rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf);
+			break;
+		case LSTIO_TEST_ADD:
+			rc = lst_test_add_ioctl((lstio_test_args_t *)buf);
+			break;
+		case LSTIO_STAT_QUERY:
+			rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf);
+			break;
+		default:
+			rc = -EINVAL;
+	}
+
+	if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat,
+			     sizeof(lstcon_trans_stat_t)))
+		rc = -EFAULT;
+out:
+	mutex_unlock(&console_session.ses_mutex);
+
+	LIBCFS_FREE(buf, data->ioc_plen1);
+
+	return rc;
+}
+
+EXPORT_SYMBOL(lstcon_ioctl_entry);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
new file mode 100644
index 000000000000..446de0e4672f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.c
@@ -0,0 +1,1397 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Console framework rpcs
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "timer.h"
+#include "conrpc.h"
+#include "console.h"
+
+void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *,
+			   lstcon_node_t *, lstcon_trans_stat_t *);
+
+static void
+lstcon_rpc_done(srpc_client_rpc_t *rpc)
+{
+	lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv;
+
+	LASSERT(crpc != NULL && rpc == crpc->crp_rpc);
+	LASSERT(crpc->crp_posted && !crpc->crp_finished);
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (crpc->crp_trans == NULL) {
+		/* Orphan RPC is not in any transaction,
+		 * I'm just a poor body and nobody loves me */
+		spin_unlock(&rpc->crpc_lock);
+
+		/* release it */
+		lstcon_rpc_put(crpc);
+		return;
+	}
+
+	/* not an orphan RPC */
+	crpc->crp_finished = 1;
+
+	if (crpc->crp_stamp == 0) {
+		/* not aborted */
+		LASSERT (crpc->crp_status == 0);
+
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = rpc->crpc_status;
+	}
+
+	/* wakeup (transaction)thread if I'm the last RPC in the transaction */
+	if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining))
+		wake_up(&crpc->crp_trans->tas_waitq);
+
+	spin_unlock(&rpc->crpc_lock);
+}
+
+int
+lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc)
+{
+	crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service,
+				       feats, bulk_npg, bulk_len,
+				       lstcon_rpc_done, (void *)crpc);
+	if (crpc->crp_rpc == NULL)
+		return -ENOMEM;
+
+	crpc->crp_trans    = NULL;
+	crpc->crp_node     = nd;
+	crpc->crp_posted   = 0;
+	crpc->crp_finished = 0;
+	crpc->crp_unpacked = 0;
+	crpc->crp_status   = 0;
+	crpc->crp_stamp    = 0;
+	crpc->crp_embedded = embedded;
+	INIT_LIST_HEAD(&crpc->crp_link);
+
+	atomic_inc(&console_session.ses_rpc_counter);
+
+	return 0;
+}
+
+int
+lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats,
+		int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp)
+{
+	lstcon_rpc_t  *crpc = NULL;
+	int	    rc;
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	if (!list_empty(&console_session.ses_rpc_freelist)) {
+		crpc = list_entry(console_session.ses_rpc_freelist.next,
+				      lstcon_rpc_t, crp_link);
+		list_del_init(&crpc->crp_link);
+	}
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (crpc == NULL) {
+		LIBCFS_ALLOC(crpc, sizeof(*crpc));
+		if (crpc == NULL)
+			return -ENOMEM;
+	}
+
+	rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc);
+	if (rc == 0) {
+		*crpcpp = crpc;
+		return 0;
+	}
+
+	LIBCFS_FREE(crpc, sizeof(*crpc));
+
+	return rc;
+}
+
+void
+lstcon_rpc_put(lstcon_rpc_t *crpc)
+{
+	srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk;
+	int	  i;
+
+	LASSERT (list_empty(&crpc->crp_link));
+
+	for (i = 0; i < bulk->bk_niov; i++) {
+		if (bulk->bk_iovs[i].kiov_page == NULL)
+			continue;
+
+		__free_page(bulk->bk_iovs[i].kiov_page);
+	}
+
+	srpc_client_rpc_decref(crpc->crp_rpc);
+
+	if (crpc->crp_embedded) {
+		/* embedded RPC, don't recycle it */
+		memset(crpc, 0, sizeof(*crpc));
+		crpc->crp_embedded = 1;
+
+	} else {
+		spin_lock(&console_session.ses_rpc_lock);
+
+		list_add(&crpc->crp_link,
+			     &console_session.ses_rpc_freelist);
+
+		spin_unlock(&console_session.ses_rpc_lock);
+	}
+
+	/* RPC is not alive now */
+	atomic_dec(&console_session.ses_rpc_counter);
+}
+
+void
+lstcon_rpc_post(lstcon_rpc_t *crpc)
+{
+	lstcon_rpc_trans_t *trans = crpc->crp_trans;
+
+	LASSERT (trans != NULL);
+
+	atomic_inc(&trans->tas_remaining);
+	crpc->crp_posted = 1;
+
+	sfw_post_rpc(crpc->crp_rpc);
+}
+
+static char *
+lstcon_rpc_trans_name(int transop)
+{
+	if (transop == LST_TRANS_SESNEW)
+		return "SESNEW";
+
+	if (transop == LST_TRANS_SESEND)
+		return "SESEND";
+
+	if (transop == LST_TRANS_SESQRY)
+		return "SESQRY";
+
+	if (transop == LST_TRANS_SESPING)
+		return "SESPING";
+
+	if (transop == LST_TRANS_TSBCLIADD)
+		return "TSBCLIADD";
+
+	if (transop == LST_TRANS_TSBSRVADD)
+		return "TSBSRVADD";
+
+	if (transop == LST_TRANS_TSBRUN)
+		return "TSBRUN";
+
+	if (transop == LST_TRANS_TSBSTOP)
+		return "TSBSTOP";
+
+	if (transop == LST_TRANS_TSBCLIQRY)
+		return "TSBCLIQRY";
+
+	if (transop == LST_TRANS_TSBSRVQRY)
+		return "TSBSRVQRY";
+
+	if (transop == LST_TRANS_STATQRY)
+		return "STATQRY";
+
+	return "Unknown";
+}
+
+int
+lstcon_rpc_trans_prep(struct list_head *translist,
+		      int transop, lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+
+	if (translist != NULL) {
+		list_for_each_entry(trans, translist, tas_link) {
+			/* Can't enqueue two private transaction on
+			 * the same object */
+			if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE)
+				return -EPERM;
+		}
+	}
+
+	/* create a trans group */
+	LIBCFS_ALLOC(trans, sizeof(*trans));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	trans->tas_opc = transop;
+
+	if (translist == NULL)
+		INIT_LIST_HEAD(&trans->tas_olink);
+	else
+		list_add_tail(&trans->tas_olink, translist);
+
+	list_add_tail(&trans->tas_link, &console_session.ses_trans_list);
+
+	INIT_LIST_HEAD(&trans->tas_rpcs_list);
+	atomic_set(&trans->tas_remaining, 0);
+	init_waitqueue_head(&trans->tas_waitq);
+
+	spin_lock(&console_session.ses_rpc_lock);
+	trans->tas_features = console_session.ses_features;
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	*transpp = trans;
+	return 0;
+}
+
+void
+lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc)
+{
+	list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list);
+	crpc->crp_trans = trans;
+}
+
+void
+lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_node_t     *nd;
+
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		if (!crpc->crp_posted || /* not posted */
+		    crpc->crp_stamp != 0) { /* rpc done or aborted already */
+			if (crpc->crp_stamp == 0) {
+				crpc->crp_stamp = cfs_time_current();
+				crpc->crp_status = -EINTR;
+			}
+			spin_unlock(&rpc->crpc_lock);
+			continue;
+		}
+
+		crpc->crp_stamp  = cfs_time_current();
+		crpc->crp_status = error;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		sfw_abort_rpc(rpc);
+
+		if  (error != ETIMEDOUT)
+			continue;
+
+		nd = crpc->crp_node;
+		if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+			continue;
+
+		nd->nd_stamp = crpc->crp_stamp;
+		nd->nd_state = LST_NODE_DOWN;
+	}
+}
+
+static int
+lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans)
+{
+	if (console_session.ses_shutdown &&
+	    !list_empty(&trans->tas_olink)) /* Not an end session RPC */
+		return 1;
+
+	return (atomic_read(&trans->tas_remaining) == 0) ? 1: 0;
+}
+
+int
+lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout)
+{
+	lstcon_rpc_t  *crpc;
+	int	    rc;
+
+	if (list_empty(&trans->tas_rpcs_list))
+		return 0;
+
+	if (timeout < LST_TRANS_MIN_TIMEOUT)
+		timeout = LST_TRANS_MIN_TIMEOUT;
+
+	CDEBUG(D_NET, "Transaction %s started\n",
+	       lstcon_rpc_trans_name(trans->tas_opc));
+
+	/* post all requests */
+	list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) {
+		LASSERT (!crpc->crp_posted);
+
+		lstcon_rpc_post(crpc);
+	}
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	rc = wait_event_interruptible_timeout(trans->tas_waitq,
+					      lstcon_rpc_trans_check(trans),
+					      cfs_time_seconds(timeout));
+	rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown)
+		rc = -ESHUTDOWN;
+
+	if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) {
+		/* treat short timeout as canceled */
+		if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2)
+			rc = -EINTR;
+
+		lstcon_rpc_trans_abort(trans, rc);
+	}
+
+	CDEBUG(D_NET, "Transaction %s stopped: %d\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), rc);
+
+	lstcon_rpc_trans_stat(trans, lstcon_trans_stat());
+
+	return rc;
+}
+
+int
+lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp)
+{
+	lstcon_node_t	*nd  = crpc->crp_node;
+	srpc_client_rpc_t    *rpc = crpc->crp_rpc;
+	srpc_generic_reply_t *rep;
+
+	LASSERT (nd != NULL && rpc != NULL);
+	LASSERT (crpc->crp_stamp != 0);
+
+	if (crpc->crp_status != 0) {
+		*msgpp = NULL;
+		return crpc->crp_status;
+	}
+
+	*msgpp = &rpc->crpc_replymsg;
+	if (!crpc->crp_unpacked) {
+		sfw_unpack_message(*msgpp);
+		crpc->crp_unpacked = 1;
+	}
+
+	if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp))
+		return 0;
+
+	nd->nd_stamp = crpc->crp_stamp;
+	rep = &(*msgpp)->msg_body.reply;
+
+	if (rep->sid.ses_nid == LNET_NID_ANY)
+		nd->nd_state = LST_NODE_UNKNOWN;
+	else if (lstcon_session_match(rep->sid))
+		nd->nd_state = LST_NODE_ACTIVE;
+	else
+		nd->nd_state = LST_NODE_BUSY;
+
+	return 0;
+}
+
+void
+lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat)
+{
+	lstcon_rpc_t      *crpc;
+	srpc_msg_t	*rep;
+	int		error;
+
+	LASSERT (stat != NULL);
+
+	memset(stat, 0, sizeof(*stat));
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		lstcon_rpc_stat_total(stat, 1);
+
+		LASSERT (crpc->crp_stamp != 0);
+
+		error = lstcon_rpc_get_reply(crpc, &rep);
+		if (error != 0) {
+			lstcon_rpc_stat_failure(stat, 1);
+			if (stat->trs_rpc_errno == 0)
+				stat->trs_rpc_errno = -error;
+
+			continue;
+		}
+
+		lstcon_rpc_stat_success(stat, 1);
+
+		lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat);
+	}
+
+	if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) {
+		stat->trs_fwk_errno =
+		      lstcon_session_feats_check(trans->tas_features);
+	}
+
+	CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, "
+		      "RPC error(%d), Framework error(%d)\n",
+	       lstcon_rpc_trans_name(trans->tas_opc),
+	       lstcon_rpc_stat_success(stat, 0),
+	       lstcon_rpc_stat_failure(stat, 0),
+	       lstcon_rpc_stat_total(stat, 0),
+	       stat->trs_rpc_errno, stat->trs_fwk_errno);
+
+	return;
+}
+
+int
+lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+			     struct list_head *head_up,
+			     lstcon_rpc_readent_func_t readent)
+{
+	struct list_head	    tmp;
+	struct list_head	   *next;
+	lstcon_rpc_ent_t     *ent;
+	srpc_generic_reply_t *rep;
+	lstcon_rpc_t	 *crpc;
+	srpc_msg_t	   *msg;
+	lstcon_node_t	*nd;
+	cfs_duration_t	dur;
+	struct timeval	tv;
+	int		   error;
+
+	LASSERT (head_up != NULL);
+
+	next = head_up;
+
+	list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) {
+		if (copy_from_user(&tmp, next,
+				       sizeof(struct list_head)))
+			return -EFAULT;
+
+		if (tmp.next == head_up)
+			return 0;
+
+		next = tmp.next;
+
+		ent = list_entry(next, lstcon_rpc_ent_t, rpe_link);
+
+		LASSERT (crpc->crp_stamp != 0);
+
+		error = lstcon_rpc_get_reply(crpc, &msg);
+
+		nd = crpc->crp_node;
+
+		dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp,
+		      (cfs_time_t)console_session.ses_id.ses_stamp);
+		cfs_duration_usec(dur, &tv);
+
+		if (copy_to_user(&ent->rpe_peer,
+				     &nd->nd_id, sizeof(lnet_process_id_t)) ||
+		    copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) ||
+		    copy_to_user(&ent->rpe_state,
+				     &nd->nd_state, sizeof(nd->nd_state)) ||
+		    copy_to_user(&ent->rpe_rpc_errno, &error,
+				     sizeof(error)))
+			return -EFAULT;
+
+		if (error != 0)
+			continue;
+
+		/* RPC is done */
+		rep = (srpc_generic_reply_t *)&msg->msg_body.reply;
+
+		if (copy_to_user(&ent->rpe_sid,
+				     &rep->sid, sizeof(lst_sid_t)) ||
+		    copy_to_user(&ent->rpe_fwk_errno,
+				     &rep->status, sizeof(rep->status)))
+			return -EFAULT;
+
+		if (readent == NULL)
+			continue;
+
+		if ((error = readent(trans->tas_opc, msg, ent)) != 0)
+			return error;
+	}
+
+	return 0;
+}
+
+void
+lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans)
+{
+	srpc_client_rpc_t *rpc;
+	lstcon_rpc_t      *crpc;
+	lstcon_rpc_t      *tmp;
+	int		count = 0;
+
+	list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list,
+				 crp_link) {
+		rpc = crpc->crp_rpc;
+
+		spin_lock(&rpc->crpc_lock);
+
+		/* free it if not posted or finished already */
+		if (!crpc->crp_posted || crpc->crp_finished) {
+			spin_unlock(&rpc->crpc_lock);
+
+			list_del_init(&crpc->crp_link);
+			lstcon_rpc_put(crpc);
+
+			continue;
+		}
+
+		/* rpcs can be still not callbacked (even LNetMDUnlink is called)
+		 * because huge timeout for inaccessible network, don't make
+		 * user wait for them, just abandon them, they will be recycled
+		 * in callback */
+
+		LASSERT (crpc->crp_status != 0);
+
+		crpc->crp_node  = NULL;
+		crpc->crp_trans = NULL;
+		list_del_init(&crpc->crp_link);
+		count ++;
+
+		spin_unlock(&rpc->crpc_lock);
+
+		atomic_dec(&trans->tas_remaining);
+	}
+
+	LASSERT (atomic_read(&trans->tas_remaining) == 0);
+
+	list_del(&trans->tas_link);
+	if (!list_empty(&trans->tas_olink))
+		list_del(&trans->tas_olink);
+
+	CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n",
+	       lstcon_rpc_trans_name(trans->tas_opc), count);
+
+	LIBCFS_FREE(trans, sizeof(*trans));
+
+	return;
+}
+
+int
+lstcon_sesrpc_prep(lstcon_node_t *nd, int transop,
+		   unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_mksn_reqst_t *msrq;
+	srpc_rmsn_reqst_t *rsrq;
+	int		rc;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst;
+		msrq->mksn_sid     = console_session.ses_id;
+		msrq->mksn_force   = console_session.ses_force;
+		strncpy(msrq->mksn_name, console_session.ses_name,
+			strlen(console_session.ses_name));
+		break;
+
+	case LST_TRANS_SESEND:
+		rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION,
+				     feats, 0, 0, crpc);
+		if (rc != 0)
+			return rc;
+
+		rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst;
+		rsrq->rmsn_sid = console_session.ses_id;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	return 0;
+}
+
+int
+lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_debug_reqst_t *drq;
+	int		    rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+	drq->dbg_sid   = console_session.ses_id;
+	drq->dbg_flags = 0;
+
+	return rc;
+}
+
+int
+lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		   lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc)
+{
+	lstcon_batch_t	   *batch;
+	srpc_batch_reqst_t *brq;
+	int		    rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst;
+
+	brq->bar_sid     = console_session.ses_id;
+	brq->bar_bid     = tsb->tsb_id;
+	brq->bar_testidx = tsb->tsb_index;
+	brq->bar_opc     = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN :
+			   (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP:
+			    SRPC_BATCH_OPC_QUERY);
+
+	if (transop != LST_TRANS_TSBRUN &&
+	    transop != LST_TRANS_TSBSTOP)
+		return 0;
+
+	LASSERT (tsb->tsb_index == 0);
+
+	batch = (lstcon_batch_t *)tsb;
+	brq->bar_arg = batch->bat_arg;
+
+	return 0;
+}
+
+int
+lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc)
+{
+	srpc_stat_reqst_t *srq;
+	int		   rc;
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc);
+	if (rc != 0)
+		return rc;
+
+	srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst;
+
+	srq->str_sid  = console_session.ses_id;
+	srq->str_type = 0; /* XXX remove it */
+
+	return 0;
+}
+
+lnet_process_id_packed_t *
+lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t *pid;
+	int		       i;
+
+	i = idx / SFW_ID_PER_PAGE;
+
+	LASSERT (i < nkiov);
+
+	pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page);
+
+	return &pid[idx % SFW_ID_PER_PAGE];
+}
+
+int
+lstcon_dstnodes_prep(lstcon_group_t *grp, int idx,
+		     int dist, int span, int nkiov, lnet_kiov_t *kiov)
+{
+	lnet_process_id_packed_t *pid;
+	lstcon_ndlink_t	  *ndl;
+	lstcon_node_t	    *nd;
+	int		       start;
+	int		       end;
+	int		       i = 0;
+
+	LASSERT (dist >= 1);
+	LASSERT (span >= 1);
+	LASSERT (grp->grp_nnode >= 1);
+
+	if (span > grp->grp_nnode)
+		return -EINVAL;
+
+	start = ((idx / dist) * span) % grp->grp_nnode;
+	end   = ((idx / dist) * span + span - 1) % grp->grp_nnode;
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+		if (i < start) {
+			i ++;
+			continue;
+		}
+
+		if (i > (end >= start ? end: grp->grp_nnode))
+			break;
+
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	if (start <= end) /* done */
+		return 0;
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) {
+		if (i > grp->grp_nnode + end)
+			break;
+
+		nd = ndl->ndl_node;
+		pid = lstcon_next_id((i - start), nkiov, kiov);
+		pid->nid = nd->nd_id.nid;
+		pid->pid = nd->nd_id.pid;
+		i++;
+	}
+
+	return 0;
+}
+
+int
+lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req)
+{
+	test_ping_req_t *prq = &req->tsr_u.ping;
+
+	prq->png_size   = param->png_size;
+	prq->png_flags  = param->png_flags;
+	/* TODO dest */
+	return 0;
+}
+
+int
+lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
+
+	brq->blk_opc    = param->blk_opc;
+	brq->blk_npg    = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+	brq->blk_flags  = param->blk_flags;
+
+	return 0;
+}
+
+int
+lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
+{
+	test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1;
+
+	brq->blk_opc	= param->blk_opc;
+	brq->blk_flags	= param->blk_flags;
+	brq->blk_len	= param->blk_size;
+	brq->blk_offset	= 0; /* reserved */
+
+	return 0;
+}
+
+int
+lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
+		    lstcon_test_t *test, lstcon_rpc_t **crpc)
+{
+	lstcon_group_t    *sgrp = test->tes_src_grp;
+	lstcon_group_t    *dgrp = test->tes_dst_grp;
+	srpc_test_reqst_t *trq;
+	srpc_bulk_t       *bulk;
+	int		i;
+	int		   npg = 0;
+	int		   nob = 0;
+	int		   rc  = 0;
+
+	if (transop == LST_TRANS_TSBCLIADD) {
+		npg = sfw_id_pages(test->tes_span);
+		nob = (feats & LST_FEAT_BULK_LEN) == 0 ?
+		      npg * PAGE_CACHE_SIZE :
+		      sizeof(lnet_process_id_packed_t) * test->tes_span;
+	}
+
+	rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc);
+	if (rc != 0)
+		return rc;
+
+	trq  = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst;
+
+	if (transop == LST_TRANS_TSBSRVADD) {
+		int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist;
+		int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span;
+		int nmax = (ndist + nspan - 1) / nspan;
+
+		trq->tsr_ndest = 0;
+		trq->tsr_loop  = nmax * test->tes_dist * test->tes_concur;
+
+	} else {
+		bulk = &(*crpc)->crp_rpc->crpc_bulk;
+
+		for (i = 0; i < npg; i++) {
+			int	len;
+
+			LASSERT(nob > 0);
+
+			len = (feats & LST_FEAT_BULK_LEN) == 0 ?
+			      PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE);
+			nob -= len;
+
+			bulk->bk_iovs[i].kiov_offset = 0;
+			bulk->bk_iovs[i].kiov_len    = len;
+			bulk->bk_iovs[i].kiov_page   =
+				alloc_page(GFP_IOFS);
+
+			if (bulk->bk_iovs[i].kiov_page == NULL) {
+				lstcon_rpc_put(*crpc);
+				return -ENOMEM;
+			}
+		}
+
+		bulk->bk_sink = 0;
+
+		LASSERT (transop == LST_TRANS_TSBCLIADD);
+
+		rc = lstcon_dstnodes_prep(test->tes_dst_grp,
+					  test->tes_cliidx++,
+					  test->tes_dist,
+					  test->tes_span,
+					  npg, &bulk->bk_iovs[0]);
+		if (rc != 0) {
+			lstcon_rpc_put(*crpc);
+			return rc;
+		}
+
+		trq->tsr_ndest = test->tes_span;
+		trq->tsr_loop  = test->tes_loop;
+	}
+
+	trq->tsr_sid	= console_session.ses_id;
+	trq->tsr_bid	= test->tes_hdr.tsb_id;
+	trq->tsr_concur     = test->tes_concur;
+	trq->tsr_is_client  = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0;
+	trq->tsr_stop_onerr = !!test->tes_stop_onerr;
+
+	switch (test->tes_type) {
+	case LST_TEST_PING:
+		trq->tsr_service = SRPC_SERVICE_PING;
+		rc = lstcon_pingrpc_prep((lst_test_ping_param_t *)
+					 &test->tes_param[0], trq);
+		break;
+
+	case LST_TEST_BULK:
+		trq->tsr_service = SRPC_SERVICE_BRW;
+		if ((feats & LST_FEAT_BULK_LEN) == 0) {
+			rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		} else {
+			rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *)
+						    &test->tes_param[0], trq);
+		}
+
+		break;
+	default:
+		LBUG();
+		break;
+	}
+
+	return rc;
+}
+
+int
+lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans,
+			 lstcon_node_t *nd, srpc_msg_t *reply)
+{
+	srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply;
+	int		   status   = mksn_rep->mksn_status;
+
+	if (status == 0 &&
+	    (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		mksn_rep->mksn_status = EPROTO;
+		status = EPROTO;
+	}
+
+	if (status == EPROTO) {
+		CNETERR("session protocol error from %s: %u\n",
+			libcfs_nid2str(nd->nd_id.nid),
+			reply->msg_ses_feats);
+	}
+
+	if (status != 0)
+		return status;
+
+	if (!trans->tas_feats_updated) {
+		trans->tas_feats_updated = 1;
+		trans->tas_features = reply->msg_ses_feats;
+	}
+
+	if (reply->msg_ses_feats != trans->tas_features) {
+		CNETERR("Framework features %x from %s is different with "
+			"features on this transaction: %x\n",
+			 reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid),
+			 trans->tas_features);
+		status = mksn_rep->mksn_status = EPROTO;
+	}
+
+	if (status == 0) {
+		/* session timeout on remote node */
+		nd->nd_timeout = mksn_rep->mksn_timeout;
+	}
+
+	return status;
+}
+
+void
+lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg,
+		      lstcon_node_t *nd, lstcon_trans_stat_t *stat)
+{
+	srpc_rmsn_reply_t  *rmsn_rep;
+	srpc_debug_reply_t *dbg_rep;
+	srpc_batch_reply_t *bat_rep;
+	srpc_test_reply_t  *test_rep;
+	srpc_stat_reply_t  *stat_rep;
+	int		 rc = 0;
+
+	switch (trans->tas_opc) {
+	case LST_TRANS_SESNEW:
+		rc = lstcon_sesnew_stat_reply(trans, nd, msg);
+		if (rc == 0) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		break;
+
+	case LST_TRANS_SESEND:
+		rmsn_rep = &msg->msg_body.rmsn_reply;
+		/* ESRCH is not an error for end session */
+		if (rmsn_rep->rmsn_status == 0 ||
+		    rmsn_rep->rmsn_status == ESRCH) {
+			lstcon_sesop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_sesop_stat_failure(stat, 1);
+		rc = rmsn_rep->rmsn_status;
+		break;
+
+	case LST_TRANS_SESQRY:
+	case LST_TRANS_SESPING:
+		dbg_rep = &msg->msg_body.dbg_reply;
+
+		if (dbg_rep->dbg_status == ESRCH) {
+			lstcon_sesqry_stat_unknown(stat, 1);
+			return;
+		}
+
+		if (lstcon_session_match(dbg_rep->dbg_sid))
+			lstcon_sesqry_stat_active(stat, 1);
+		else
+			lstcon_sesqry_stat_busy(stat, 1);
+		return;
+
+	case LST_TRANS_TSBRUN:
+	case LST_TRANS_TSBSTOP:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		if (bat_rep->bar_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		if (bat_rep->bar_status == EPERM &&
+		    trans->tas_opc == LST_TRANS_TSBSTOP) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		bat_rep = &msg->msg_body.bat_reply;
+
+		if (bat_rep->bar_active != 0)
+			lstcon_tsbqry_stat_run(stat, 1);
+		else
+			lstcon_tsbqry_stat_idle(stat, 1);
+
+		if (bat_rep->bar_status == 0)
+			return;
+
+		lstcon_tsbqry_stat_failure(stat, 1);
+		rc = bat_rep->bar_status;
+		break;
+
+	case LST_TRANS_TSBCLIADD:
+	case LST_TRANS_TSBSRVADD:
+		test_rep = &msg->msg_body.tes_reply;
+
+		if (test_rep->tsr_status == 0) {
+			lstcon_tsbop_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_tsbop_stat_failure(stat, 1);
+		rc = test_rep->tsr_status;
+		break;
+
+	case LST_TRANS_STATQRY:
+		stat_rep = &msg->msg_body.stat_reply;
+
+		if (stat_rep->str_status == 0) {
+			lstcon_statqry_stat_success(stat, 1);
+			return;
+		}
+
+		lstcon_statqry_stat_failure(stat, 1);
+		rc = stat_rep->str_status;
+		break;
+
+	default:
+		LBUG();
+	}
+
+	if (stat->trs_fwk_errno == 0)
+		stat->trs_fwk_errno = rc;
+
+	return;
+}
+
+int
+lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			struct list_head *translist, int transop,
+			void *arg, lstcon_rpc_cond_func_t condition,
+			lstcon_rpc_trans_t **transpp)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	lstcon_rpc_t       *rpc;
+	unsigned	    feats;
+	int		 rc;
+
+	/* Creating session RPG for list of nodes */
+
+	rc = lstcon_rpc_trans_prep(translist, transop, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction %d: %d\n", transop, rc);
+		return rc;
+	}
+
+	feats = trans->tas_features;
+	list_for_each_entry(ndl, ndlist, ndl_link) {
+		rc = condition == NULL ? 1 :
+		     condition(transop, ndl->ndl_node, arg);
+
+		if (rc == 0)
+			continue;
+
+		if (rc < 0) {
+			CDEBUG(D_NET, "Condition error while creating RPC "
+				      " for transaction %d: %d\n", transop, rc);
+			break;
+		}
+
+		nd = ndl->ndl_node;
+
+		switch (transop) {
+		case LST_TRANS_SESNEW:
+		case LST_TRANS_SESEND:
+			rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc);
+			break;
+		case LST_TRANS_SESQRY:
+		case LST_TRANS_SESPING:
+			rc = lstcon_dbgrpc_prep(nd, feats, &rpc);
+			break;
+		case LST_TRANS_TSBCLIADD:
+		case LST_TRANS_TSBSRVADD:
+			rc = lstcon_testrpc_prep(nd, transop, feats,
+						 (lstcon_test_t *)arg, &rpc);
+			break;
+		case LST_TRANS_TSBRUN:
+		case LST_TRANS_TSBSTOP:
+		case LST_TRANS_TSBCLIQRY:
+		case LST_TRANS_TSBSRVQRY:
+			rc = lstcon_batrpc_prep(nd, transop, feats,
+						(lstcon_tsb_hdr_t *)arg, &rpc);
+			break;
+		case LST_TRANS_STATQRY:
+			rc = lstcon_statrpc_prep(nd, feats, &rpc);
+			break;
+		default:
+			rc = -EINVAL;
+			break;
+		}
+
+		if (rc != 0) {
+			CERROR("Failed to create RPC for transaction %s: %d\n",
+			       lstcon_rpc_trans_name(transop), rc);
+			break;
+		}
+
+		lstcon_rpc_trans_addreq(trans, rpc);
+	}
+
+	if (rc == 0) {
+		*transpp = trans;
+		return 0;
+	}
+
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+void
+lstcon_rpc_pinger(void *arg)
+{
+	stt_timer_t	*ptimer = (stt_timer_t *)arg;
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	srpc_msg_t	 *rep;
+	srpc_debug_reqst_t *drq;
+	lstcon_ndlink_t    *ndl;
+	lstcon_node_t      *nd;
+	time_t	      intv;
+	int		 count = 0;
+	int		 rc;
+
+	/* RPC pinger is a special case of transaction,
+	 * it's called by timer at 8 seconds interval.
+	 */
+	mutex_lock(&console_session.ses_mutex);
+
+	if (console_session.ses_shutdown || console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	if (!console_session.ses_expired &&
+	    cfs_time_current_sec() - console_session.ses_laststamp >
+	    (time_t)console_session.ses_timeout)
+		console_session.ses_expired = 1;
+
+	trans = console_session.ses_ping;
+
+	LASSERT (trans != NULL);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) {
+		nd = ndl->ndl_node;
+
+		if (console_session.ses_expired) {
+			/* idle console, end session on all nodes */
+			if (nd->nd_state != LST_NODE_ACTIVE)
+				continue;
+
+			rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND,
+						trans->tas_features, &crpc);
+			if (rc != 0) {
+				CERROR("Out of memory\n");
+				break;
+			}
+
+			lstcon_rpc_trans_addreq(trans, crpc);
+			lstcon_rpc_post(crpc);
+
+			continue;
+		}
+
+		crpc = &nd->nd_ping;
+
+		if (crpc->crp_rpc != NULL) {
+			LASSERT (crpc->crp_trans == trans);
+			LASSERT (!list_empty(&crpc->crp_link));
+
+			spin_lock(&crpc->crp_rpc->crpc_lock);
+
+			LASSERT(crpc->crp_posted);
+
+			if (!crpc->crp_finished) {
+				/* in flight */
+				spin_unlock(&crpc->crp_rpc->crpc_lock);
+				continue;
+			}
+
+			spin_unlock(&crpc->crp_rpc->crpc_lock);
+
+			lstcon_rpc_get_reply(crpc, &rep);
+
+			list_del_init(&crpc->crp_link);
+
+			lstcon_rpc_put(crpc);
+		}
+
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			continue;
+
+		intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(),
+						     nd->nd_stamp));
+		if (intv < (time_t)nd->nd_timeout / 2)
+			continue;
+
+		rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG,
+				     trans->tas_features, 0, 0, 1, crpc);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			break;
+		}
+
+		drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst;
+
+		drq->dbg_sid   = console_session.ses_id;
+		drq->dbg_flags = 0;
+
+		lstcon_rpc_trans_addreq(trans, crpc);
+		lstcon_rpc_post(crpc);
+
+		count ++;
+	}
+
+	if (console_session.ses_expired) {
+		mutex_unlock(&console_session.ses_mutex);
+		return;
+	}
+
+	CDEBUG(D_NET, "Ping %d nodes in session\n", count);
+
+	ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+	stt_add_timer(ptimer);
+
+	mutex_unlock(&console_session.ses_mutex);
+}
+
+int
+lstcon_rpc_pinger_start(void)
+{
+	stt_timer_t    *ptimer;
+	int	     rc;
+
+	LASSERT (list_empty(&console_session.ses_rpc_freelist));
+	LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+
+	rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING,
+				   &console_session.ses_ping);
+	if (rc != 0) {
+		CERROR("Failed to create console pinger\n");
+		return rc;
+	}
+
+	ptimer = &console_session.ses_ping_timer;
+	ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL);
+
+	stt_add_timer(ptimer);
+
+	return 0;
+}
+
+void
+lstcon_rpc_pinger_stop(void)
+{
+	LASSERT (console_session.ses_shutdown);
+
+	stt_del_timer(&console_session.ses_ping_timer);
+
+	lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN);
+	lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat());
+	lstcon_rpc_trans_destroy(console_session.ses_ping);
+
+	memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t));
+
+	console_session.ses_ping = NULL;
+}
+
+void
+lstcon_rpc_cleanup_wait(void)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_rpc_t       *crpc;
+	struct list_head	 *pacer;
+	struct list_head	  zlist;
+
+	/* Called with hold of global mutex */
+
+	LASSERT (console_session.ses_shutdown);
+
+	while (!list_empty(&console_session.ses_trans_list)) {
+		list_for_each(pacer, &console_session.ses_trans_list) {
+			trans = list_entry(pacer, lstcon_rpc_trans_t,
+					       tas_link);
+
+			CDEBUG(D_NET, "Session closed, wakeup transaction %s\n",
+			       lstcon_rpc_trans_name(trans->tas_opc));
+
+			wake_up(&trans->tas_waitq);
+		}
+
+		mutex_unlock(&console_session.ses_mutex);
+
+		CWARN("Session is shutting down, "
+		      "waiting for termination of transactions\n");
+		cfs_pause(cfs_time_seconds(1));
+
+		mutex_lock(&console_session.ses_mutex);
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0),
+		       console_session.ses_rpc_lock,
+		       "Network is not accessable or target is down, "
+		       "waiting for %d console RPCs to being recycled\n",
+		       atomic_read(&console_session.ses_rpc_counter));
+
+	list_add(&zlist, &console_session.ses_rpc_freelist);
+	list_del_init(&console_session.ses_rpc_freelist);
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	while (!list_empty(&zlist)) {
+		crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link);
+
+		list_del(&crpc->crp_link);
+		LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t));
+	}
+}
+
+int
+lstcon_rpc_module_init(void)
+{
+	INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list);
+	console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger;
+	console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer;
+
+	console_session.ses_ping = NULL;
+
+	spin_lock_init(&console_session.ses_rpc_lock);
+	atomic_set(&console_session.ses_rpc_counter, 0);
+	INIT_LIST_HEAD(&console_session.ses_rpc_freelist);
+
+	return 0;
+}
+
+void
+lstcon_rpc_module_fini(void)
+{
+	LASSERT (list_empty(&console_session.ses_rpc_freelist));
+	LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h
new file mode 100644
index 000000000000..9aba24a2eab9
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.h
@@ -0,0 +1,146 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * /lnet/selftest/conrpc.h
+ *
+ * Console rpc
+ *
+ * Author: Liang Zhen <liang@whamcloud.com>
+ */
+
+#ifndef __LST_CONRPC_H__
+#define __LST_CONRPC_H__
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "rpc.h"
+#include "selftest.h"
+
+/* Console rpc and rpc transaction */
+#define LST_TRANS_TIMEOUT       30
+#define LST_TRANS_MIN_TIMEOUT   3
+
+#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT)
+
+#define LST_PING_INTERVAL       8
+
+struct lstcon_rpc_trans;
+struct lstcon_tsb_hdr;
+struct lstcon_test;
+struct lstcon_node;
+
+typedef struct lstcon_rpc {
+	struct list_head	       crp_link;       /* chain on rpc transaction */
+	srpc_client_rpc_t       *crp_rpc;	/* client rpc */
+	struct lstcon_node      *crp_node;       /* destination node */
+	struct lstcon_rpc_trans *crp_trans;     /* conrpc transaction */
+
+	unsigned int		 crp_posted:1;   /* rpc is posted */
+	unsigned int		 crp_finished:1; /* rpc is finished */
+	unsigned int		 crp_unpacked:1; /* reply is unpacked */
+	/** RPC is embedded in other structure and can't free it */
+	unsigned int		 crp_embedded:1;
+	int		      crp_status;     /* console rpc errors */
+	cfs_time_t	       crp_stamp;      /* replied time stamp */
+} lstcon_rpc_t;
+
+typedef struct lstcon_rpc_trans {
+	struct list_head	    tas_olink;     /* link chain on owner list */
+	struct list_head	    tas_link;      /* link chain on global list */
+	int		   tas_opc;       /* operation code of transaction */
+	/* features mask is uptodate */
+	unsigned	      tas_feats_updated;
+	/* test features mask */
+	unsigned	      tas_features;
+	wait_queue_head_t	   tas_waitq;     /* wait queue head */
+	atomic_t	  tas_remaining; /* # of un-scheduled rpcs */
+	struct list_head	    tas_rpcs_list; /* queued requests */
+} lstcon_rpc_trans_t;
+
+#define LST_TRANS_PRIVATE       0x1000
+
+#define LST_TRANS_SESNEW	(LST_TRANS_PRIVATE | 0x01)
+#define LST_TRANS_SESEND	(LST_TRANS_PRIVATE | 0x02)
+#define LST_TRANS_SESQRY	0x03
+#define LST_TRANS_SESPING       0x04
+
+#define LST_TRANS_TSBCLIADD     (LST_TRANS_PRIVATE | 0x11)
+#define LST_TRANS_TSBSRVADD     (LST_TRANS_PRIVATE | 0x12)
+#define LST_TRANS_TSBRUN	(LST_TRANS_PRIVATE | 0x13)
+#define LST_TRANS_TSBSTOP       (LST_TRANS_PRIVATE | 0x14)
+#define LST_TRANS_TSBCLIQRY     0x15
+#define LST_TRANS_TSBSRVQRY     0x16
+
+#define LST_TRANS_STATQRY       0x21
+
+typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *);
+typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *);
+
+int  lstcon_sesrpc_prep(struct lstcon_node *nd, int transop,
+			unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_dbgrpc_prep(struct lstcon_node *nd,
+			unsigned version, lstcon_rpc_t **crpc);
+int  lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+			struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc);
+int  lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version,
+			 struct lstcon_test *test, lstcon_rpc_t **crpc);
+int  lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version,
+			 lstcon_rpc_t **crpc);
+void lstcon_rpc_put(lstcon_rpc_t *crpc);
+int  lstcon_rpc_trans_prep(struct list_head *translist,
+			   int transop, lstcon_rpc_trans_t **transpp);
+int  lstcon_rpc_trans_ndlist(struct list_head *ndlist,
+			     struct list_head *translist, int transop,
+			     void *arg, lstcon_rpc_cond_func_t condition,
+			     lstcon_rpc_trans_t **transpp);
+void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans,
+			   lstcon_trans_stat_t *stat);
+int  lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans,
+				  struct list_head *head_up,
+				  lstcon_rpc_readent_func_t readent);
+void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error);
+void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans);
+void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req);
+int  lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout);
+int  lstcon_rpc_pinger_start(void);
+void lstcon_rpc_pinger_stop(void);
+void lstcon_rpc_cleanup_wait(void);
+int  lstcon_rpc_module_init(void);
+void lstcon_rpc_module_fini(void);
+
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c
new file mode 100644
index 000000000000..78e8d0467267
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.c
@@ -0,0 +1,2071 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Infrastructure of LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+#include "console.h"
+#include "conrpc.h"
+
+#define LST_NODE_STATE_COUNTER(nd, p)		   \
+do {						    \
+	if ((nd)->nd_state == LST_NODE_ACTIVE)	  \
+		(p)->nle_nactive ++;		    \
+	else if ((nd)->nd_state == LST_NODE_BUSY)       \
+		(p)->nle_nbusy ++;		      \
+	else if ((nd)->nd_state == LST_NODE_DOWN)       \
+		(p)->nle_ndown ++;		      \
+	else					    \
+		(p)->nle_nunknown ++;		   \
+	(p)->nle_nnode ++;			      \
+} while (0)
+
+lstcon_session_t	console_session;
+
+void
+lstcon_node_get(lstcon_node_t *nd)
+{
+	LASSERT (nd->nd_ref >= 1);
+
+	nd->nd_ref++;
+}
+
+static int
+lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create)
+{
+	lstcon_ndlink_t *ndl;
+	unsigned int     idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE;
+
+	LASSERT (id.nid != LNET_NID_ANY);
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) {
+		if (ndl->ndl_node->nd_id.nid != id.nid ||
+		    ndl->ndl_node->nd_id.pid != id.pid)
+			continue;
+
+		lstcon_node_get(ndl->ndl_node);
+		*ndpp = ndl->ndl_node;
+		return 0;
+	}
+
+	if (!create)
+		return -ENOENT;
+
+	LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+	if (*ndpp == NULL)
+		return -ENOMEM;
+
+	ndl = (lstcon_ndlink_t *)(*ndpp + 1);
+
+	ndl->ndl_node = *ndpp;
+
+	ndl->ndl_node->nd_ref   = 1;
+	ndl->ndl_node->nd_id    = id;
+	ndl->ndl_node->nd_stamp = cfs_time_current();
+	ndl->ndl_node->nd_state = LST_NODE_UNKNOWN;
+	ndl->ndl_node->nd_timeout = 0;
+	memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t));
+
+	/* queued in global hash & list, no refcount is taken by
+	 * global hash & list, if caller release his refcount,
+	 * node will be released */
+	list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]);
+	list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list);
+
+	return 0;
+}
+
+void
+lstcon_node_put(lstcon_node_t *nd)
+{
+	lstcon_ndlink_t  *ndl;
+
+	LASSERT (nd->nd_ref > 0);
+
+	if (--nd->nd_ref > 0)
+		return;
+
+	ndl = (lstcon_ndlink_t *)(nd + 1);
+
+	LASSERT (!list_empty(&ndl->ndl_link));
+	LASSERT (!list_empty(&ndl->ndl_hlink));
+
+	/* remove from session */
+	list_del(&ndl->ndl_link);
+	list_del(&ndl->ndl_hlink);
+
+	LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t));
+}
+
+static int
+lstcon_ndlink_find(struct list_head *hash,
+		   lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create)
+{
+	unsigned int     idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+	lstcon_ndlink_t *ndl;
+	lstcon_node_t   *nd;
+	int	      rc;
+
+	if (id.nid == LNET_NID_ANY)
+		return -EINVAL;
+
+	/* search in hash */
+	list_for_each_entry(ndl, &hash[idx], ndl_hlink) {
+		if (ndl->ndl_node->nd_id.nid != id.nid ||
+		    ndl->ndl_node->nd_id.pid != id.pid)
+			continue;
+
+		*ndlpp = ndl;
+		return 0;
+	}
+
+	if (create == 0)
+		return -ENOENT;
+
+	/* find or create in session hash */
+	rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0);
+	if (rc != 0)
+		return rc;
+
+	LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t));
+	if (ndl == NULL) {
+		lstcon_node_put(nd);
+		return -ENOMEM;
+	}
+
+	*ndlpp = ndl;
+
+	ndl->ndl_node = nd;
+	INIT_LIST_HEAD(&ndl->ndl_link);
+	list_add_tail(&ndl->ndl_hlink, &hash[idx]);
+
+	return  0;
+}
+
+static void
+lstcon_ndlink_release(lstcon_ndlink_t *ndl)
+{
+	LASSERT (list_empty(&ndl->ndl_link));
+	LASSERT (!list_empty(&ndl->ndl_hlink));
+
+	list_del(&ndl->ndl_hlink); /* delete from hash */
+	lstcon_node_put(ndl->ndl_node);
+
+	LIBCFS_FREE(ndl, sizeof(*ndl));
+}
+
+static int
+lstcon_group_alloc(char *name, lstcon_group_t **grpp)
+{
+	lstcon_group_t *grp;
+	int	     i;
+
+	LIBCFS_ALLOC(grp, offsetof(lstcon_group_t,
+				   grp_ndl_hash[LST_NODE_HASHSIZE]));
+	if (grp == NULL)
+		return -ENOMEM;
+
+	memset(grp, 0, offsetof(lstcon_group_t,
+				grp_ndl_hash[LST_NODE_HASHSIZE]));
+
+	grp->grp_ref = 1;
+	if (name != NULL)
+		strcpy(grp->grp_name, name);
+
+	INIT_LIST_HEAD(&grp->grp_link);
+	INIT_LIST_HEAD(&grp->grp_ndl_list);
+	INIT_LIST_HEAD(&grp->grp_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++)
+		INIT_LIST_HEAD(&grp->grp_ndl_hash[i]);
+
+	*grpp = grp;
+
+	return 0;
+}
+
+static void
+lstcon_group_addref(lstcon_group_t *grp)
+{
+	grp->grp_ref ++;
+}
+
+static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *);
+
+static void
+lstcon_group_drain(lstcon_group_t *grp, int keep)
+{
+	lstcon_ndlink_t *ndl;
+	lstcon_ndlink_t *tmp;
+
+	list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) {
+		if ((ndl->ndl_node->nd_state & keep) == 0)
+			lstcon_group_ndlink_release(grp, ndl);
+	}
+}
+
+static void
+lstcon_group_decref(lstcon_group_t *grp)
+{
+	int     i;
+
+	if (--grp->grp_ref > 0)
+		return;
+
+	if (!list_empty(&grp->grp_link))
+		list_del(&grp->grp_link);
+
+	lstcon_group_drain(grp, 0);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&grp->grp_ndl_hash[i]));
+	}
+
+	LIBCFS_FREE(grp, offsetof(lstcon_group_t,
+				  grp_ndl_hash[LST_NODE_HASHSIZE]));
+}
+
+static int
+lstcon_group_find(char *name, lstcon_group_t **grpp)
+{
+	lstcon_group_t   *grp;
+
+	list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+		if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0)
+			continue;
+
+		lstcon_group_addref(grp);  /* +1 ref for caller */
+		*grpp = grp;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static void
+lstcon_group_put(lstcon_group_t *grp)
+{
+	lstcon_group_decref(grp);
+}
+
+static int
+lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id,
+			 lstcon_ndlink_t **ndlpp, int create)
+{
+	int     rc;
+
+	rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create);
+	if (rc != 0)
+		return rc;
+
+	if (!list_empty(&(*ndlpp)->ndl_link))
+		return 0;
+
+	list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list);
+	grp->grp_nnode ++;
+
+	return 0;
+}
+
+static void
+lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl)
+{
+	list_del_init(&ndl->ndl_link);
+	lstcon_ndlink_release(ndl);
+	grp->grp_nnode --;
+}
+
+static void
+lstcon_group_ndlink_move(lstcon_group_t *old,
+			 lstcon_group_t *new, lstcon_ndlink_t *ndl)
+{
+	unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) %
+			   LST_NODE_HASHSIZE;
+
+	list_del(&ndl->ndl_hlink);
+	list_del(&ndl->ndl_link);
+	old->grp_nnode --;
+
+	list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]);
+	list_add_tail(&ndl->ndl_link, &new->grp_ndl_list);
+	new->grp_nnode ++;
+
+	return;
+}
+
+static void
+lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new)
+{
+	lstcon_ndlink_t *ndl;
+
+	while (!list_empty(&old->grp_ndl_list)) {
+		ndl = list_entry(old->grp_ndl_list.next,
+				     lstcon_ndlink_t, ndl_link);
+		lstcon_group_ndlink_move(old, new, ndl);
+	}
+}
+
+int
+lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	lstcon_group_t *grp = (lstcon_group_t *)arg;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+		if (nd->nd_state == LST_NODE_ACTIVE)
+			return 0;
+		break;
+
+	case LST_TRANS_SESEND:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+
+		if (grp != NULL && nd->nd_ref > 1)
+			return 0;
+		break;
+
+	case LST_TRANS_SESQRY:
+		break;
+
+	default:
+		LBUG();
+	}
+
+	return 1;
+}
+
+int
+lstcon_sesrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_debug_reply_t *rep;
+
+	switch (transop) {
+	case LST_TRANS_SESNEW:
+	case LST_TRANS_SESEND:
+		return 0;
+
+	case LST_TRANS_SESQRY:
+		rep = &msg->msg_body.dbg_reply;
+
+		if (copy_to_user(&ent_up->rpe_priv[0],
+				     &rep->dbg_timeout, sizeof(int)) ||
+		    copy_to_user(&ent_up->rpe_payload[0],
+				     &rep->dbg_name, LST_NAME_SIZE))
+			return -EFAULT;
+
+		return 0;
+
+	default:
+		LBUG();
+	}
+
+	return 0;
+}
+
+static int
+lstcon_group_nodes_add(lstcon_group_t *grp,
+		       int count, lnet_process_id_t *ids_up,
+		       unsigned *featp, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t      *trans;
+	lstcon_ndlink_t	 *ndl;
+	lstcon_group_t	  *tmp;
+	lnet_process_id_t	id;
+	int		      i;
+	int		      rc;
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0 ; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* skip if it's in this group already */
+		rc = lstcon_group_ndlink_find(grp, id, &ndl, 0);
+		if (rc == 0)
+			continue;
+
+		/* add to tmp group */
+		rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1);
+		if (rc != 0) {
+			CERROR("Can't create ndlink, out of memory\n");
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESNEW,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	/* post all RPCs */
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	*featp = trans->tas_features;
+
+	/* destroy all RPGs */
+	lstcon_rpc_trans_destroy(trans);
+
+	lstcon_group_move(tmp, grp);
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+static int
+lstcon_group_nodes_remove(lstcon_group_t *grp,
+			  int count, lnet_process_id_t *ids_up,
+			  struct list_head *result_up)
+{
+	lstcon_rpc_trans_t     *trans;
+	lstcon_ndlink_t	*ndl;
+	lstcon_group_t	 *tmp;
+	lnet_process_id_t       id;
+	int		     rc;
+	int		     i;
+
+	/* End session and remove node from the group */
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			goto error;
+		}
+
+		/* move node to tmp group */
+		if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0)
+			lstcon_group_ndlink_move(grp, tmp, ndl);
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list,
+				     &tmp->grp_trans_list, LST_TRANS_SESEND,
+				     tmp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		goto error;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* release nodes anyway, because we can't rollback status */
+	lstcon_group_put(tmp);
+
+	return rc;
+error:
+	lstcon_group_move(tmp, grp);
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+int
+lstcon_group_add(char *name)
+{
+	lstcon_group_t *grp;
+	int	     rc;
+
+	rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0;
+	if (rc != 0) {
+		/* find a group with same name */
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	rc = lstcon_group_alloc(name, &grp);
+	if (rc != 0) {
+		CERROR("Can't allocate descriptor for group %s\n", name);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&grp->grp_link, &console_session.ses_grp_list);
+
+	return rc;
+}
+
+int
+lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up,
+		 unsigned *featp, struct list_head *result_up)
+{
+	lstcon_group_t	 *grp;
+	int		     rc;
+
+	LASSERT (count > 0);
+	LASSERT (ids_up != NULL);
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by other threads or test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_del(char *name)
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	int		 rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by others threads or test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESEND,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+
+	lstcon_group_put(grp);
+	/* -ref for session, it's destroyed,
+	 * status can't be rolled back, destroy group anway */
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_clean(char *name, int args)
+{
+	lstcon_group_t *grp = NULL;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	args = (LST_NODE_ACTIVE | LST_NODE_BUSY |
+		LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args;
+
+	lstcon_group_drain(grp, args);
+
+	lstcon_group_put(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return 0;
+}
+
+int
+lstcon_nodes_remove(char *name, int count,
+		    lnet_process_id_t *ids_up, struct list_head *result_up)
+{
+	lstcon_group_t *grp = NULL;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up);
+
+	lstcon_group_put(grp);
+	/* release empty group */
+	if (list_empty(&grp->grp_ndl_list))
+		lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_refresh(char *name, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t      *trans;
+	lstcon_group_t	  *grp;
+	int		      rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group: %s\n", name);
+		return rc;
+	}
+
+	if (grp->grp_ref > 2) {
+		/* referred by test */
+		CDEBUG(D_NET, "Group %s is busy\n", name);
+		lstcon_group_put(grp);
+		return -EBUSY;
+	}
+
+	/* re-invite all inactive nodes int the group */
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &grp->grp_trans_list, LST_TRANS_SESNEW,
+				     grp, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		/* local error, return */
+		CDEBUG(D_NET, "Can't create transaction: %d\n", rc);
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* -ref for me */
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_group_list(int index, int len, char *name_up)
+{
+	lstcon_group_t *grp;
+
+	LASSERT (index >= 0);
+	LASSERT (name_up != NULL);
+
+	list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up, grp->grp_name, len) ?
+			       -EFAULT : 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int
+lstcon_nodes_getent(struct list_head *head, int *index_p,
+		    int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlink_t  *ndl;
+	lstcon_node_t    *nd;
+	int	       count = 0;
+	int	       index = 0;
+
+	LASSERT (index_p != NULL && count_p != NULL);
+	LASSERT (dents_up != NULL);
+	LASSERT (*index_p >= 0);
+	LASSERT (*count_p > 0);
+
+	list_for_each_entry(ndl, head, ndl_link) {
+		if (index++ < *index_p)
+			continue;
+
+		if (count >= *count_p)
+			break;
+
+		nd = ndl->ndl_node;
+		if (copy_to_user(&dents_up[count].nde_id,
+				     &nd->nd_id, sizeof(nd->nd_id)) ||
+		    copy_to_user(&dents_up[count].nde_state,
+				     &nd->nd_state, sizeof(nd->nd_state)))
+			return -EFAULT;
+
+		count ++;
+	}
+
+	if (index <= *index_p)
+		return -ENOENT;
+
+	*count_p = count;
+	*index_p = index;
+
+	return 0;
+}
+
+int
+lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p,
+		  int *index_p, int *count_p, lstcon_node_ent_t *dents_up)
+{
+	lstcon_ndlist_ent_t *gentp;
+	lstcon_group_t      *grp;
+	lstcon_ndlink_t     *ndl;
+	int		  rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", name);
+		return rc;
+	}
+
+	if (dents_up != 0) {
+		/* verbose query */
+		rc = lstcon_nodes_getent(&grp->grp_ndl_list,
+					 index_p, count_p, dents_up);
+		lstcon_group_put(grp);
+
+		return rc;
+	}
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t));
+	if (gentp == NULL) {
+		CERROR("Can't allocate ndlist_ent\n");
+		lstcon_group_put(grp);
+
+		return -ENOMEM;
+	}
+
+	memset(gentp, 0, sizeof(lstcon_ndlist_ent_t));
+
+	list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp);
+
+	rc = copy_to_user(gents_p, gentp,
+			      sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0;
+
+	LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t));
+
+	lstcon_group_put(grp);
+
+	return 0;
+}
+
+int
+lstcon_batch_find(char *name, lstcon_batch_t **batpp)
+{
+	lstcon_batch_t   *bat;
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) {
+			*batpp = bat;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_batch_add(char *name)
+{
+	lstcon_batch_t   *bat;
+	int	       i;
+	int	       rc;
+
+	rc = (lstcon_batch_find(name, &bat) == 0)? -EEXIST: 0;
+	if (rc != 0) {
+		CDEBUG(D_NET, "Batch %s already exists\n", name);
+		return rc;
+	}
+
+	LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t));
+	if (bat == NULL) {
+		CERROR("Can't allocate descriptor for batch %s\n", name);
+		return -ENOMEM;
+	}
+
+	LIBCFS_ALLOC(bat->bat_cli_hash,
+		     sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	if (bat->bat_cli_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+		return -ENOMEM;
+	}
+
+	LIBCFS_ALLOC(bat->bat_srv_hash,
+		     sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	if (bat->bat_srv_hash == NULL) {
+		CERROR("Can't allocate hash for batch %s\n", name);
+		LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE);
+		LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+
+		return -ENOMEM;
+	}
+
+	strcpy(bat->bat_name, name);
+	bat->bat_hdr.tsb_index = 0;
+	bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie;
+
+	bat->bat_ntest = 0;
+	bat->bat_state = LST_BATCH_IDLE;
+
+	INIT_LIST_HEAD(&bat->bat_cli_list);
+	INIT_LIST_HEAD(&bat->bat_srv_list);
+	INIT_LIST_HEAD(&bat->bat_test_list);
+	INIT_LIST_HEAD(&bat->bat_trans_list);
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		INIT_LIST_HEAD(&bat->bat_cli_hash[i]);
+		INIT_LIST_HEAD(&bat->bat_srv_hash[i]);
+	}
+
+	list_add_tail(&bat->bat_link, &console_session.ses_bat_list);
+
+	return rc;
+}
+
+int
+lstcon_batch_list(int index, int len, char *name_up)
+{
+	lstcon_batch_t    *bat;
+
+	LASSERT (name_up != NULL);
+	LASSERT (index >= 0);
+
+	list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) {
+		if (index-- == 0) {
+			return copy_to_user(name_up,bat->bat_name, len) ?
+			       -EFAULT: 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server,
+		  int testidx, int *index_p, int *ndent_p,
+		  lstcon_node_ent_t *dents_up)
+{
+	lstcon_test_batch_ent_t *entp;
+	struct list_head	      *clilst;
+	struct list_head	      *srvlst;
+	lstcon_test_t	   *test = NULL;
+	lstcon_batch_t	  *bat;
+	lstcon_ndlink_t	 *ndl;
+	int		      rc;
+
+	rc = lstcon_batch_find(name, &bat);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	if (testidx > 0) {
+		/* query test, test index start from 1 */
+		list_for_each_entry(test, &bat->bat_test_list, tes_link) {
+			if (testidx-- == 1)
+				break;
+		}
+
+		if (testidx > 0) {
+			CDEBUG(D_NET, "Can't find specified test in batch\n");
+			return -ENOENT;
+		}
+	}
+
+	clilst = (test == NULL) ? &bat->bat_cli_list :
+				  &test->tes_src_grp->grp_ndl_list;
+	srvlst = (test == NULL) ? &bat->bat_srv_list :
+				  &test->tes_dst_grp->grp_ndl_list;
+
+	if (dents_up != NULL) {
+		rc = lstcon_nodes_getent((server ? srvlst: clilst),
+					 index_p, ndent_p, dents_up);
+		return rc;
+	}
+
+	/* non-verbose query */
+	LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(lstcon_test_batch_ent_t));
+
+	if (test == NULL) {
+		entp->u.tbe_batch.bae_ntest = bat->bat_ntest;
+		entp->u.tbe_batch.bae_state = bat->bat_state;
+
+	} else {
+
+		entp->u.tbe_test.tse_type   = test->tes_type;
+		entp->u.tbe_test.tse_loop   = test->tes_loop;
+		entp->u.tbe_test.tse_concur = test->tes_concur;
+	}
+
+	list_for_each_entry(ndl, clilst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle);
+
+	list_for_each_entry(ndl, srvlst, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle);
+
+	rc = copy_to_user(ent_up, entp,
+			      sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0;
+
+	LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t));
+
+	return rc;
+}
+
+int
+lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	switch (transop) {
+	case LST_TRANS_TSBRUN:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return -ENETDOWN;
+		break;
+
+	case LST_TRANS_TSBSTOP:
+		if (nd->nd_state != LST_NODE_ACTIVE)
+			return 0;
+		break;
+
+	case LST_TRANS_TSBCLIQRY:
+	case LST_TRANS_TSBSRVQRY:
+		break;
+	}
+
+	return 1;
+}
+
+static int
+lstcon_batch_op(lstcon_batch_t *bat, int transop,
+		struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list,
+				     &bat->bat_trans_list, transop,
+				     bat, lstcon_batrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_batch_run(char *name, int timeout, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = timeout;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up);
+
+	/* mark batch as running if it's started in any node */
+	if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0)
+		bat->bat_state = LST_BATCH_RUNNING;
+
+	return rc;
+}
+
+int
+lstcon_batch_stop(char *name, int force, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	if (lstcon_batch_find(name, &bat) != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return -ENOENT;
+	}
+
+	bat->bat_arg = force;
+
+	rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up);
+
+	/* mark batch as stopped if all RPCs finished */
+	if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0)
+		bat->bat_state = LST_BATCH_IDLE;
+
+	return rc;
+}
+
+static void
+lstcon_batch_destroy(lstcon_batch_t *bat)
+{
+	lstcon_ndlink_t    *ndl;
+	lstcon_test_t      *test;
+	int		 i;
+
+	list_del(&bat->bat_link);
+
+	while (!list_empty(&bat->bat_test_list)) {
+		test = list_entry(bat->bat_test_list.next,
+				      lstcon_test_t, tes_link);
+		LASSERT (list_empty(&test->tes_trans_list));
+
+		list_del(&test->tes_link);
+
+		lstcon_group_put(test->tes_src_grp);
+		lstcon_group_put(test->tes_dst_grp);
+
+		LIBCFS_FREE(test, offsetof(lstcon_test_t,
+					   tes_param[test->tes_paramlen]));
+	}
+
+	LASSERT (list_empty(&bat->bat_trans_list));
+
+	while (!list_empty(&bat->bat_cli_list)) {
+		ndl = list_entry(bat->bat_cli_list.next,
+				     lstcon_ndlink_t, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+
+	while (!list_empty(&bat->bat_srv_list)) {
+		ndl = list_entry(bat->bat_srv_list.next,
+				     lstcon_ndlink_t, ndl_link);
+		list_del_init(&ndl->ndl_link);
+
+		lstcon_ndlink_release(ndl);
+	}
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&bat->bat_cli_hash[i]));
+		LASSERT (list_empty(&bat->bat_srv_hash[i]));
+	}
+
+	LIBCFS_FREE(bat->bat_cli_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat->bat_srv_hash,
+		    sizeof(struct list_head) * LST_NODE_HASHSIZE);
+	LIBCFS_FREE(bat, sizeof(lstcon_batch_t));
+}
+
+int
+lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg)
+{
+	lstcon_test_t    *test;
+	lstcon_batch_t   *batch;
+	lstcon_ndlink_t  *ndl;
+	struct list_head       *hash;
+	struct list_head       *head;
+
+	test = (lstcon_test_t *)arg;
+	LASSERT (test != NULL);
+
+	batch = test->tes_batch;
+	LASSERT (batch != NULL);
+
+	if (test->tes_oneside &&
+	    transop == LST_TRANS_TSBSRVADD)
+		return 0;
+
+	if (nd->nd_state != LST_NODE_ACTIVE)
+		return -ENETDOWN;
+
+	if (transop == LST_TRANS_TSBCLIADD) {
+		hash = batch->bat_cli_hash;
+		head = &batch->bat_cli_list;
+
+	} else {
+		LASSERT (transop == LST_TRANS_TSBSRVADD);
+
+		hash = batch->bat_srv_hash;
+		head = &batch->bat_srv_list;
+	}
+
+	LASSERT (nd->nd_id.nid != LNET_NID_ANY);
+
+	if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0)
+		return -ENOMEM;
+
+	if (list_empty(&ndl->ndl_link))
+		list_add_tail(&ndl->ndl_link, head);
+
+	return 1;
+}
+
+static int
+lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t     *trans;
+	lstcon_group_t	 *grp;
+	int		     transop;
+	int		     rc;
+
+	LASSERT (test->tes_src_grp != NULL);
+	LASSERT (test->tes_dst_grp != NULL);
+
+	transop = LST_TRANS_TSBSRVADD;
+	grp  = test->tes_dst_grp;
+again:
+	rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list,
+				     &test->tes_trans_list, transop,
+				     test, lstcon_testrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0) {
+		lstcon_rpc_trans_interpreter(trans, result_up, NULL);
+
+		lstcon_rpc_trans_destroy(trans);
+		/* return if any error */
+		CDEBUG(D_NET, "Failed to add test %s, "
+			      "RPC error %d, framework error %d\n",
+		       transop == LST_TRANS_TSBCLIADD ? "client" : "server",
+		       lstcon_trans_stat()->trs_rpc_errno,
+		       lstcon_trans_stat()->trs_fwk_errno);
+
+		return rc;
+	}
+
+	lstcon_rpc_trans_destroy(trans);
+
+	if (transop == LST_TRANS_TSBCLIADD)
+		return rc;
+
+	transop = LST_TRANS_TSBCLIADD;
+	grp = test->tes_src_grp;
+	test->tes_cliidx = 0;
+
+	/* requests to test clients */
+	goto again;
+}
+
+int
+lstcon_test_add(char *name, int type, int loop, int concur,
+		int dist, int span, char *src_name, char * dst_name,
+		void *param, int paramlen, int *retp,
+		struct list_head *result_up)
+{
+	lstcon_group_t  *src_grp = NULL;
+	lstcon_group_t  *dst_grp = NULL;
+	lstcon_test_t   *test    = NULL;
+	lstcon_batch_t  *batch;
+	int	      rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch %s\n", name);
+		return rc;
+	}
+
+	if (batch->bat_state != LST_BATCH_IDLE) {
+		CDEBUG(D_NET, "Can't change running batch %s\n", name);
+		return rc;
+	}
+
+	rc = lstcon_group_find(src_name, &src_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", src_name);
+		goto out;
+	}
+
+	rc = lstcon_group_find(dst_name, &dst_grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", dst_name);
+		goto out;
+	}
+
+	if (dst_grp->grp_userland)
+		*retp = 1;
+
+	LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+	if (!test) {
+		CERROR("Can't allocate test descriptor\n");
+		rc = -ENOMEM;
+
+		goto out;
+	}
+
+	memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen]));
+	test->tes_hdr.tsb_id    = batch->bat_hdr.tsb_id;
+	test->tes_batch	 = batch;
+	test->tes_type	  = type;
+	test->tes_oneside       = 0; /* TODO */
+	test->tes_loop	  = loop;
+	test->tes_concur	= concur;
+	test->tes_stop_onerr    = 1; /* TODO */
+	test->tes_span	  = span;
+	test->tes_dist	  = dist;
+	test->tes_cliidx	= 0; /* just used for creating RPC */
+	test->tes_src_grp       = src_grp;
+	test->tes_dst_grp       = dst_grp;
+	INIT_LIST_HEAD(&test->tes_trans_list);
+
+	if (param != NULL) {
+		test->tes_paramlen = paramlen;
+		memcpy(&test->tes_param[0], param, paramlen);
+	}
+
+	rc = lstcon_test_nodes_add(test, result_up);
+
+	if (rc != 0)
+		goto out;
+
+	if (lstcon_trans_stat()->trs_rpc_errno != 0 ||
+	    lstcon_trans_stat()->trs_fwk_errno != 0)
+		CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name);
+
+	/* add to test list anyway, so user can check what's going on */
+	list_add_tail(&test->tes_link, &batch->bat_test_list);
+
+	batch->bat_ntest ++;
+	test->tes_hdr.tsb_index = batch->bat_ntest;
+
+	/*  hold groups so nobody can change them */
+	return rc;
+out:
+	if (test != NULL)
+		LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen]));
+
+	if (dst_grp != NULL)
+		lstcon_group_put(dst_grp);
+
+	if (src_grp != NULL)
+		lstcon_group_put(src_grp);
+
+	return rc;
+}
+
+int
+lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp)
+{
+	lstcon_test_t *test;
+
+	list_for_each_entry(test, &batch->bat_test_list, tes_link) {
+		if (idx == test->tes_hdr.tsb_index) {
+			*testpp = test;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+int
+lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg,
+		      lstcon_rpc_ent_t *ent_up)
+{
+	srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+	LASSERT (transop == LST_TRANS_TSBCLIQRY ||
+		 transop == LST_TRANS_TSBSRVQRY);
+
+	/* positive errno, framework error code */
+	if (copy_to_user(&ent_up->rpe_priv[0],
+			     &rep->bar_active, sizeof(rep->bar_active)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int
+lstcon_test_batch_query(char *name, int testidx, int client,
+			int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	struct list_head	 *translist;
+	struct list_head	 *ndlist;
+	lstcon_tsb_hdr_t   *hdr;
+	lstcon_batch_t     *batch;
+	lstcon_test_t      *test = NULL;
+	int		 transop;
+	int		 rc;
+
+	rc = lstcon_batch_find(name, &batch);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find batch: %s\n", name);
+		return rc;
+	}
+
+	if (testidx == 0) {
+		translist = &batch->bat_trans_list;
+		ndlist    = &batch->bat_cli_list;
+		hdr       = &batch->bat_hdr;
+
+	} else {
+		/* query specified test only */
+		rc = lstcon_test_find(batch, testidx, &test);
+		if (rc != 0) {
+			CDEBUG(D_NET, "Can't find test: %d\n", testidx);
+			return rc;
+		}
+
+		translist = &test->tes_trans_list;
+		ndlist    = &test->tes_src_grp->grp_ndl_list;
+		hdr       = &test->tes_hdr;
+	}
+
+	transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY;
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr,
+				     lstcon_batrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, timeout);
+
+	if (testidx == 0 && /* query a batch, not a test */
+	    lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 &&
+	    lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) {
+		/* all RPCs finished, and no active test */
+		batch->bat_state = LST_BATCH_IDLE;
+	}
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_tsbrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_statrpc_readent(int transop, srpc_msg_t *msg,
+		       lstcon_rpc_ent_t *ent_up)
+{
+	srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+	sfw_counters_t    *sfwk_stat;
+	srpc_counters_t   *srpc_stat;
+	lnet_counters_t   *lnet_stat;
+
+	if (rep->str_status != 0)
+		return 0;
+
+	sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0];
+	srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat));
+	lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat));
+
+	if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) ||
+	    copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) ||
+	    copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat)))
+		return -EFAULT;
+
+	return 0;
+}
+
+int
+lstcon_ndlist_stat(struct list_head *ndlist,
+		   int timeout, struct list_head *result_up)
+{
+	struct list_head	  head;
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	INIT_LIST_HEAD(&head);
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, &head,
+				     LST_TRANS_STATQRY, NULL, NULL, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_statrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up)
+{
+	lstcon_group_t     *grp;
+	int		 rc;
+
+	rc = lstcon_group_find(grp_name, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Can't find group %s\n", grp_name);
+		return rc;
+	}
+
+	rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+		  int timeout, struct list_head *result_up)
+{
+	lstcon_ndlink_t	 *ndl;
+	lstcon_group_t	  *tmp;
+	lnet_process_id_t	id;
+	int		      i;
+	int		      rc;
+
+	rc = lstcon_group_alloc(NULL, &tmp);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0 ; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* add to tmp group */
+		rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2);
+		if (rc != 0) {
+			CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET,
+			       "Failed to find or create %s: %d\n",
+			       libcfs_id2str(id), rc);
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		lstcon_group_put(tmp);
+		return rc;
+	}
+
+	rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up);
+
+	lstcon_group_put(tmp);
+
+	return rc;
+}
+
+int
+lstcon_debug_ndlist(struct list_head *ndlist,
+		    struct list_head *translist,
+		    int timeout, struct list_head *result_up)
+{
+	lstcon_rpc_trans_t *trans;
+	int		 rc;
+
+	rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY,
+				     NULL, lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout));
+
+	rc = lstcon_rpc_trans_interpreter(trans, result_up,
+					  lstcon_sesrpc_readent);
+	lstcon_rpc_trans_destroy(trans);
+
+	return rc;
+}
+
+int
+lstcon_session_debug(int timeout, struct list_head *result_up)
+{
+	return lstcon_debug_ndlist(&console_session.ses_ndl_list,
+				   NULL, timeout, result_up);
+}
+
+int
+lstcon_batch_debug(int timeout, char *name,
+		   int client, struct list_head *result_up)
+{
+	lstcon_batch_t *bat;
+	int	     rc;
+
+	rc = lstcon_batch_find(name, &bat);
+	if (rc != 0)
+		return -ENOENT;
+
+	rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list :
+					  &bat->bat_srv_list,
+				 NULL, timeout, result_up);
+
+	return rc;
+}
+
+int
+lstcon_group_debug(int timeout, char *name,
+		   struct list_head *result_up)
+{
+	lstcon_group_t *grp;
+	int	     rc;
+
+	rc = lstcon_group_find(name, &grp);
+	if (rc != 0)
+		return -ENOENT;
+
+	rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+				 timeout, result_up);
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_nodes_debug(int timeout,
+		   int count, lnet_process_id_t *ids_up,
+		   struct list_head *result_up)
+{
+	lnet_process_id_t  id;
+	lstcon_ndlink_t   *ndl;
+	lstcon_group_t    *grp;
+	int		i;
+	int		rc;
+
+	rc = lstcon_group_alloc(NULL, &grp);
+	if (rc != 0) {
+		CDEBUG(D_NET, "Out of memory\n");
+		return rc;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (copy_from_user(&id, &ids_up[i], sizeof(id))) {
+			rc = -EFAULT;
+			break;
+		}
+
+		/* node is added to tmp group */
+		rc = lstcon_group_ndlink_find(grp, id, &ndl, 1);
+		if (rc != 0) {
+			CERROR("Can't create node link\n");
+			break;
+		}
+	}
+
+	if (rc != 0) {
+		lstcon_group_put(grp);
+		return rc;
+	}
+
+	rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL,
+				 timeout, result_up);
+
+	lstcon_group_put(grp);
+
+	return rc;
+}
+
+int
+lstcon_session_match(lst_sid_t sid)
+{
+	return (console_session.ses_id.ses_nid   == sid.ses_nid &&
+		console_session.ses_id.ses_stamp == sid.ses_stamp) ?  1: 0;
+}
+
+static void
+lstcon_new_session_id(lst_sid_t *sid)
+{
+	lnet_process_id_t      id;
+
+	LASSERT (console_session.ses_state == LST_SESSION_NONE);
+
+	LNetGetId(1, &id);
+	sid->ses_nid   = id.nid;
+	sid->ses_stamp = cfs_time_current();
+}
+
+extern srpc_service_t lstcon_acceptor_service;
+
+int
+lstcon_session_new(char *name, int key, unsigned feats,
+		   int timeout, int force, lst_sid_t *sid_up)
+{
+	int     rc = 0;
+	int     i;
+
+	if (console_session.ses_state != LST_SESSION_NONE) {
+		/* session exists */
+		if (!force) {
+			CNETERR("Session %s already exists\n",
+				console_session.ses_name);
+			return -EEXIST;
+		}
+
+		rc = lstcon_session_end();
+
+		/* lstcon_session_end() only return local error */
+		if  (rc != 0)
+			return rc;
+	}
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CNETERR("Unknown session features %x\n",
+			(feats & ~LST_FEATS_MASK));
+		return -EINVAL;
+	}
+
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+		LASSERT(list_empty(&console_session.ses_ndl_hash[i]));
+
+	lstcon_new_session_id(&console_session.ses_id);
+
+	console_session.ses_key	    = key;
+	console_session.ses_state   = LST_SESSION_ACTIVE;
+	console_session.ses_force   = !!force;
+	console_session.ses_features = feats;
+	console_session.ses_feats_updated = 0;
+	console_session.ses_timeout = (timeout <= 0) ?
+				      LST_CONSOLE_TIMEOUT : timeout;
+	strcpy(console_session.ses_name, name);
+
+	rc = lstcon_batch_add(LST_DEFAULT_BATCH);
+	if (rc != 0)
+		return rc;
+
+	rc = lstcon_rpc_pinger_start();
+	if (rc != 0) {
+		lstcon_batch_t *bat = NULL;
+
+		lstcon_batch_find(LST_DEFAULT_BATCH, &bat);
+		lstcon_batch_destroy(bat);
+
+		return rc;
+	}
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			     sizeof(lst_sid_t)) == 0)
+		return rc;
+
+	lstcon_session_end();
+
+	return -EFAULT;
+}
+
+int
+lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp,
+		    lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len)
+{
+	lstcon_ndlist_ent_t *entp;
+	lstcon_ndlink_t     *ndl;
+	int		  rc = 0;
+
+	if (console_session.ses_state != LST_SESSION_ACTIVE)
+		return -ESRCH;
+
+	LIBCFS_ALLOC(entp, sizeof(*entp));
+	if (entp == NULL)
+		return -ENOMEM;
+
+	memset(entp, 0, sizeof(*entp));
+
+	list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link)
+		LST_NODE_STATE_COUNTER(ndl->ndl_node, entp);
+
+	if (copy_to_user(sid_up, &console_session.ses_id,
+			     sizeof(lst_sid_t)) ||
+	    copy_to_user(key_up, &console_session.ses_key,
+			     sizeof(*key_up)) ||
+	    copy_to_user(featp, &console_session.ses_features,
+			     sizeof(*featp)) ||
+	    copy_to_user(ndinfo_up, entp, sizeof(*entp)) ||
+	    copy_to_user(name_up, console_session.ses_name, len))
+		rc = -EFAULT;
+
+	LIBCFS_FREE(entp, sizeof(*entp));
+
+	return rc;
+}
+
+int
+lstcon_session_end()
+{
+	lstcon_rpc_trans_t *trans;
+	lstcon_group_t     *grp;
+	lstcon_batch_t     *bat;
+	int		 rc = 0;
+
+	LASSERT (console_session.ses_state == LST_SESSION_ACTIVE);
+
+	rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list,
+				     NULL, LST_TRANS_SESEND, NULL,
+				     lstcon_sesrpc_condition, &trans);
+	if (rc != 0) {
+		CERROR("Can't create transaction: %d\n", rc);
+		return rc;
+	}
+
+	console_session.ses_shutdown = 1;
+
+	lstcon_rpc_pinger_stop();
+
+	lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT);
+
+	lstcon_rpc_trans_destroy(trans);
+	/* User can do nothing even rpc failed, so go on */
+
+	/* waiting for orphan rpcs to die */
+	lstcon_rpc_cleanup_wait();
+
+	console_session.ses_id    = LST_INVALID_SID;
+	console_session.ses_state = LST_SESSION_NONE;
+	console_session.ses_key   = 0;
+	console_session.ses_force = 0;
+	console_session.ses_feats_updated = 0;
+
+	/* destroy all batches */
+	while (!list_empty(&console_session.ses_bat_list)) {
+		bat = list_entry(console_session.ses_bat_list.next,
+				     lstcon_batch_t, bat_link);
+
+		lstcon_batch_destroy(bat);
+	}
+
+	/* destroy all groups */
+	while (!list_empty(&console_session.ses_grp_list)) {
+		grp = list_entry(console_session.ses_grp_list.next,
+				     lstcon_group_t, grp_link);
+		LASSERT (grp->grp_ref == 1);
+
+		lstcon_group_put(grp);
+	}
+
+	/* all nodes should be released */
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+
+	console_session.ses_shutdown = 0;
+	console_session.ses_expired  = 0;
+
+	return rc;
+}
+
+int
+lstcon_session_feats_check(unsigned feats)
+{
+	int rc = 0;
+
+	if ((feats & ~LST_FEATS_MASK) != 0) {
+		CERROR("Can't support these features: %x\n",
+		       (feats & ~LST_FEATS_MASK));
+		return -EPROTO;
+	}
+
+	spin_lock(&console_session.ses_rpc_lock);
+
+	if (!console_session.ses_feats_updated) {
+		console_session.ses_feats_updated = 1;
+		console_session.ses_features = feats;
+	}
+
+	if (console_session.ses_features != feats)
+		rc = -EPROTO;
+
+	spin_unlock(&console_session.ses_rpc_lock);
+
+	if (rc != 0) {
+		CERROR("remote features %x do not match with "
+		       "session features %x of console\n",
+		       feats, console_session.ses_features);
+	}
+
+	return rc;
+}
+
+static int
+lstcon_acceptor_handle (srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	*rep  = &rpc->srpc_replymsg;
+	srpc_msg_t	*req  = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_join_reqst_t *jreq = &req->msg_body.join_reqst;
+	srpc_join_reply_t *jrep = &rep->msg_body.join_reply;
+	lstcon_group_t    *grp  = NULL;
+	lstcon_ndlink_t   *ndl;
+	int		rc   = 0;
+
+	sfw_unpack_message(req);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	jrep->join_sid = console_session.ses_id;
+
+	if (console_session.ses_id.ses_nid == LNET_NID_ANY) {
+		jrep->join_status = ESRCH;
+		goto out;
+	}
+
+	if (lstcon_session_feats_check(req->msg_ses_feats) != 0) {
+		jrep->join_status = EPROTO;
+		goto out;
+	}
+
+	if (jreq->join_sid.ses_nid != LNET_NID_ANY &&
+	     !lstcon_session_match(jreq->join_sid)) {
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	if (lstcon_group_find(jreq->join_group, &grp) != 0) {
+		rc = lstcon_group_alloc(jreq->join_group, &grp);
+		if (rc != 0) {
+			CERROR("Out of memory\n");
+			goto out;
+		}
+
+		list_add_tail(&grp->grp_link,
+				  &console_session.ses_grp_list);
+		lstcon_group_addref(grp);
+	}
+
+	if (grp->grp_ref > 2) {
+		/* Group in using */
+		jrep->join_status = EBUSY;
+		goto out;
+	}
+
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0);
+	if (rc == 0) {
+		jrep->join_status = EEXIST;
+		goto out;
+	}
+
+	rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1);
+	if (rc != 0) {
+		CERROR("Out of memory\n");
+		goto out;
+	}
+
+	ndl->ndl_node->nd_state   = LST_NODE_ACTIVE;
+	ndl->ndl_node->nd_timeout = console_session.ses_timeout;
+
+	if (grp->grp_userland == 0)
+		grp->grp_userland = 1;
+
+	strcpy(jrep->join_session, console_session.ses_name);
+	jrep->join_timeout = console_session.ses_timeout;
+	jrep->join_status  = 0;
+
+out:
+	rep->msg_ses_feats = console_session.ses_features;
+	if (grp != NULL)
+		lstcon_group_put(grp);
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	return rc;
+}
+
+srpc_service_t lstcon_acceptor_service;
+void lstcon_init_acceptor_service(void)
+{
+	/* initialize selftest console acceptor service table */
+	lstcon_acceptor_service.sv_name    = "join session";
+	lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle;
+	lstcon_acceptor_service.sv_id      = SRPC_SERVICE_JOIN;
+	lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX;
+}
+
+extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data);
+
+DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry);
+
+/* initialize console */
+int
+lstcon_console_init(void)
+{
+	int     i;
+	int     rc;
+
+	memset(&console_session, 0, sizeof(lstcon_session_t));
+
+	console_session.ses_id		    = LST_INVALID_SID;
+	console_session.ses_state	    = LST_SESSION_NONE;
+	console_session.ses_timeout	    = 0;
+	console_session.ses_force	    = 0;
+	console_session.ses_expired	    = 0;
+	console_session.ses_feats_updated   = 0;
+	console_session.ses_features	    = LST_FEATS_MASK;
+	console_session.ses_laststamp	    = cfs_time_current_sec();
+
+	mutex_init(&console_session.ses_mutex);
+
+	INIT_LIST_HEAD(&console_session.ses_ndl_list);
+	INIT_LIST_HEAD(&console_session.ses_grp_list);
+	INIT_LIST_HEAD(&console_session.ses_bat_list);
+	INIT_LIST_HEAD(&console_session.ses_trans_list);
+
+	LIBCFS_ALLOC(console_session.ses_ndl_hash,
+		     sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+	if (console_session.ses_ndl_hash == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < LST_GLOBAL_HASHSIZE; i++)
+		INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]);
+
+
+	/* initialize acceptor service table */
+	lstcon_init_acceptor_service();
+
+	rc = srpc_add_service(&lstcon_acceptor_service);
+	LASSERT (rc != -EBUSY);
+	if (rc != 0) {
+		LIBCFS_FREE(console_session.ses_ndl_hash,
+			    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+		return rc;
+	}
+
+	rc = srpc_service_add_buffers(&lstcon_acceptor_service,
+				      lstcon_acceptor_service.sv_wi_total);
+	if (rc != 0) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	rc = libcfs_register_ioctl(&lstcon_ioctl_handler);
+
+	if (rc == 0) {
+		lstcon_rpc_module_init();
+		return 0;
+	}
+
+out:
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return rc;
+}
+
+int
+lstcon_console_fini(void)
+{
+	int     i;
+
+	libcfs_deregister_ioctl(&lstcon_ioctl_handler);
+
+	mutex_lock(&console_session.ses_mutex);
+
+	srpc_shutdown_service(&lstcon_acceptor_service);
+	srpc_remove_service(&lstcon_acceptor_service);
+
+	if (console_session.ses_state != LST_SESSION_NONE)
+		lstcon_session_end();
+
+	lstcon_rpc_module_fini();
+
+	mutex_unlock(&console_session.ses_mutex);
+
+	LASSERT (list_empty(&console_session.ses_ndl_list));
+	LASSERT (list_empty(&console_session.ses_grp_list));
+	LASSERT (list_empty(&console_session.ses_bat_list));
+	LASSERT (list_empty(&console_session.ses_trans_list));
+
+	for (i = 0; i < LST_NODE_HASHSIZE; i++) {
+		LASSERT (list_empty(&console_session.ses_ndl_hash[i]));
+	}
+
+	LIBCFS_FREE(console_session.ses_ndl_hash,
+		    sizeof(struct list_head) * LST_GLOBAL_HASHSIZE);
+
+	srpc_wait_service_shutdown(&lstcon_acceptor_service);
+
+	return 0;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h
new file mode 100644
index 000000000000..e61b26687dbb
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/console.h
@@ -0,0 +1,232 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/console.h
+ *
+ * kernel structure for LST console
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#ifndef __LST_CONSOLE_H__
+#define __LST_CONSOLE_H__
+
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+#include "selftest.h"
+#include "conrpc.h"
+
+typedef struct lstcon_node {
+	lnet_process_id_t    nd_id;	  /* id of the node */
+	int		  nd_ref;	 /* reference count */
+	int		  nd_state;       /* state of the node */
+	int		  nd_timeout;     /* session timeout */
+	cfs_time_t	   nd_stamp;       /* timestamp of last replied RPC */
+	struct lstcon_rpc    nd_ping;	/* ping rpc */
+} lstcon_node_t;				/*** node descriptor */
+
+typedef struct {
+	struct list_head	   ndl_link;       /* chain on list */
+	struct list_head	   ndl_hlink;      /* chain on hash */
+	lstcon_node_t       *ndl_node;       /* pointer to node */
+} lstcon_ndlink_t;			      /*** node link descriptor */
+
+typedef struct {
+	struct list_head	   grp_link;       /* chain on global group list */
+	int		  grp_ref;	/* reference count */
+	int		  grp_userland;   /* has userland nodes */
+	int		  grp_nnode;      /* # of nodes */
+	char		 grp_name[LST_NAME_SIZE]; /* group name */
+
+	struct list_head	   grp_trans_list; /* transaction list */
+	struct list_head	   grp_ndl_list;   /* nodes list */
+	struct list_head	   grp_ndl_hash[0];/* hash table for nodes */
+} lstcon_group_t;		    /*** (alias of nodes) group descriptor */
+
+#define LST_BATCH_IDLE	  0xB0	    /* idle batch */
+#define LST_BATCH_RUNNING       0xB1	    /* running batch */
+
+typedef struct lstcon_tsb_hdr {
+	lst_bid_t	       tsb_id;	 /* batch ID */
+	int		     tsb_index;      /* test index */
+} lstcon_tsb_hdr_t;
+
+typedef struct {
+	lstcon_tsb_hdr_t	bat_hdr;	/* test_batch header */
+	struct list_head	      bat_link;       /* chain on session's batches list */
+	int		     bat_ntest;      /* # of test */
+	int		     bat_state;      /* state of the batch */
+	int		     bat_arg;	/* parameter for run|stop, timeout for run, force for stop */
+	char		    bat_name[LST_NAME_SIZE]; /* name of batch */
+
+	struct list_head	      bat_test_list;  /* list head of tests (lstcon_test_t) */
+	struct list_head	      bat_trans_list; /* list head of transaction */
+	struct list_head	      bat_cli_list;   /* list head of client nodes (lstcon_node_t) */
+	struct list_head	     *bat_cli_hash;   /* hash table of client nodes */
+	struct list_head	      bat_srv_list;   /* list head of server nodes */
+	struct list_head	     *bat_srv_hash;   /* hash table of server nodes */
+} lstcon_batch_t;			     /*** (tests ) batch descritptor */
+
+typedef struct lstcon_test {
+	lstcon_tsb_hdr_t      tes_hdr;	/* test batch header */
+	struct list_head	    tes_link;       /* chain on batch's tests list */
+	lstcon_batch_t       *tes_batch;      /* pointer to batch */
+
+	int		   tes_type;       /* type of the test, i.e: bulk, ping */
+	int		   tes_stop_onerr; /* stop on error */
+	int		   tes_oneside;    /* one-sided test */
+	int		   tes_concur;     /* concurrency */
+	int		   tes_loop;       /* loop count */
+	int		   tes_dist;       /* nodes distribution of target group */
+	int		   tes_span;       /* nodes span of target group */
+	int		   tes_cliidx;     /* client index, used for RPC creating */
+
+	struct list_head  tes_trans_list; /* transaction list */
+	lstcon_group_t       *tes_src_grp;    /* group run the test */
+	lstcon_group_t       *tes_dst_grp;    /* target group */
+
+	int		   tes_paramlen;   /* test parameter length */
+	char		  tes_param[0];   /* test parameter */
+} lstcon_test_t;				/*** a single test descriptor */
+
+#define LST_GLOBAL_HASHSIZE     503	     /* global nodes hash table size */
+#define LST_NODE_HASHSIZE       239	     /* node hash table (for batch or group) */
+
+#define LST_SESSION_NONE	0x0	     /* no session */
+#define LST_SESSION_ACTIVE      0x1	     /* working session */
+
+#define LST_CONSOLE_TIMEOUT     300	     /* default console timeout */
+
+typedef struct {
+	struct mutex		ses_mutex;      /* only 1 thread in session */
+	lst_sid_t	       ses_id;	 /* global session id */
+	int		     ses_key;	/* local session key */
+	int		     ses_state;      /* state of session */
+	int		     ses_timeout;    /* timeout in seconds */
+	time_t		  ses_laststamp;  /* last operation stamp (seconds) */
+	/** tests features of the session */
+	unsigned		ses_features;
+	/** features are synced with remote test nodes */
+	unsigned		ses_feats_updated:1;
+	/** force creating */
+	unsigned		ses_force:1;
+	/** session is shutting down */
+	unsigned		ses_shutdown:1;
+	/** console is timedout */
+	unsigned		ses_expired:1;
+	__u64		   ses_id_cookie;  /* batch id cookie */
+	char		    ses_name[LST_NAME_SIZE];  /* session name */
+	lstcon_rpc_trans_t     *ses_ping;       /* session pinger */
+	stt_timer_t	     ses_ping_timer; /* timer for pinger */
+	lstcon_trans_stat_t     ses_trans_stat; /* transaction stats */
+
+	struct list_head	      ses_trans_list; /* global list of transaction */
+	struct list_head	      ses_grp_list;   /* global list of groups */
+	struct list_head	      ses_bat_list;   /* global list of batches */
+	struct list_head	      ses_ndl_list;   /* global list of nodes */
+	struct list_head	     *ses_ndl_hash;   /* hash table of nodes */
+
+	spinlock_t	  ses_rpc_lock;   /* serialize */
+	atomic_t	    ses_rpc_counter;/* # of initialized RPCs */
+	struct list_head	      ses_rpc_freelist; /* idle console rpc */
+} lstcon_session_t;			     /*** session descriptor */
+
+extern lstcon_session_t	 console_session;
+
+static inline lstcon_trans_stat_t *
+lstcon_trans_stat(void)
+{
+	return &console_session.ses_trans_stat;
+}
+
+static inline struct list_head *
+lstcon_id2hash (lnet_process_id_t id, struct list_head *hash)
+{
+	unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE;
+
+	return &hash[idx];
+}
+
+extern int lstcon_session_match(lst_sid_t sid);
+extern int lstcon_session_new(char *name, int key, unsigned version,
+			      int timeout, int flags, lst_sid_t *sid_up);
+extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp,
+			       lstcon_ndlist_ent_t *entp, char *name_up, int len);
+extern int lstcon_session_end(void);
+extern int lstcon_session_debug(int timeout, struct list_head *result_up);
+extern int lstcon_session_feats_check(unsigned feats);
+extern int lstcon_batch_debug(int timeout, char *name,
+			      int client, struct list_head *result_up);
+extern int lstcon_group_debug(int timeout, char *name,
+			      struct list_head *result_up);
+extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up,
+			      struct list_head *result_up);
+extern int lstcon_group_add(char *name);
+extern int lstcon_group_del(char *name);
+extern int lstcon_group_clean(char *name, int args);
+extern int lstcon_group_refresh(char *name, struct list_head *result_up);
+extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up,
+			    unsigned *featp, struct list_head *result_up);
+extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up,
+			       struct list_head *result_up);
+extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up,
+			     int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up);
+extern int lstcon_group_list(int idx, int len, char *name_up);
+extern int lstcon_batch_add(char *name);
+extern int lstcon_batch_run(char *name, int timeout,
+			    struct list_head *result_up);
+extern int lstcon_batch_stop(char *name, int force,
+			     struct list_head *result_up);
+extern int lstcon_test_batch_query(char *name, int testidx,
+				   int client, int timeout,
+				   struct list_head *result_up);
+extern int lstcon_batch_del(char *name);
+extern int lstcon_batch_list(int idx, int namelen, char *name_up);
+extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up,
+			     int server, int testidx, int *index_p,
+			     int *ndent_p, lstcon_node_ent_t *dents_up);
+extern int lstcon_group_stat(char *grp_name, int timeout,
+			     struct list_head *result_up);
+extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up,
+			     int timeout, struct list_head *result_up);
+extern int lstcon_test_add(char *name, int type, int loop, int concur,
+			   int dist, int span, char *src_name, char * dst_name,
+			   void *param, int paramlen, int *retp,
+			   struct list_head *result_up);
+
+#endif
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
new file mode 100644
index 000000000000..483c78564dae
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/framework.c
@@ -0,0 +1,1814 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/framework.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ * Author: Liang Zhen  <liangzhen@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1};
+
+static int session_timeout = 100;
+CFS_MODULE_PARM(session_timeout, "i", int, 0444,
+		"test session timeout in seconds (100 by default, 0 == never)");
+
+static int rpc_timeout = 64;
+CFS_MODULE_PARM(rpc_timeout, "i", int, 0644,
+		"rpc timeout in seconds (64 by default, 0 == never)");
+
+#define sfw_unpack_id(id)	       \
+do {				    \
+	__swab64s(&(id).nid);	   \
+	__swab32s(&(id).pid);	   \
+} while (0)
+
+#define sfw_unpack_sid(sid)	     \
+do {				    \
+	__swab64s(&(sid).ses_nid);      \
+	__swab64s(&(sid).ses_stamp);    \
+} while (0)
+
+#define sfw_unpack_fw_counters(fc)	\
+do {				      \
+	__swab32s(&(fc).running_ms);      \
+	__swab32s(&(fc).active_batches);  \
+	__swab32s(&(fc).zombie_sessions); \
+	__swab32s(&(fc).brw_errors);      \
+	__swab32s(&(fc).ping_errors);     \
+} while (0)
+
+#define sfw_unpack_rpc_counters(rc)     \
+do {				    \
+	__swab32s(&(rc).errors);	\
+	__swab32s(&(rc).rpcs_sent);     \
+	__swab32s(&(rc).rpcs_rcvd);     \
+	__swab32s(&(rc).rpcs_dropped);  \
+	__swab32s(&(rc).rpcs_expired);  \
+	__swab64s(&(rc).bulk_get);      \
+	__swab64s(&(rc).bulk_put);      \
+} while (0)
+
+#define sfw_unpack_lnet_counters(lc)    \
+do {				    \
+	__swab32s(&(lc).errors);	\
+	__swab32s(&(lc).msgs_max);      \
+	__swab32s(&(lc).msgs_alloc);    \
+	__swab32s(&(lc).send_count);    \
+	__swab32s(&(lc).recv_count);    \
+	__swab32s(&(lc).drop_count);    \
+	__swab32s(&(lc).route_count);   \
+	__swab64s(&(lc).send_length);   \
+	__swab64s(&(lc).recv_length);   \
+	__swab64s(&(lc).drop_length);   \
+	__swab64s(&(lc).route_length);  \
+} while (0)
+
+#define sfw_test_active(t)      (atomic_read(&(t)->tsi_nactive) != 0)
+#define sfw_batch_active(b)     (atomic_read(&(b)->bat_nactive) != 0)
+
+struct smoketest_framework {
+	struct list_head	 fw_zombie_rpcs;     /* RPCs to be recycled */
+	struct list_head	 fw_zombie_sessions; /* stopping sessions */
+	struct list_head	 fw_tests;	   /* registered test cases */
+	atomic_t       fw_nzombies;	/* # zombie sessions */
+	spinlock_t	   fw_lock;		/* serialise */
+	sfw_session_t	  *fw_session;		/* _the_ session */
+	int		   fw_shuttingdown;	/* shutdown in progress */
+	srpc_server_rpc_t *fw_active_srpc;	/* running RPC */
+} sfw_data;
+
+/* forward ref's */
+int sfw_stop_batch (sfw_batch_t *tsb, int force);
+void sfw_destroy_session (sfw_session_t *sn);
+
+static inline sfw_test_case_t *
+sfw_find_test_case(int id)
+{
+	sfw_test_case_t *tsc;
+
+	LASSERT (id <= SRPC_SERVICE_MAX_ID);
+	LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		if (tsc->tsc_srv_service->sv_id == id)
+			return tsc;
+	}
+
+	return NULL;
+}
+
+static int
+sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops)
+{
+	sfw_test_case_t *tsc;
+
+	if (sfw_find_test_case(service->sv_id) != NULL) {
+		CERROR ("Failed to register test %s (%d)\n",
+			service->sv_name, service->sv_id);
+		return -EEXIST;
+	}
+
+	LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t));
+	if (tsc == NULL)
+		return -ENOMEM;
+
+	memset(tsc, 0, sizeof(sfw_test_case_t));
+	tsc->tsc_cli_ops     = cliops;
+	tsc->tsc_srv_service = service;
+
+	list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests);
+	return 0;
+}
+
+void
+sfw_add_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	stt_timer_t   *timer = &sn->sn_timer;
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	if (sn == NULL || sn->sn_timeout == 0)
+		return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	sn->sn_timer_active = 1;
+	timer->stt_expires = cfs_time_add(sn->sn_timeout,
+					  cfs_time_current_sec());
+	stt_add_timer(timer);
+	return;
+}
+
+int
+sfw_del_session_timer (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL || !sn->sn_timer_active)
+		return 0;
+
+	LASSERT (sn->sn_timeout != 0);
+
+	if (stt_del_timer(&sn->sn_timer)) { /* timer defused */
+		sn->sn_timer_active = 0;
+		return 0;
+	}
+
+	return EBUSY; /* racing with sfw_session_expired() */
+}
+
+/* called with sfw_data.fw_lock held */
+static void
+sfw_deactivate_session (void)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	int	    nactive = 0;
+	sfw_batch_t   *tsb;
+	sfw_test_case_t *tsc;
+
+	if (sn == NULL) return;
+
+	LASSERT (!sn->sn_timer_active);
+
+	sfw_data.fw_session = NULL;
+	atomic_inc(&sfw_data.fw_nzombies);
+	list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions);
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) {
+		srpc_abort_service(tsc->tsc_srv_service);
+	}
+
+	spin_lock(&sfw_data.fw_lock);
+
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (sfw_batch_active(tsb)) {
+			nactive++;
+			sfw_stop_batch(tsb, 1);
+		}
+	}
+
+	if (nactive != 0)
+		return;   /* wait for active batches to stop */
+
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+
+	spin_lock(&sfw_data.fw_lock);
+}
+
+
+void
+sfw_session_expired (void *data)
+{
+	sfw_session_t *sn = data;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	LASSERT (sn->sn_timer_active);
+	LASSERT (sn == sfw_data.fw_session);
+
+	CWARN ("Session expired! sid: %s-"LPU64", name: %s\n",
+	       libcfs_nid2str(sn->sn_id.ses_nid),
+	       sn->sn_id.ses_stamp, &sn->sn_name[0]);
+
+	sn->sn_timer_active = 0;
+	sfw_deactivate_session();
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+static inline void
+sfw_init_session(sfw_session_t *sn, lst_sid_t sid,
+		 unsigned features, const char *name)
+{
+	stt_timer_t *timer = &sn->sn_timer;
+
+	memset(sn, 0, sizeof(sfw_session_t));
+	INIT_LIST_HEAD(&sn->sn_list);
+	INIT_LIST_HEAD(&sn->sn_batches);
+	atomic_set(&sn->sn_refcount, 1);	/* +1 for caller */
+	atomic_set(&sn->sn_brw_errors, 0);
+	atomic_set(&sn->sn_ping_errors, 0);
+	strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name));
+
+	sn->sn_timer_active = 0;
+	sn->sn_id	   = sid;
+	sn->sn_features	    = features;
+	sn->sn_timeout      = session_timeout;
+	sn->sn_started      = cfs_time_current();
+
+	timer->stt_data = sn;
+	timer->stt_func = sfw_session_expired;
+	INIT_LIST_HEAD(&timer->stt_list);
+}
+
+/* completion handler for incoming framework RPCs */
+void
+sfw_server_rpc_done(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv	= rpc->srpc_scd->scd_svc;
+	int			status	= rpc->srpc_status;
+
+	CDEBUG (D_NET,
+		"Incoming framework RPC done: "
+		"service %s, peer %s, status %s:%d\n",
+		sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state),
+		status);
+
+	if (rpc->srpc_bulk != NULL)
+		sfw_free_pages(rpc);
+	return;
+}
+
+void
+sfw_client_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	LASSERT (rpc->crpc_bulk.bk_niov == 0);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	CDEBUG (D_NET,
+		"Outgoing framework RPC done: "
+		"service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state),
+		rpc->crpc_aborted, rpc->crpc_status);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	/* my callers must finish all RPCs before shutting me down */
+	LASSERT(!sfw_data.fw_shuttingdown);
+	list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs);
+
+	spin_unlock(&sfw_data.fw_lock);
+}
+
+sfw_batch_t *
+sfw_find_batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+
+	LASSERT (sn != NULL);
+
+	list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+		if (bat->bat_id.bat_id == bid.bat_id)
+			return bat;
+	}
+
+	return NULL;
+}
+
+sfw_batch_t *
+sfw_bid2batch (lst_bid_t bid)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	sfw_batch_t   *bat;
+
+	LASSERT (sn != NULL);
+
+	bat = sfw_find_batch(bid);
+	if (bat != NULL)
+		return bat;
+
+	LIBCFS_ALLOC(bat, sizeof(sfw_batch_t));
+	if (bat == NULL)
+		return NULL;
+
+	bat->bat_error    = 0;
+	bat->bat_session  = sn;
+	bat->bat_id       = bid;
+	atomic_set(&bat->bat_nactive, 0);
+	INIT_LIST_HEAD(&bat->bat_tests);
+
+	list_add_tail(&bat->bat_list, &sn->sn_batches);
+	return bat;
+}
+
+int
+sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply)
+{
+	sfw_session_t  *sn = sfw_data.fw_session;
+	sfw_counters_t *cnt = &reply->str_fw;
+	sfw_batch_t    *bat;
+	struct timeval  tv;
+
+	reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	if (request->str_sid.ses_nid == LNET_NID_ANY) {
+		reply->str_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) {
+		reply->str_status = ESRCH;
+		return 0;
+	}
+
+	lnet_counters_get(&reply->str_lnet);
+	srpc_get_counters(&reply->str_rpc);
+
+	/* send over the msecs since the session was started
+	 - with 32 bits to send, this is ~49 days */
+	cfs_duration_usec(cfs_time_sub(cfs_time_current(),
+				       sn->sn_started), &tv);
+
+	cnt->running_ms      = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+	cnt->brw_errors      = atomic_read(&sn->sn_brw_errors);
+	cnt->ping_errors     = atomic_read(&sn->sn_ping_errors);
+	cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies);
+
+	cnt->active_batches = 0;
+	list_for_each_entry (bat, &sn->sn_batches, bat_list) {
+		if (atomic_read(&bat->bat_nactive) > 0)
+			cnt->active_batches++;
+	}
+
+	reply->str_status = 0;
+	return 0;
+}
+
+int
+sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	srpc_msg_t    *msg = container_of(request, srpc_msg_t,
+					  msg_body.mksn_reqst);
+	int	       cplen = 0;
+
+	if (request->mksn_sid.ses_nid == LNET_NID_ANY) {
+		reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+		reply->mksn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn != NULL) {
+		reply->mksn_status  = 0;
+		reply->mksn_sid     = sn->sn_id;
+		reply->mksn_timeout = sn->sn_timeout;
+
+		if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) {
+			atomic_inc(&sn->sn_refcount);
+			return 0;
+		}
+
+		if (!request->mksn_force) {
+			reply->mksn_status = EBUSY;
+			cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0],
+					sizeof(reply->mksn_name));
+			if (cplen >= sizeof(reply->mksn_name))
+				return -E2BIG;
+			return 0;
+		}
+	}
+
+	/* reject the request if it requires unknown features
+	 * NB: old version will always accept all features because it's not
+	 * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also
+	 * harmless because it will return zero feature to console, and it's
+	 * console's responsibility to make sure all nodes in a session have
+	 * same feature mask. */
+	if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		reply->mksn_status = EPROTO;
+		return 0;
+	}
+
+	/* brand new or create by force */
+	LIBCFS_ALLOC(sn, sizeof(sfw_session_t));
+	if (sn == NULL) {
+		CERROR ("Dropping RPC (mksn) under memory pressure.\n");
+		return -ENOMEM;
+	}
+
+	sfw_init_session(sn, request->mksn_sid,
+			 msg->msg_ses_feats, &request->mksn_name[0]);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	sfw_deactivate_session();
+	LASSERT(sfw_data.fw_session == NULL);
+	sfw_data.fw_session = sn;
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	reply->mksn_status  = 0;
+	reply->mksn_sid     = sn->sn_id;
+	reply->mksn_timeout = sn->sn_timeout;
+	return 0;
+}
+
+int
+sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	if (request->rmsn_sid.ses_nid == LNET_NID_ANY) {
+		reply->rmsn_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) {
+		reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY;
+		return 0;
+	}
+
+	if (!atomic_dec_and_test(&sn->sn_refcount)) {
+		reply->rmsn_status = 0;
+		return 0;
+	}
+
+	spin_lock(&sfw_data.fw_lock);
+	sfw_deactivate_session();
+	spin_unlock(&sfw_data.fw_lock);
+
+	reply->rmsn_status = 0;
+	reply->rmsn_sid    = LST_INVALID_SID;
+	LASSERT(sfw_data.fw_session == NULL);
+	return 0;
+}
+
+int
+sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+
+	if (sn == NULL) {
+		reply->dbg_status = ESRCH;
+		reply->dbg_sid    = LST_INVALID_SID;
+		return 0;
+	}
+
+	reply->dbg_status  = 0;
+	reply->dbg_sid     = sn->sn_id;
+	reply->dbg_timeout = sn->sn_timeout;
+	if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name))
+	    >= sizeof(reply->dbg_name))
+		return -E2BIG;
+
+	return 0;
+}
+
+void
+sfw_test_rpc_fini (srpc_client_rpc_t *rpc)
+{
+	sfw_test_unit_t     *tsu = rpc->crpc_priv;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+	/* Called with hold of tsi->tsi_lock */
+	LASSERT (list_empty(&rpc->crpc_list));
+	list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+}
+
+static inline int
+sfw_test_buffers(sfw_test_instance_t *tsi)
+{
+	struct sfw_test_case	*tsc = sfw_find_test_case(tsi->tsi_service);
+	struct srpc_service	*svc = tsc->tsc_srv_service;
+	int			nbuf;
+
+	nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts;
+	return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA);
+}
+
+int
+sfw_load_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case	*tsc;
+	struct srpc_service	*svc;
+	int			nbuf;
+	int			rc;
+
+	LASSERT(tsi != NULL);
+	tsc = sfw_find_test_case(tsi->tsi_service);
+	nbuf = sfw_test_buffers(tsi);
+	LASSERT(tsc != NULL);
+	svc = tsc->tsc_srv_service;
+
+	if (tsi->tsi_is_client) {
+		tsi->tsi_ops = tsc->tsc_cli_ops;
+		return 0;
+	}
+
+	rc = srpc_service_add_buffers(svc, nbuf);
+	if (rc != 0) {
+		CWARN("Failed to reserve enough buffers: "
+		      "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc);
+		/* NB: this error handler is not strictly correct, because
+		 * it may release more buffers than already allocated,
+		 * but it doesn't matter because request portal should
+		 * be lazy portal and will grow buffers if necessary. */
+		srpc_service_remove_buffers(svc, nbuf);
+		return -ENOMEM;
+	}
+
+	CDEBUG(D_NET, "Reserved %d buffers for test %s\n",
+	       nbuf * (srpc_serv_is_framework(svc) ?
+		       1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name);
+	return 0;
+}
+
+void
+sfw_unload_test(struct sfw_test_instance *tsi)
+{
+	struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service);
+
+	LASSERT(tsc != NULL);
+
+	if (tsi->tsi_is_client)
+		return;
+
+	/* shrink buffers, because request portal is lazy portal
+	 * which can grow buffers at runtime so we may leave
+	 * some buffers behind, but never mind... */
+	srpc_service_remove_buffers(tsc->tsc_srv_service,
+				    sfw_test_buffers(tsi));
+	return;
+}
+
+void
+sfw_destroy_test_instance (sfw_test_instance_t *tsi)
+{
+	srpc_client_rpc_t *rpc;
+	sfw_test_unit_t   *tsu;
+
+	if (!tsi->tsi_is_client) goto clean;
+
+	tsi->tsi_ops->tso_fini(tsi);
+
+	LASSERT (!tsi->tsi_stopping);
+	LASSERT (list_empty(&tsi->tsi_active_rpcs));
+	LASSERT (!sfw_test_active(tsi));
+
+	while (!list_empty(&tsi->tsi_units)) {
+		tsu = list_entry(tsi->tsi_units.next,
+				     sfw_test_unit_t, tsu_list);
+		list_del(&tsu->tsu_list);
+		LIBCFS_FREE(tsu, sizeof(*tsu));
+	}
+
+	while (!list_empty(&tsi->tsi_free_rpcs)) {
+		rpc = list_entry(tsi->tsi_free_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+		LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+	}
+
+clean:
+	sfw_unload_test(tsi);
+	LIBCFS_FREE(tsi, sizeof(*tsi));
+	return;
+}
+
+void
+sfw_destroy_batch (sfw_batch_t *tsb)
+{
+	sfw_test_instance_t *tsi;
+
+	LASSERT (!sfw_batch_active(tsb));
+	LASSERT (list_empty(&tsb->bat_list));
+
+	while (!list_empty(&tsb->bat_tests)) {
+		tsi = list_entry(tsb->bat_tests.next,
+				     sfw_test_instance_t, tsi_list);
+		list_del_init(&tsi->tsi_list);
+		sfw_destroy_test_instance(tsi);
+	}
+
+	LIBCFS_FREE(tsb, sizeof(sfw_batch_t));
+	return;
+}
+
+void
+sfw_destroy_session (sfw_session_t *sn)
+{
+	sfw_batch_t *batch;
+
+	LASSERT (list_empty(&sn->sn_list));
+	LASSERT (sn != sfw_data.fw_session);
+
+	while (!list_empty(&sn->sn_batches)) {
+		batch = list_entry(sn->sn_batches.next,
+				       sfw_batch_t, bat_list);
+		list_del_init(&batch->bat_list);
+		sfw_destroy_batch(batch);
+	}
+
+	LIBCFS_FREE(sn, sizeof(*sn));
+	atomic_dec(&sfw_data.fw_nzombies);
+	return;
+}
+
+void
+sfw_unpack_addtest_req(srpc_msg_t *msg)
+{
+	srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+	LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST);
+	LASSERT (req->tsr_is_client);
+
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* no flipping needed */
+
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	if (req->tsr_service == SRPC_SERVICE_BRW) {
+		if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) {
+			test_bulk_req_t *bulk = &req->tsr_u.bulk_v0;
+
+			__swab32s(&bulk->blk_opc);
+			__swab32s(&bulk->blk_npg);
+			__swab32s(&bulk->blk_flags);
+
+		} else {
+			test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1;
+
+			__swab16s(&bulk->blk_opc);
+			__swab16s(&bulk->blk_flags);
+			__swab32s(&bulk->blk_offset);
+			__swab32s(&bulk->blk_len);
+		}
+
+		return;
+	}
+
+	if (req->tsr_service == SRPC_SERVICE_PING) {
+		test_ping_req_t *ping = &req->tsr_u.ping;
+
+		__swab32s(&ping->png_size);
+		__swab32s(&ping->png_flags);
+		return;
+	}
+
+	LBUG ();
+	return;
+}
+
+int
+sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc)
+{
+	srpc_msg_t	  *msg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_test_reqst_t   *req = &msg->msg_body.tes_reqst;
+	srpc_bulk_t	 *bk = rpc->srpc_bulk;
+	int		  ndest = req->tsr_ndest;
+	sfw_test_unit_t     *tsu;
+	sfw_test_instance_t *tsi;
+	int		  i;
+	int		  rc;
+
+	LIBCFS_ALLOC(tsi, sizeof(*tsi));
+	if (tsi == NULL) {
+		CERROR ("Can't allocate test instance for batch: "LPU64"\n",
+			tsb->bat_id.bat_id);
+		return -ENOMEM;
+	}
+
+	memset(tsi, 0, sizeof(*tsi));
+	spin_lock_init(&tsi->tsi_lock);
+	atomic_set(&tsi->tsi_nactive, 0);
+	INIT_LIST_HEAD(&tsi->tsi_units);
+	INIT_LIST_HEAD(&tsi->tsi_free_rpcs);
+	INIT_LIST_HEAD(&tsi->tsi_active_rpcs);
+
+	tsi->tsi_stopping      = 0;
+	tsi->tsi_batch	 = tsb;
+	tsi->tsi_loop	  = req->tsr_loop;
+	tsi->tsi_concur	= req->tsr_concur;
+	tsi->tsi_service       = req->tsr_service;
+	tsi->tsi_is_client     = !!(req->tsr_is_client);
+	tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr);
+
+	rc = sfw_load_test(tsi);
+	if (rc != 0) {
+		LIBCFS_FREE(tsi, sizeof(*tsi));
+		return rc;
+	}
+
+	LASSERT (!sfw_batch_active(tsb));
+
+	if (!tsi->tsi_is_client) {
+		/* it's test server, just add it to tsb */
+		list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+		return 0;
+	}
+
+	LASSERT (bk != NULL);
+	LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest);
+	LASSERT((unsigned int)bk->bk_len >=
+		sizeof(lnet_process_id_packed_t) * ndest);
+
+	sfw_unpack_addtest_req(msg);
+	memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u));
+
+	for (i = 0; i < ndest; i++) {
+		lnet_process_id_packed_t *dests;
+		lnet_process_id_packed_t  id;
+		int		       j;
+
+		dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page);
+		LASSERT (dests != NULL);  /* my pages are within KVM always */
+		id = dests[i % SFW_ID_PER_PAGE];
+		if (msg->msg_magic != SRPC_MSG_MAGIC)
+			sfw_unpack_id(id);
+
+		for (j = 0; j < tsi->tsi_concur; j++) {
+			LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t));
+			if (tsu == NULL) {
+				rc = -ENOMEM;
+				CERROR ("Can't allocate tsu for %d\n",
+					tsi->tsi_service);
+				goto error;
+			}
+
+			tsu->tsu_dest.nid = id.nid;
+			tsu->tsu_dest.pid = id.pid;
+			tsu->tsu_instance = tsi;
+			tsu->tsu_private  = NULL;
+			list_add_tail(&tsu->tsu_list, &tsi->tsi_units);
+		}
+	}
+
+	rc = tsi->tsi_ops->tso_init(tsi);
+	if (rc == 0) {
+		list_add_tail(&tsi->tsi_list, &tsb->bat_tests);
+		return 0;
+	}
+
+error:
+	LASSERT (rc != 0);
+	sfw_destroy_test_instance(tsi);
+	return rc;
+}
+
+static void
+sfw_test_unit_done (sfw_test_unit_t *tsu)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_batch_t	 *tsb = tsi->tsi_batch;
+	sfw_session_t       *sn = tsb->bat_session;
+
+	LASSERT (sfw_test_active(tsi));
+
+	if (!atomic_dec_and_test(&tsi->tsi_nactive))
+		return;
+
+	/* the test instance is done */
+	spin_lock(&tsi->tsi_lock);
+
+	tsi->tsi_stopping = 0;
+
+	spin_unlock(&tsi->tsi_lock);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */
+	    sn == sfw_data.fw_session) {		  /* sn also active */
+		spin_unlock(&sfw_data.fw_lock);
+		return;
+	}
+
+	LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */
+
+	list_for_each_entry (tsb, &sn->sn_batches, bat_list) {
+		if (sfw_batch_active(tsb)) {
+			spin_unlock(&sfw_data.fw_lock);
+			return;
+		}
+	}
+
+	list_del_init(&sn->sn_list);
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_destroy_session(sn);
+	return;
+}
+
+void
+sfw_test_rpc_done (srpc_client_rpc_t *rpc)
+{
+	sfw_test_unit_t     *tsu = rpc->crpc_priv;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	int		  done = 0;
+
+	tsi->tsi_ops->tso_done_rpc(tsu, rpc);
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+	LASSERT (!list_empty(&rpc->crpc_list));
+
+	list_del_init(&rpc->crpc_list);
+
+	/* batch is stopping or loop is done or get error */
+	if (tsi->tsi_stopping ||
+	    tsu->tsu_loop == 0 ||
+	    (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr))
+		done = 1;
+
+	/* dec ref for poster */
+	srpc_client_rpc_decref(rpc);
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (!done) {
+		swi_schedule_workitem(&tsu->tsu_worker);
+		return;
+	}
+
+	sfw_test_unit_done(tsu);
+	return;
+}
+
+int
+sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer,
+		    unsigned features, int nblk, int blklen,
+		    srpc_client_rpc_t **rpcpp)
+{
+	srpc_client_rpc_t   *rpc = NULL;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+
+	spin_lock(&tsi->tsi_lock);
+
+	LASSERT (sfw_test_active(tsi));
+
+	if (!list_empty(&tsi->tsi_free_rpcs)) {
+		/* pick request from buffer */
+		rpc = list_entry(tsi->tsi_free_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		LASSERT (nblk == rpc->crpc_bulk.bk_niov);
+		list_del_init(&rpc->crpc_list);
+	}
+
+	spin_unlock(&tsi->tsi_lock);
+
+	if (rpc == NULL) {
+		rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk,
+					     blklen, sfw_test_rpc_done,
+					     sfw_test_rpc_fini, tsu);
+	} else {
+		srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk,
+				     blklen, sfw_test_rpc_done,
+				     sfw_test_rpc_fini, tsu);
+	}
+
+	if (rpc == NULL) {
+		CERROR("Can't create rpc for test %d\n", tsi->tsi_service);
+		return -ENOMEM;
+	}
+
+	rpc->crpc_reqstmsg.msg_ses_feats = features;
+	*rpcpp = rpc;
+
+	return 0;
+}
+
+int
+sfw_run_test (swi_workitem_t *wi)
+{
+	sfw_test_unit_t     *tsu = wi->swi_workitem.wi_data;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	srpc_client_rpc_t   *rpc = NULL;
+
+	LASSERT (wi == &tsu->tsu_worker);
+
+	if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) {
+		LASSERT (rpc == NULL);
+		goto test_done;
+	}
+
+	LASSERT (rpc != NULL);
+
+	spin_lock(&tsi->tsi_lock);
+
+	if (tsi->tsi_stopping) {
+		list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs);
+		spin_unlock(&tsi->tsi_lock);
+		goto test_done;
+	}
+
+	if (tsu->tsu_loop > 0)
+		tsu->tsu_loop--;
+
+	list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs);
+	spin_unlock(&tsi->tsi_lock);
+
+	rpc->crpc_timeout = rpc_timeout;
+
+	spin_lock(&rpc->crpc_lock);
+	srpc_post_rpc(rpc);
+	spin_unlock(&rpc->crpc_lock);
+	return 0;
+
+test_done:
+	/*
+	 * No one can schedule me now since:
+	 * - previous RPC, if any, has done and
+	 * - no new RPC is initiated.
+	 * - my batch is still active; no one can run it again now.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	swi_exit_workitem(wi);
+	sfw_test_unit_done(tsu);
+	return 1;
+}
+
+int
+sfw_run_batch (sfw_batch_t *tsb)
+{
+	swi_workitem_t      *wi;
+	sfw_test_unit_t     *tsu;
+	sfw_test_instance_t *tsi;
+
+	if (sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n",
+		       tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive));
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		if (!tsi->tsi_is_client) /* skip server instances */
+			continue;
+
+		LASSERT (!tsi->tsi_stopping);
+		LASSERT (!sfw_test_active(tsi));
+
+		atomic_inc(&tsb->bat_nactive);
+
+		list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) {
+			atomic_inc(&tsi->tsi_nactive);
+			tsu->tsu_loop = tsi->tsi_loop;
+			wi = &tsu->tsu_worker;
+			swi_init_workitem(wi, tsu, sfw_run_test,
+					  lst_sched_test[\
+					  lnet_cpt_of_nid(tsu->tsu_dest.nid)]);
+			swi_schedule_workitem(wi);
+		}
+	}
+
+	return 0;
+}
+
+int
+sfw_stop_batch (sfw_batch_t *tsb, int force)
+{
+	sfw_test_instance_t *tsi;
+	srpc_client_rpc_t   *rpc;
+
+	if (!sfw_batch_active(tsb)) {
+		CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		spin_lock(&tsi->tsi_lock);
+
+		if (!tsi->tsi_is_client ||
+		    !sfw_test_active(tsi) || tsi->tsi_stopping) {
+			spin_unlock(&tsi->tsi_lock);
+			continue;
+		}
+
+		tsi->tsi_stopping = 1;
+
+		if (!force) {
+			spin_unlock(&tsi->tsi_lock);
+			continue;
+		}
+
+		/* abort launched rpcs in the test */
+		list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) {
+			spin_lock(&rpc->crpc_lock);
+
+			srpc_abort_rpc(rpc, -EINTR);
+
+			spin_unlock(&rpc->crpc_lock);
+		}
+
+		spin_unlock(&tsi->tsi_lock);
+	}
+
+	return 0;
+}
+
+int
+sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply)
+{
+	sfw_test_instance_t *tsi;
+
+	if (testidx < 0)
+		return -EINVAL;
+
+	if (testidx == 0) {
+		reply->bar_active = atomic_read(&tsb->bat_nactive);
+		return 0;
+	}
+
+	list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) {
+		if (testidx-- > 1)
+			continue;
+
+		reply->bar_active = atomic_read(&tsi->tsi_nactive);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+void
+sfw_free_pages (srpc_server_rpc_t *rpc)
+{
+	srpc_free_bulk(rpc->srpc_bulk);
+	rpc->srpc_bulk = NULL;
+}
+
+int
+sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len,
+		int sink)
+{
+	LASSERT(rpc->srpc_bulk == NULL);
+	LASSERT(npages > 0 && npages <= LNET_MAX_IOV);
+
+	rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink);
+	if (rpc->srpc_bulk == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int
+sfw_add_test (srpc_server_rpc_t *rpc)
+{
+	sfw_session_t     *sn = sfw_data.fw_session;
+	srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply;
+	srpc_test_reqst_t *request;
+	int		rc;
+	sfw_batch_t       *bat;
+
+	request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst;
+	reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	if (request->tsr_loop == 0 ||
+	    request->tsr_concur == 0 ||
+	    request->tsr_sid.ses_nid == LNET_NID_ANY ||
+	    request->tsr_ndest > SFW_MAX_NDESTS ||
+	    (request->tsr_is_client && request->tsr_ndest == 0) ||
+	    request->tsr_concur > SFW_MAX_CONCUR ||
+	    request->tsr_service > SRPC_SERVICE_MAX_ID ||
+	    request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) {
+		reply->tsr_status = EINVAL;
+		return 0;
+	}
+
+	if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) ||
+	    sfw_find_test_case(request->tsr_service) == NULL) {
+		reply->tsr_status = ENOENT;
+		return 0;
+	}
+
+	bat = sfw_bid2batch(request->tsr_bid);
+	if (bat == NULL) {
+		CERROR ("Dropping RPC (%s) from %s under memory pressure.\n",
+			rpc->srpc_scd->scd_svc->sv_name,
+			libcfs_id2str(rpc->srpc_peer));
+		return -ENOMEM;
+	}
+
+	if (sfw_batch_active(bat)) {
+		reply->tsr_status = EBUSY;
+		return 0;
+	}
+
+	if (request->tsr_is_client && rpc->srpc_bulk == NULL) {
+		/* rpc will be resumed later in sfw_bulk_ready */
+		int	npg = sfw_id_pages(request->tsr_ndest);
+		int	len;
+
+		if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) {
+			len = npg * PAGE_CACHE_SIZE;
+
+		} else  {
+			len = sizeof(lnet_process_id_packed_t) *
+			      request->tsr_ndest;
+		}
+
+		return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1);
+	}
+
+	rc = sfw_add_test_instance(bat, rpc);
+	CDEBUG (rc == 0 ? D_NET : D_WARNING,
+		"%s test: sv %d %s, loop %d, concur %d, ndest %d\n",
+		rc == 0 ? "Added" : "Failed to add", request->tsr_service,
+		request->tsr_is_client ? "client" : "server",
+		request->tsr_loop, request->tsr_concur, request->tsr_ndest);
+
+	reply->tsr_status = (rc < 0) ? -rc : rc;
+	return 0;
+}
+
+int
+sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply)
+{
+	sfw_session_t *sn = sfw_data.fw_session;
+	int	    rc = 0;
+	sfw_batch_t   *bat;
+
+	reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id;
+
+	if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) {
+		reply->bar_status = ESRCH;
+		return 0;
+	}
+
+	bat = sfw_find_batch(request->bar_bid);
+	if (bat == NULL) {
+		reply->bar_status = ENOENT;
+		return 0;
+	}
+
+	switch (request->bar_opc) {
+	case SRPC_BATCH_OPC_RUN:
+		rc = sfw_run_batch(bat);
+		break;
+
+	case SRPC_BATCH_OPC_STOP:
+		rc = sfw_stop_batch(bat, request->bar_arg);
+		break;
+
+	case SRPC_BATCH_OPC_QUERY:
+		rc = sfw_query_batch(bat, request->bar_testidx, reply);
+		break;
+
+	default:
+		return -EINVAL; /* drop it */
+	}
+
+	reply->bar_status = (rc < 0) ? -rc : rc;
+	return 0;
+}
+
+int
+sfw_handle_server_rpc(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	srpc_msg_t     *reply	= &rpc->srpc_replymsg;
+	srpc_msg_t     *request	= &rpc->srpc_reqstbuf->buf_msg;
+	unsigned	features = LST_FEATS_MASK;
+	int		rc = 0;
+
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	/* Remove timer to avoid racing with it or expiring active session */
+	if (sfw_del_session_timer() != 0) {
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer.",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	sfw_unpack_message(request);
+	LASSERT(request->msg_type == srpc_service2request(sv->sv_id));
+
+	/* rpc module should have checked this */
+	LASSERT(request->msg_version == SRPC_MSG_VERSION);
+
+	if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION &&
+	    sv->sv_id != SRPC_SERVICE_DEBUG) {
+		sfw_session_t *sn = sfw_data.fw_session;
+
+		if (sn != NULL &&
+		    sn->sn_features != request->msg_ses_feats) {
+			CNETERR("Features of framework RPC don't match "
+				"features of current session: %x/%x\n",
+				request->msg_ses_feats, sn->sn_features);
+			reply->msg_body.reply.status = EPROTO;
+			reply->msg_body.reply.sid    = sn->sn_id;
+			goto out;
+		}
+
+	} else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		/* NB: at this point, old version will ignore features and
+		 * create new session anyway, so console should be able
+		 * to handle this */
+		reply->msg_body.reply.status = EPROTO;
+		goto out;
+	}
+
+	switch(sv->sv_id) {
+	default:
+		LBUG ();
+	case SRPC_SERVICE_TEST:
+		rc = sfw_add_test(rpc);
+		break;
+
+	case SRPC_SERVICE_BATCH:
+		rc = sfw_control_batch(&request->msg_body.bat_reqst,
+				       &reply->msg_body.bat_reply);
+		break;
+
+	case SRPC_SERVICE_QUERY_STAT:
+		rc = sfw_get_stats(&request->msg_body.stat_reqst,
+				   &reply->msg_body.stat_reply);
+		break;
+
+	case SRPC_SERVICE_DEBUG:
+		rc = sfw_debug_session(&request->msg_body.dbg_reqst,
+				       &reply->msg_body.dbg_reply);
+		break;
+
+	case SRPC_SERVICE_MAKE_SESSION:
+		rc = sfw_make_session(&request->msg_body.mksn_reqst,
+				      &reply->msg_body.mksn_reply);
+		break;
+
+	case SRPC_SERVICE_REMOVE_SESSION:
+		rc = sfw_remove_session(&request->msg_body.rmsn_reqst,
+					&reply->msg_body.rmsn_reply);
+		break;
+	}
+
+	if (sfw_data.fw_session != NULL)
+		features = sfw_data.fw_session->sn_features;
+ out:
+	reply->msg_ses_feats = features;
+	rpc->srpc_done = sfw_server_rpc_done;
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+int
+sfw_bulk_ready(struct srpc_server_rpc *rpc, int status)
+{
+	struct srpc_service	*sv = rpc->srpc_scd->scd_svc;
+	int			rc;
+
+	LASSERT(rpc->srpc_bulk != NULL);
+	LASSERT(sv->sv_id == SRPC_SERVICE_TEST);
+	LASSERT(sfw_data.fw_active_srpc == NULL);
+	LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (status != 0) {
+		CERROR("Bulk transfer failed for RPC: "
+		       "service %s, peer %s, status %d\n",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer), status);
+		spin_unlock(&sfw_data.fw_lock);
+		return -EIO;
+	}
+
+	if (sfw_data.fw_shuttingdown) {
+		spin_unlock(&sfw_data.fw_lock);
+		return -ESHUTDOWN;
+	}
+
+	if (sfw_del_session_timer() != 0) {
+		CERROR("Dropping RPC (%s) from %s: racing with expiry timer",
+		       sv->sv_name, libcfs_id2str(rpc->srpc_peer));
+		spin_unlock(&sfw_data.fw_lock);
+		return -EAGAIN;
+	}
+
+	sfw_data.fw_active_srpc = rpc;
+	spin_unlock(&sfw_data.fw_lock);
+
+	rc = sfw_add_test(rpc);
+
+	spin_lock(&sfw_data.fw_lock);
+
+	if (!sfw_data.fw_shuttingdown)
+		sfw_add_session_timer();
+
+	sfw_data.fw_active_srpc = NULL;
+	spin_unlock(&sfw_data.fw_lock);
+	return rc;
+}
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_client_rpc_t *rpc = NULL;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	LASSERT (!sfw_data.fw_shuttingdown);
+	LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) {
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+
+		srpc_init_client_rpc(rpc, peer, service, 0, 0,
+				     done, sfw_client_rpc_fini, priv);
+	}
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	if (rpc == NULL) {
+		rpc = srpc_create_client_rpc(peer, service,
+					     nbulkiov, bulklen, done,
+					     nbulkiov != 0 ?  NULL :
+					     sfw_client_rpc_fini,
+					     priv);
+	}
+
+	if (rpc != NULL) /* "session" is concept in framework */
+		rpc->crpc_reqstmsg.msg_ses_feats = features;
+
+	return rpc;
+}
+
+void
+sfw_unpack_message (srpc_msg_t *msg)
+{
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* no flipping needed */
+
+	/* srpc module should guarantee I wouldn't get crap */
+	LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+	if (msg->msg_type == SRPC_MSG_STAT_REQST) {
+		srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst;
+
+		__swab32s(&req->str_type);
+		__swab64s(&req->str_rpyid);
+		sfw_unpack_sid(req->str_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_STAT_REPLY) {
+		srpc_stat_reply_t *rep = &msg->msg_body.stat_reply;
+
+		__swab32s(&rep->str_status);
+		sfw_unpack_sid(rep->str_sid);
+		sfw_unpack_fw_counters(rep->str_fw);
+		sfw_unpack_rpc_counters(rep->str_rpc);
+		sfw_unpack_lnet_counters(rep->str_lnet);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_MKSN_REQST) {
+		srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst;
+
+		__swab64s(&req->mksn_rpyid);
+		__swab32s(&req->mksn_force);
+		sfw_unpack_sid(req->mksn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_MKSN_REPLY) {
+		srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply;
+
+		__swab32s(&rep->mksn_status);
+		__swab32s(&rep->mksn_timeout);
+		sfw_unpack_sid(rep->mksn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_RMSN_REQST) {
+		srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst;
+
+		__swab64s(&req->rmsn_rpyid);
+		sfw_unpack_sid(req->rmsn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_RMSN_REPLY) {
+		srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply;
+
+		__swab32s(&rep->rmsn_status);
+		sfw_unpack_sid(rep->rmsn_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_DEBUG_REQST) {
+		srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst;
+
+		__swab64s(&req->dbg_rpyid);
+		__swab32s(&req->dbg_flags);
+		sfw_unpack_sid(req->dbg_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) {
+		srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply;
+
+		__swab32s(&rep->dbg_nbatch);
+		__swab32s(&rep->dbg_timeout);
+		sfw_unpack_sid(rep->dbg_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_BATCH_REQST) {
+		srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst;
+
+		__swab32s(&req->bar_opc);
+		__swab64s(&req->bar_rpyid);
+		__swab32s(&req->bar_testidx);
+		__swab32s(&req->bar_arg);
+		sfw_unpack_sid(req->bar_sid);
+		__swab64s(&req->bar_bid.bat_id);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_BATCH_REPLY) {
+		srpc_batch_reply_t *rep = &msg->msg_body.bat_reply;
+
+		__swab32s(&rep->bar_status);
+		sfw_unpack_sid(rep->bar_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_TEST_REQST) {
+		srpc_test_reqst_t *req = &msg->msg_body.tes_reqst;
+
+		__swab64s(&req->tsr_rpyid);
+		__swab64s(&req->tsr_bulkid);
+		__swab32s(&req->tsr_loop);
+		__swab32s(&req->tsr_ndest);
+		__swab32s(&req->tsr_concur);
+		__swab32s(&req->tsr_service);
+		sfw_unpack_sid(req->tsr_sid);
+		__swab64s(&req->tsr_bid.bat_id);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_TEST_REPLY) {
+		srpc_test_reply_t *rep = &msg->msg_body.tes_reply;
+
+		__swab32s(&rep->tsr_status);
+		sfw_unpack_sid(rep->tsr_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_JOIN_REQST) {
+		srpc_join_reqst_t *req = &msg->msg_body.join_reqst;
+
+		__swab64s(&req->join_rpyid);
+		sfw_unpack_sid(req->join_sid);
+		return;
+	}
+
+	if (msg->msg_type == SRPC_MSG_JOIN_REPLY) {
+		srpc_join_reply_t *rep = &msg->msg_body.join_reply;
+
+		__swab32s(&rep->join_status);
+		__swab32s(&rep->join_timeout);
+		sfw_unpack_sid(rep->join_sid);
+		return;
+	}
+
+	LBUG ();
+	return;
+}
+
+void
+sfw_abort_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT(atomic_read(&rpc->crpc_refcount) > 0);
+	LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID);
+
+	spin_lock(&rpc->crpc_lock);
+	srpc_abort_rpc(rpc, -EINTR);
+	spin_unlock(&rpc->crpc_lock);
+	return;
+}
+
+void
+sfw_post_rpc (srpc_client_rpc_t *rpc)
+{
+	spin_lock(&rpc->crpc_lock);
+
+	LASSERT (!rpc->crpc_closed);
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (list_empty(&rpc->crpc_list));
+	LASSERT (!sfw_data.fw_shuttingdown);
+
+	rpc->crpc_timeout = rpc_timeout;
+	srpc_post_rpc(rpc);
+
+	spin_unlock(&rpc->crpc_lock);
+	return;
+}
+
+static srpc_service_t sfw_services[] =
+{
+	{
+		/* sv_id */    SRPC_SERVICE_DEBUG,
+		/* sv_name */  "debug",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_QUERY_STAT,
+		/* sv_name */  "query stats",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_MAKE_SESSION,
+		/* sv_name */  "make session",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_REMOVE_SESSION,
+		/* sv_name */  "remove session",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_BATCH,
+		/* sv_name */  "batch service",
+		0
+	},
+	{
+		/* sv_id */    SRPC_SERVICE_TEST,
+		/* sv_name */  "test service",
+		0
+	},
+	{
+		/* sv_id */    0,
+		/* sv_name */  NULL,
+		0
+	}
+};
+
+extern sfw_test_client_ops_t ping_test_client;
+extern srpc_service_t	ping_test_service;
+extern void ping_init_test_client(void);
+extern void ping_init_test_service(void);
+
+extern sfw_test_client_ops_t brw_test_client;
+extern srpc_service_t	brw_test_service;
+extern void brw_init_test_client(void);
+extern void brw_init_test_service(void);
+
+
+int
+sfw_startup (void)
+{
+	int	      i;
+	int	      rc;
+	int	      error;
+	srpc_service_t  *sv;
+	sfw_test_case_t *tsc;
+
+
+	if (session_timeout < 0) {
+		CERROR ("Session timeout must be non-negative: %d\n",
+			session_timeout);
+		return -EINVAL;
+	}
+
+	if (rpc_timeout < 0) {
+		CERROR ("RPC timeout must be non-negative: %d\n",
+			rpc_timeout);
+		return -EINVAL;
+	}
+
+	if (session_timeout == 0)
+		CWARN ("Zero session_timeout specified "
+		       "- test sessions never expire.\n");
+
+	if (rpc_timeout == 0)
+		CWARN ("Zero rpc_timeout specified "
+		       "- test RPC never expire.\n");
+
+	memset(&sfw_data, 0, sizeof(struct smoketest_framework));
+
+	sfw_data.fw_session     = NULL;
+	sfw_data.fw_active_srpc = NULL;
+	spin_lock_init(&sfw_data.fw_lock);
+	atomic_set(&sfw_data.fw_nzombies, 0);
+	INIT_LIST_HEAD(&sfw_data.fw_tests);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs);
+	INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions);
+
+	brw_init_test_client();
+	brw_init_test_service();
+	rc = sfw_register_test(&brw_test_service, &brw_test_client);
+	LASSERT (rc == 0);
+
+	ping_init_test_client();
+	ping_init_test_service();
+	rc = sfw_register_test(&ping_test_service, &ping_test_client);
+	LASSERT (rc == 0);
+
+	error = 0;
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		sv = tsc->tsc_srv_service;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+	}
+
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL) break;
+
+		sv->sv_bulk_ready = NULL;
+		sv->sv_handler    = sfw_handle_server_rpc;
+		sv->sv_wi_total   = SFW_FRWK_WI_MAX;
+		if (sv->sv_id == SRPC_SERVICE_TEST)
+			sv->sv_bulk_ready = sfw_bulk_ready;
+
+		rc = srpc_add_service(sv);
+		LASSERT (rc != -EBUSY);
+		if (rc != 0) {
+			CWARN ("Failed to add %s service: %d\n",
+			       sv->sv_name, rc);
+			error = rc;
+		}
+
+		/* about to sfw_shutdown, no need to add buffer */
+		if (error) continue;
+
+		rc = srpc_service_add_buffers(sv, sv->sv_wi_total);
+		if (rc != 0) {
+			CWARN("Failed to reserve enough buffers: "
+			      "service %s, %d needed: %d\n",
+			      sv->sv_name, sv->sv_wi_total, rc);
+			error = -ENOMEM;
+		}
+	}
+
+	if (error != 0)
+		sfw_shutdown();
+	return error;
+}
+
+void
+sfw_shutdown (void)
+{
+	srpc_service_t	*sv;
+	sfw_test_case_t	*tsc;
+	int		 i;
+
+	spin_lock(&sfw_data.fw_lock);
+
+	sfw_data.fw_shuttingdown = 1;
+	lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock,
+		       "waiting for active RPC to finish.\n");
+
+	if (sfw_del_session_timer() != 0)
+		lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock,
+			       "waiting for session timer to explode.\n");
+
+	sfw_deactivate_session();
+	lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0,
+		       sfw_data.fw_lock,
+		       "waiting for %d zombie sessions to die.\n",
+		       atomic_read(&sfw_data.fw_nzombies));
+
+	spin_unlock(&sfw_data.fw_lock);
+
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) {
+		sv = tsc->tsc_srv_service;
+		srpc_shutdown_service(sv);
+		srpc_remove_service(sv);
+	}
+
+	while (!list_empty(&sfw_data.fw_zombie_rpcs)) {
+		srpc_client_rpc_t *rpc;
+
+		rpc = list_entry(sfw_data.fw_zombie_rpcs.next,
+				     srpc_client_rpc_t, crpc_list);
+		list_del(&rpc->crpc_list);
+
+		LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+	}
+
+	for (i = 0; ; i++) {
+		sv = &sfw_services[i];
+		if (sv->sv_name == NULL)
+			break;
+
+		srpc_wait_service_shutdown(sv);
+	}
+
+	while (!list_empty(&sfw_data.fw_tests)) {
+		tsc = list_entry(sfw_data.fw_tests.next,
+				     sfw_test_case_t, tsc_list);
+
+		srpc_wait_service_shutdown(tsc->tsc_srv_service);
+
+		list_del(&tsc->tsc_list);
+		LIBCFS_FREE(tsc, sizeof(*tsc));
+	}
+
+	return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c
new file mode 100644
index 000000000000..5257e5630a0e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/module.c
@@ -0,0 +1,169 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+enum {
+	LST_INIT_NONE		= 0,
+	LST_INIT_WI_SERIAL,
+	LST_INIT_WI_TEST,
+	LST_INIT_RPC,
+	LST_INIT_FW,
+	LST_INIT_CONSOLE
+};
+
+extern int lstcon_console_init(void);
+extern int lstcon_console_fini(void);
+
+static int lst_init_step = LST_INIT_NONE;
+
+struct cfs_wi_sched *lst_sched_serial;
+struct cfs_wi_sched **lst_sched_test;
+
+void
+lnet_selftest_fini(void)
+{
+	int	i;
+
+	switch (lst_init_step) {
+		case LST_INIT_CONSOLE:
+			lstcon_console_fini();
+		case LST_INIT_FW:
+			sfw_shutdown();
+		case LST_INIT_RPC:
+			srpc_shutdown();
+		case LST_INIT_WI_TEST:
+			for (i = 0;
+			     i < cfs_cpt_number(lnet_cpt_table()); i++) {
+				if (lst_sched_test[i] == NULL)
+					continue;
+				cfs_wi_sched_destroy(lst_sched_test[i]);
+			}
+			LIBCFS_FREE(lst_sched_test,
+				    sizeof(lst_sched_test[0]) *
+				    cfs_cpt_number(lnet_cpt_table()));
+			lst_sched_test = NULL;
+
+		case LST_INIT_WI_SERIAL:
+			cfs_wi_sched_destroy(lst_sched_serial);
+			lst_sched_serial = NULL;
+		case LST_INIT_NONE:
+			break;
+		default:
+			LBUG();
+	}
+	return;
+}
+
+void
+lnet_selftest_structure_assertion(void)
+{
+	CLASSERT(sizeof(srpc_msg_t) == 160);
+	CLASSERT(sizeof(srpc_test_reqst_t) == 70);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72);
+	CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78);
+	CLASSERT(sizeof(srpc_stat_reply_t) == 136);
+	CLASSERT(sizeof(srpc_stat_reqst_t) == 28);
+}
+
+int
+lnet_selftest_init(void)
+{
+	int	nscheds;
+	int	rc;
+	int	i;
+
+	rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY,
+				 1, &lst_sched_serial);
+	if (rc != 0) {
+		CERROR("Failed to create serial WI scheduler for LST\n");
+		return rc;
+	}
+	lst_init_step = LST_INIT_WI_SERIAL;
+
+	nscheds = cfs_cpt_number(lnet_cpt_table());
+	LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds);
+	if (lst_sched_test == NULL)
+		goto error;
+
+	lst_init_step = LST_INIT_WI_TEST;
+	for (i = 0; i < nscheds; i++) {
+		int nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
+
+		/* reserve at least one CPU for LND */
+		nthrs = max(nthrs - 1, 1);
+		rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i,
+					 nthrs, &lst_sched_test[i]);
+		if (rc != 0) {
+			CERROR("Failed to create CPT affinity WI scheduler "
+			       "%d for LST\n", i);
+			goto error;
+		}
+	}
+
+	rc = srpc_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup rpc\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_RPC;
+
+	rc = sfw_startup();
+	if (rc != 0) {
+		CERROR("LST can't startup framework\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_FW;
+
+	rc = lstcon_console_init();
+	if (rc != 0) {
+		CERROR("LST can't startup console\n");
+		goto error;
+	}
+	lst_init_step = LST_INIT_CONSOLE;
+	return 0;
+error:
+	lnet_selftest_fini();
+	return rc;
+}
+
+
+MODULE_DESCRIPTION("LNet Selftest");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini);
diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c
new file mode 100644
index 000000000000..f0f919482b56
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/ping_test.c
@@ -0,0 +1,229 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/conctl.c
+ *
+ * Test client & Server
+ *
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ */
+
+#include "selftest.h"
+
+#define LST_PING_TEST_MAGIC     0xbabeface
+
+int ping_srv_workitems = SFW_TEST_WI_MAX;
+CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems");
+
+typedef struct {
+	spinlock_t	pnd_lock;	/* serialize */
+	int		pnd_counter;	/* sequence counter */
+} lst_ping_data_t;
+
+static lst_ping_data_t  lst_ping_data;
+
+static int
+ping_client_init(sfw_test_instance_t *tsi)
+{
+	sfw_session_t *sn = tsi->tsi_batch->bat_session;
+
+	LASSERT(tsi->tsi_is_client);
+	LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	spin_lock_init(&lst_ping_data.pnd_lock);
+	lst_ping_data.pnd_counter = 0;
+
+	return 0;
+}
+
+static void
+ping_client_fini (sfw_test_instance_t *tsi)
+{
+	sfw_session_t *sn = tsi->tsi_batch->bat_session;
+	int	    errors;
+
+	LASSERT (sn != NULL);
+	LASSERT (tsi->tsi_is_client);
+
+	errors = atomic_read(&sn->sn_ping_errors);
+	if (errors)
+		CWARN ("%d pings have failed.\n", errors);
+	else
+		CDEBUG (D_NET, "Ping test finished OK.\n");
+}
+
+static int
+ping_client_prep_rpc(sfw_test_unit_t *tsu,
+		     lnet_process_id_t dest, srpc_client_rpc_t **rpc)
+{
+	srpc_ping_reqst_t   *req;
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn  = tsi->tsi_batch->bat_session;
+	struct timeval       tv;
+	int		     rc;
+
+	LASSERT(sn != NULL);
+	LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0);
+
+	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc);
+	if (rc != 0)
+		return rc;
+
+	req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst;
+
+	req->pnr_magic = LST_PING_TEST_MAGIC;
+
+	spin_lock(&lst_ping_data.pnd_lock);
+	req->pnr_seq = lst_ping_data.pnd_counter++;
+	spin_unlock(&lst_ping_data.pnd_lock);
+
+	cfs_fs_timeval(&tv);
+	req->pnr_time_sec  = tv.tv_sec;
+	req->pnr_time_usec = tv.tv_usec;
+
+	return rc;
+}
+
+static void
+ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc)
+{
+	sfw_test_instance_t *tsi = tsu->tsu_instance;
+	sfw_session_t       *sn = tsi->tsi_batch->bat_session;
+	srpc_ping_reqst_t   *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst;
+	srpc_ping_reply_t   *reply = &rpc->crpc_replymsg.msg_body.ping_reply;
+	struct timeval       tv;
+
+	LASSERT (sn != NULL);
+
+	if (rpc->crpc_status != 0) {
+		if (!tsi->tsi_stopping) /* rpc could have been aborted */
+			atomic_inc(&sn->sn_ping_errors);
+		CERROR ("Unable to ping %s (%d): %d\n",
+			libcfs_id2str(rpc->crpc_dest),
+			reqst->pnr_seq, rpc->crpc_status);
+		return;
+	}
+
+	if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) {
+		__swab32s(&reply->pnr_seq);
+		__swab32s(&reply->pnr_magic);
+		__swab32s(&reply->pnr_status);
+	}
+
+	if (reply->pnr_magic != LST_PING_TEST_MAGIC) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR ("Bad magic %u from %s, %u expected.\n",
+			reply->pnr_magic, libcfs_id2str(rpc->crpc_dest),
+			LST_PING_TEST_MAGIC);
+		return;
+	}
+
+	if (reply->pnr_seq != reqst->pnr_seq) {
+		rpc->crpc_status = -EBADMSG;
+		atomic_inc(&sn->sn_ping_errors);
+		CERROR ("Bad seq %u from %s, %u expected.\n",
+			reply->pnr_seq, libcfs_id2str(rpc->crpc_dest),
+			reqst->pnr_seq);
+		return;
+	}
+
+	cfs_fs_timeval(&tv);
+	CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq,
+		(unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000
+			   + (tv.tv_usec - reqst->pnr_time_usec)));
+	return;
+}
+
+static int
+ping_server_handle(struct srpc_server_rpc *rpc)
+{
+	struct srpc_service	*sv  = rpc->srpc_scd->scd_svc;
+	srpc_msg_t	*reqstmsg = &rpc->srpc_reqstbuf->buf_msg;
+	srpc_msg_t	  *replymsg = &rpc->srpc_replymsg;
+	srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst;
+	srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply;
+
+	LASSERT (sv->sv_id == SRPC_SERVICE_PING);
+
+	if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) {
+		LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC));
+
+		__swab32s(&req->pnr_seq);
+		__swab32s(&req->pnr_magic);
+		__swab64s(&req->pnr_time_sec);
+		__swab64s(&req->pnr_time_usec);
+	}
+	LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id));
+
+	if (req->pnr_magic != LST_PING_TEST_MAGIC) {
+		CERROR ("Unexpect magic %08x from %s\n",
+			req->pnr_magic, libcfs_id2str(rpc->srpc_peer));
+		return -EINVAL;
+	}
+
+	rep->pnr_seq   = req->pnr_seq;
+	rep->pnr_magic = LST_PING_TEST_MAGIC;
+
+	if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) {
+		replymsg->msg_ses_feats = LST_FEATS_MASK;
+		rep->pnr_status = EPROTO;
+		return 0;
+	}
+
+	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
+
+	CDEBUG(D_NET, "Get ping %d from %s\n",
+	       req->pnr_seq, libcfs_id2str(rpc->srpc_peer));
+	return 0;
+}
+
+sfw_test_client_ops_t ping_test_client;
+void ping_init_test_client(void)
+{
+	ping_test_client.tso_init     = ping_client_init;
+	ping_test_client.tso_fini     = ping_client_fini;
+	ping_test_client.tso_prep_rpc = ping_client_prep_rpc;
+	ping_test_client.tso_done_rpc = ping_client_done_rpc;
+}
+
+srpc_service_t ping_test_service;
+void ping_init_test_service(void)
+{
+	ping_test_service.sv_id       = SRPC_SERVICE_PING;
+	ping_test_service.sv_name     = "ping_test";
+	ping_test_service.sv_handler  = ping_server_handle;
+	ping_test_service.sv_wi_total = ping_srv_workitems;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
new file mode 100644
index 000000000000..91d83f4b746e
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.c
@@ -0,0 +1,1665 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/rpc.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ *
+ * 2012-05-13: Liang Zhen <liang@whamcloud.com>
+ * - percpt data for service to improve smp performance
+ * - code cleanup
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+typedef enum {
+	SRPC_STATE_NONE,
+	SRPC_STATE_NI_INIT,
+	SRPC_STATE_EQ_INIT,
+	SRPC_STATE_RUNNING,
+	SRPC_STATE_STOPPING,
+} srpc_state_t;
+
+struct smoketest_rpc {
+	spinlock_t	 rpc_glock;	/* global lock */
+	srpc_service_t	*rpc_services[SRPC_SERVICE_MAX_ID + 1];
+	lnet_handle_eq_t rpc_lnet_eq;	/* _the_ LNet event queue */
+	srpc_state_t	 rpc_state;
+	srpc_counters_t	 rpc_counters;
+	__u64		 rpc_matchbits;	/* matchbits counter */
+} srpc_data;
+
+static inline int
+srpc_serv_portal(int svc_id)
+{
+	return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ?
+	       SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL;
+}
+
+/* forward ref's */
+int srpc_handle_rpc (swi_workitem_t *wi);
+
+void srpc_get_counters (srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	*cnt = srpc_data.rpc_counters;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+void srpc_set_counters (const srpc_counters_t *cnt)
+{
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters = *cnt;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+int
+srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
+{
+	nob = min(nob, (int)PAGE_CACHE_SIZE);
+
+	LASSERT(nob > 0);
+	LASSERT(i >= 0 && i < bk->bk_niov);
+
+	bk->bk_iovs[i].kiov_offset = 0;
+	bk->bk_iovs[i].kiov_page   = pg;
+	bk->bk_iovs[i].kiov_len    = nob;
+	return nob;
+}
+
+void
+srpc_free_bulk (srpc_bulk_t *bk)
+{
+	int	 i;
+	struct page *pg;
+
+	LASSERT (bk != NULL);
+
+	for (i = 0; i < bk->bk_niov; i++) {
+		pg = bk->bk_iovs[i].kiov_page;
+		if (pg == NULL) break;
+
+		__free_page(pg);
+	}
+
+	LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov]));
+	return;
+}
+
+srpc_bulk_t *
+srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink)
+{
+	srpc_bulk_t  *bk;
+	struct page  **pages;
+	int	      i;
+
+	LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV);
+
+	LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt,
+			 offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	if (bk == NULL) {
+		CERROR("Can't allocate descriptor for %d pages\n", bulk_npg);
+		return NULL;
+	}
+
+	memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg]));
+	bk->bk_sink   = sink;
+	bk->bk_len    = bulk_len;
+	bk->bk_niov   = bulk_npg;
+	UNUSED(pages);
+
+	for (i = 0; i < bulk_npg; i++) {
+		struct page *pg;
+		int	    nob;
+
+		pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_IOFS);
+		if (pg == NULL) {
+			CERROR("Can't allocate page %d of %d\n", i, bulk_npg);
+			srpc_free_bulk(bk);
+			return NULL;
+		}
+
+		nob = srpc_add_bulk_page(bk, pg, i, bulk_len);
+		bulk_len -= nob;
+	}
+
+	return bk;
+}
+
+static inline __u64
+srpc_next_id (void)
+{
+	__u64 id;
+
+	spin_lock(&srpc_data.rpc_glock);
+	id = srpc_data.rpc_matchbits++;
+	spin_unlock(&srpc_data.rpc_glock);
+	return id;
+}
+
+void
+srpc_init_server_rpc(struct srpc_server_rpc *rpc,
+		     struct srpc_service_cd *scd,
+		     struct srpc_buffer *buffer)
+{
+	memset(rpc, 0, sizeof(*rpc));
+	swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc,
+			  srpc_serv_is_framework(scd->scd_svc) ?
+			  lst_sched_serial : lst_sched_test[scd->scd_cpt]);
+
+	rpc->srpc_ev.ev_fired = 1; /* no event expected now */
+
+	rpc->srpc_scd      = scd;
+	rpc->srpc_reqstbuf = buffer;
+	rpc->srpc_peer     = buffer->buf_peer;
+	rpc->srpc_self     = buffer->buf_self;
+	LNetInvalidateHandle(&rpc->srpc_replymdh);
+}
+
+static void
+srpc_service_fini(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	struct srpc_buffer	*buf;
+	struct list_head		*q;
+	int			i;
+
+	if (svc->sv_cpt_data == NULL)
+		return;
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		while (1) {
+			if (!list_empty(&scd->scd_buf_posted))
+				q = &scd->scd_buf_posted;
+			else if (!list_empty(&scd->scd_buf_blocked))
+				q = &scd->scd_buf_blocked;
+			else
+				break;
+
+			while (!list_empty(q)) {
+				buf = list_entry(q->next,
+						     struct srpc_buffer,
+						     buf_list);
+				list_del(&buf->buf_list);
+				LIBCFS_FREE(buf, sizeof(*buf));
+			}
+		}
+
+		LASSERT(list_empty(&scd->scd_rpc_active));
+
+		while (!list_empty(&scd->scd_rpc_free)) {
+			rpc = list_entry(scd->scd_rpc_free.next,
+					     struct srpc_server_rpc,
+					     srpc_list);
+			list_del(&rpc->srpc_list);
+			LIBCFS_FREE(rpc, sizeof(*rpc));
+		}
+	}
+
+	cfs_percpt_free(svc->sv_cpt_data);
+	svc->sv_cpt_data = NULL;
+}
+
+static int
+srpc_service_nrpcs(struct srpc_service *svc)
+{
+	int nrpcs = svc->sv_wi_total / svc->sv_ncpts;
+
+	return srpc_serv_is_framework(svc) ?
+	       max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN);
+}
+
+int srpc_add_buffer(struct swi_workitem *wi);
+
+static int
+srpc_service_init(struct srpc_service *svc)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			nrpcs;
+	int			i;
+	int			j;
+
+	svc->sv_shuttingdown = 0;
+
+	svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(),
+					    sizeof(struct srpc_service_cd));
+	if (svc->sv_cpt_data == NULL)
+		return -ENOMEM;
+
+	svc->sv_ncpts = srpc_serv_is_framework(svc) ?
+			1 : cfs_cpt_number(lnet_cpt_table());
+	nrpcs = srpc_service_nrpcs(svc);
+
+	cfs_percpt_for_each(scd, i, svc->sv_cpt_data) {
+		scd->scd_cpt = i;
+		scd->scd_svc = svc;
+		spin_lock_init(&scd->scd_lock);
+		INIT_LIST_HEAD(&scd->scd_rpc_free);
+		INIT_LIST_HEAD(&scd->scd_rpc_active);
+		INIT_LIST_HEAD(&scd->scd_buf_posted);
+		INIT_LIST_HEAD(&scd->scd_buf_blocked);
+
+		scd->scd_ev.ev_data = scd;
+		scd->scd_ev.ev_type = SRPC_REQUEST_RCVD;
+
+		/* NB: don't use lst_sched_serial for adding buffer,
+		 * see details in srpc_service_add_buffers() */
+		swi_init_workitem(&scd->scd_buf_wi, scd,
+				  srpc_add_buffer, lst_sched_test[i]);
+
+		if (i != 0 && srpc_serv_is_framework(svc)) {
+			/* NB: framework service only needs srpc_service_cd for
+			 * one partition, but we allocate for all to make
+			 * it easier to implement, it will waste a little
+			 * memory but nobody should care about this */
+			continue;
+		}
+
+		for (j = 0; j < nrpcs; j++) {
+			LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(),
+					 i, sizeof(*rpc));
+			if (rpc == NULL) {
+				srpc_service_fini(svc);
+				return -ENOMEM;
+			}
+			list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+		}
+	}
+
+	return 0;
+}
+
+int
+srpc_add_service(struct srpc_service *sv)
+{
+	int id = sv->sv_id;
+
+	LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID);
+
+	if (srpc_service_init(sv) != 0)
+		return -ENOMEM;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	if (srpc_data.rpc_services[id] != NULL) {
+		spin_unlock(&srpc_data.rpc_glock);
+		goto failed;
+	}
+
+	srpc_data.rpc_services[id] = sv;
+	spin_unlock(&srpc_data.rpc_glock);
+
+	CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name);
+	return 0;
+
+ failed:
+	srpc_service_fini(sv);
+	return -EBUSY;
+}
+
+int
+srpc_remove_service (srpc_service_t *sv)
+{
+	int id = sv->sv_id;
+
+	spin_lock(&srpc_data.rpc_glock);
+
+	if (srpc_data.rpc_services[id] != sv) {
+		spin_unlock(&srpc_data.rpc_glock);
+		return -ENOENT;
+	}
+
+	srpc_data.rpc_services[id] = NULL;
+	spin_unlock(&srpc_data.rpc_glock);
+	return 0;
+}
+
+int
+srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf,
+		       int len, int options, lnet_process_id_t peer,
+		       lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	int		 rc;
+	lnet_md_t	 md;
+	lnet_handle_me_t meh;
+
+	rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK,
+			  local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh);
+	if (rc != 0) {
+		CERROR ("LNetMEAttach failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	md.threshold = 1;
+	md.user_ptr  = ev;
+	md.start     = buf;
+	md.length    = len;
+	md.options   = options;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+
+	rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
+	if (rc != 0) {
+		CERROR ("LNetMDAttach failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+
+		rc = LNetMEUnlink(meh);
+		LASSERT (rc == 0);
+		return -ENOMEM;
+	}
+
+	CDEBUG (D_NET,
+		"Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n",
+		libcfs_id2str(peer), portal, matchbits);
+	return 0;
+}
+
+int
+srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len,
+		      int options, lnet_process_id_t peer, lnet_nid_t self,
+		      lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	int       rc;
+	lnet_md_t md;
+
+	md.user_ptr  = ev;
+	md.start     = buf;
+	md.length    = len;
+	md.eq_handle = srpc_data.rpc_lnet_eq;
+	md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1;
+	md.options   = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET);
+
+	rc = LNetMDBind(md, LNET_UNLINK, mdh);
+	if (rc != 0) {
+		CERROR ("LNetMDBind failed: %d\n", rc);
+		LASSERT (rc == -ENOMEM);
+		return -ENOMEM;
+	}
+
+	/* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options.
+	 * they're only meaningful for MDs attached to an ME (i.e. passive
+	 * buffers... */
+	if ((options & LNET_MD_OP_PUT) != 0) {
+		rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer,
+			     portal, matchbits, 0, 0);
+	} else {
+		LASSERT ((options & LNET_MD_OP_GET) != 0);
+
+		rc = LNetGet(self, *mdh, peer, portal, matchbits, 0);
+	}
+
+	if (rc != 0) {
+		CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n",
+			((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get",
+			libcfs_id2str(peer), portal, matchbits, rc);
+
+		/* The forthcoming unlink event will complete this operation
+		 * with failure, so fall through and return success here.
+		 */
+		rc = LNetMDUnlink(*mdh);
+		LASSERT (rc == 0);
+	} else {
+		CDEBUG (D_NET,
+			"Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n",
+			libcfs_id2str(peer), portal, matchbits);
+	}
+	return 0;
+}
+
+int
+srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf,
+			int len, lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	return srpc_post_active_rdma(srpc_serv_portal(service), service,
+				     buf, len, LNET_MD_OP_PUT, peer,
+				     LNET_NID_ANY, mdh, ev);
+}
+
+int
+srpc_post_passive_rqtbuf(int service, int local, void *buf, int len,
+			 lnet_handle_md_t *mdh, srpc_event_t *ev)
+{
+	lnet_process_id_t any = {0};
+
+	any.nid = LNET_NID_ANY;
+	any.pid = LNET_PID_ANY;
+
+	return srpc_post_passive_rdma(srpc_serv_portal(service),
+				      local, service, buf, len,
+				      LNET_MD_OP_PUT, any, mdh, ev);
+}
+
+int
+srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf)
+{
+	struct srpc_service	*sv = scd->scd_svc;
+	struct srpc_msg		*msg = &buf->buf_msg;
+	int			rc;
+
+	LNetInvalidateHandle(&buf->buf_mdh);
+	list_add(&buf->buf_list, &scd->scd_buf_posted);
+	scd->scd_buf_nposted++;
+	spin_unlock(&scd->scd_lock);
+
+	rc = srpc_post_passive_rqtbuf(sv->sv_id,
+				      !srpc_serv_is_framework(sv),
+				      msg, sizeof(*msg), &buf->buf_mdh,
+				      &scd->scd_ev);
+
+	/* At this point, a RPC (new or delayed) may have arrived in
+	 * msg and its event handler has been called. So we must add
+	 * buf to scd_buf_posted _before_ dropping scd_lock */
+
+	spin_lock(&scd->scd_lock);
+
+	if (rc == 0) {
+		if (!sv->sv_shuttingdown)
+			return 0;
+
+		spin_unlock(&scd->scd_lock);
+		/* srpc_shutdown_service might have tried to unlink me
+		 * when my buf_mdh was still invalid */
+		LNetMDUnlink(buf->buf_mdh);
+		spin_lock(&scd->scd_lock);
+		return 0;
+	}
+
+	scd->scd_buf_nposted--;
+	if (sv->sv_shuttingdown)
+		return rc; /* don't allow to change scd_buf_posted */
+
+	list_del(&buf->buf_list);
+	spin_unlock(&scd->scd_lock);
+
+	LIBCFS_FREE(buf, sizeof(*buf));
+
+	spin_lock(&scd->scd_lock);
+	return rc;
+}
+
+int
+srpc_add_buffer(struct swi_workitem *wi)
+{
+	struct srpc_service_cd	*scd = wi->swi_workitem.wi_data;
+	struct srpc_buffer	*buf;
+	int			rc = 0;
+
+	/* it's called by workitem scheduler threads, these threads
+	 * should have been set CPT affinity, so buffers will be posted
+	 * on CPT local list of Portal */
+	spin_lock(&scd->scd_lock);
+
+	while (scd->scd_buf_adjust > 0 &&
+	       !scd->scd_svc->sv_shuttingdown) {
+		scd->scd_buf_adjust--; /* consume it */
+		scd->scd_buf_posting++;
+
+		spin_unlock(&scd->scd_lock);
+
+		LIBCFS_ALLOC(buf, sizeof(*buf));
+		if (buf == NULL) {
+			CERROR("Failed to add new buf to service: %s\n",
+			       scd->scd_svc->sv_name);
+			spin_lock(&scd->scd_lock);
+			rc = -ENOMEM;
+			break;
+		}
+
+		spin_lock(&scd->scd_lock);
+		if (scd->scd_svc->sv_shuttingdown) {
+			spin_unlock(&scd->scd_lock);
+			LIBCFS_FREE(buf, sizeof(*buf));
+
+			spin_lock(&scd->scd_lock);
+			rc = -ESHUTDOWN;
+			break;
+		}
+
+		rc = srpc_service_post_buffer(scd, buf);
+		if (rc != 0)
+			break; /* buf has been freed inside */
+
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+		scd->scd_buf_total++;
+		scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4);
+	}
+
+	if (rc != 0) {
+		scd->scd_buf_err_stamp = cfs_time_current_sec();
+		scd->scd_buf_err = rc;
+
+		LASSERT(scd->scd_buf_posting > 0);
+		scd->scd_buf_posting--;
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return 0;
+}
+
+int
+srpc_service_add_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			rc = 0;
+	int			i;
+
+	LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		scd->scd_buf_err = 0;
+		scd->scd_buf_err_stamp = 0;
+		scd->scd_buf_posting = 0;
+		scd->scd_buf_adjust = nbuffer;
+		/* start to post buffers */
+		swi_schedule_workitem(&scd->scd_buf_wi);
+		spin_unlock(&scd->scd_lock);
+
+		/* framework service only post buffer for one partition  */
+		if (srpc_serv_is_framework(sv))
+			break;
+	}
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		/*
+		 * NB: srpc_service_add_buffers() can be called inside
+		 * thread context of lst_sched_serial, and we don't normally
+		 * allow to sleep inside thread context of WI scheduler
+		 * because it will block current scheduler thread from doing
+		 * anything else, even worse, it could deadlock if it's
+		 * waiting on result from another WI of the same scheduler.
+		 * However, it's safe at here because scd_buf_wi is scheduled
+		 * by thread in a different WI scheduler (lst_sched_test),
+		 * so we don't have any risk of deadlock, though this could
+		 * block all WIs pending on lst_sched_serial for a moment
+		 * which is not good but not fatal.
+		 */
+		lst_wait_until(scd->scd_buf_err != 0 ||
+			       (scd->scd_buf_adjust == 0 &&
+				scd->scd_buf_posting == 0),
+			       scd->scd_lock, "waiting for adding buffer\n");
+
+		if (scd->scd_buf_err != 0 && rc == 0)
+			rc = scd->scd_buf_err;
+
+		spin_unlock(&scd->scd_lock);
+	}
+
+	return rc;
+}
+
+void
+srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer)
+{
+	struct srpc_service_cd	*scd;
+	int			num;
+	int			i;
+
+	LASSERT(!sv->sv_shuttingdown);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		num = scd->scd_buf_total + scd->scd_buf_posting;
+		scd->scd_buf_adjust -= min(nbuffer, num);
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+/* returns 1 if sv has finished, otherwise 0 */
+int
+srpc_finish_service(struct srpc_service *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			i;
+
+	LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+		if (!swi_deschedule_workitem(&scd->scd_buf_wi))
+			return 0;
+
+		if (scd->scd_buf_nposted > 0) {
+			CDEBUG(D_NET, "waiting for %d posted buffers to unlink",
+			       scd->scd_buf_nposted);
+			spin_unlock(&scd->scd_lock);
+			return 0;
+		}
+
+		if (list_empty(&scd->scd_rpc_active)) {
+			spin_unlock(&scd->scd_lock);
+			continue;
+		}
+
+		rpc = list_entry(scd->scd_rpc_active.next,
+				     struct srpc_server_rpc, srpc_list);
+		CNETERR("Active RPC %p on shutdown: sv %s, peer %s, "
+			"wi %s scheduled %d running %d, "
+			"ev fired %d type %d status %d lnet %d\n",
+			rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+			swi_state2str(rpc->srpc_wi.swi_state),
+			rpc->srpc_wi.swi_workitem.wi_scheduled,
+			rpc->srpc_wi.swi_workitem.wi_running,
+			rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type,
+			rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet);
+		spin_unlock(&scd->scd_lock);
+		return 0;
+	}
+
+	/* no lock needed from now on */
+	srpc_service_fini(sv);
+	return 1;
+}
+
+/* called with sv->sv_lock held */
+void
+srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf)
+{
+	if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) {
+		if (srpc_service_post_buffer(scd, buf) != 0) {
+			CWARN("Failed to post %s buffer\n",
+			      scd->scd_svc->sv_name);
+		}
+		return;
+	}
+
+	/* service is shutting down, or we want to recycle some buffers */
+	scd->scd_buf_total--;
+
+	if (scd->scd_buf_adjust < 0) {
+		scd->scd_buf_adjust++;
+		if (scd->scd_buf_adjust < 0 &&
+		    scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) {
+			CDEBUG(D_INFO,
+			       "Try to recyle %d buffers but nothing left\n",
+			       scd->scd_buf_adjust);
+			scd->scd_buf_adjust = 0;
+		}
+	}
+
+	spin_unlock(&scd->scd_lock);
+	LIBCFS_FREE(buf, sizeof(*buf));
+	spin_lock(&scd->scd_lock);
+}
+
+void
+srpc_abort_service(struct srpc_service *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	int			i;
+
+	CDEBUG(D_NET, "Aborting service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the abort, NB:
+		 * racing with incoming RPCs; complete fix should make test
+		 * RPCs carry session ID in its headers */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) {
+			rpc->srpc_aborted = 1;
+			swi_schedule_workitem(&rpc->srpc_wi);
+		}
+
+		spin_unlock(&scd->scd_lock);
+	}
+}
+
+void
+srpc_shutdown_service(srpc_service_t *sv)
+{
+	struct srpc_service_cd	*scd;
+	struct srpc_server_rpc	*rpc;
+	srpc_buffer_t		*buf;
+	int			i;
+
+	CDEBUG(D_NET, "Shutting down service: id %d, name %s\n",
+	       sv->sv_id, sv->sv_name);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_lock(&scd->scd_lock);
+
+	sv->sv_shuttingdown = 1; /* i.e. no new active RPC */
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data)
+		spin_unlock(&scd->scd_lock);
+
+	cfs_percpt_for_each(scd, i, sv->sv_cpt_data) {
+		spin_lock(&scd->scd_lock);
+
+		/* schedule in-flight RPCs to notice the shutdown */
+		list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list)
+			swi_schedule_workitem(&rpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+
+		/* OK to traverse scd_buf_posted without lock, since no one
+		 * touches scd_buf_posted now */
+		list_for_each_entry(buf, &scd->scd_buf_posted, buf_list)
+			LNetMDUnlink(buf->buf_mdh);
+	}
+}
+
+int
+srpc_send_request (srpc_client_rpc_t *rpc)
+{
+	srpc_event_t *ev = &rpc->crpc_reqstev;
+	int	   rc;
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = SRPC_REQUEST_SENT;
+
+	rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service,
+				     &rpc->crpc_reqstmsg, sizeof(srpc_msg_t),
+				     &rpc->crpc_reqstmdh, ev);
+	if (rc != 0) {
+		LASSERT (rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+int
+srpc_prepare_reply (srpc_client_rpc_t *rpc)
+{
+	srpc_event_t *ev = &rpc->crpc_replyev;
+	__u64	*id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid;
+	int	   rc;
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = SRPC_REPLY_RCVD;
+
+	*id = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+				    &rpc->crpc_replymsg, sizeof(srpc_msg_t),
+				    LNET_MD_OP_PUT, rpc->crpc_dest,
+				    &rpc->crpc_replymdh, ev);
+	if (rc != 0) {
+		LASSERT (rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+int
+srpc_prepare_bulk (srpc_client_rpc_t *rpc)
+{
+	srpc_bulk_t  *bk = &rpc->crpc_bulk;
+	srpc_event_t *ev = &rpc->crpc_bulkev;
+	__u64	*id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid;
+	int	   rc;
+	int	   opt;
+
+	LASSERT (bk->bk_niov <= LNET_MAX_IOV);
+
+	if (bk->bk_niov == 0) return 0; /* nothing to do */
+
+	opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET;
+	opt |= LNET_MD_KIOV;
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = SRPC_BULK_REQ_RCVD;
+
+	*id = srpc_next_id();
+
+	rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id,
+				    &bk->bk_iovs[0], bk->bk_niov, opt,
+				    rpc->crpc_dest, &bk->bk_mdh, ev);
+	if (rc != 0) {
+		LASSERT (rc == -ENOMEM);
+		ev->ev_fired = 1;  /* no more event expected */
+	}
+	return rc;
+}
+
+int
+srpc_do_bulk (srpc_server_rpc_t *rpc)
+{
+	srpc_event_t  *ev = &rpc->srpc_ev;
+	srpc_bulk_t   *bk = rpc->srpc_bulk;
+	__u64	  id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid;
+	int	    rc;
+	int	    opt;
+
+	LASSERT (bk != NULL);
+
+	opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT;
+	opt |= LNET_MD_KIOV;
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT;
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id,
+				   &bk->bk_iovs[0], bk->bk_niov, opt,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &bk->bk_mdh, ev);
+	if (rc != 0)
+		ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/* only called from srpc_handle_rpc */
+void
+srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status)
+{
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv  = scd->scd_svc;
+	srpc_buffer_t		*buffer;
+
+	LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE);
+
+	rpc->srpc_status = status;
+
+	CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR,
+		"Server RPC %p done: service %s, peer %s, status %s:%d\n",
+		rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer),
+		swi_state2str(rpc->srpc_wi.swi_state), status);
+
+	if (status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_dropped++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	if (rpc->srpc_done != NULL)
+		(*rpc->srpc_done) (rpc);
+	LASSERT(rpc->srpc_bulk == NULL);
+
+	spin_lock(&scd->scd_lock);
+
+	if (rpc->srpc_reqstbuf != NULL) {
+		/* NB might drop sv_lock in srpc_service_recycle_buffer, but
+		 * sv won't go away for scd_rpc_active must not be empty */
+		srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */
+
+	/*
+	 * No one can schedule me now since:
+	 * - I'm not on scd_rpc_active.
+	 * - all LNet events have been fired.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	LASSERT(rpc->srpc_ev.ev_fired);
+	swi_exit_workitem(&rpc->srpc_wi);
+
+	if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) {
+		buffer = list_entry(scd->scd_buf_blocked.next,
+					srpc_buffer_t, buf_list);
+		list_del(&buffer->buf_list);
+
+		srpc_init_server_rpc(rpc, scd, buffer);
+		list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active);
+		swi_schedule_workitem(&rpc->srpc_wi);
+	} else {
+		list_add(&rpc->srpc_list, &scd->scd_rpc_free);
+	}
+
+	spin_unlock(&scd->scd_lock);
+	return;
+}
+
+/* handles an incoming RPC */
+int
+srpc_handle_rpc(swi_workitem_t *wi)
+{
+	struct srpc_server_rpc	*rpc = wi->swi_workitem.wi_data;
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	srpc_event_t		*ev = &rpc->srpc_ev;
+	int			rc = 0;
+
+	LASSERT(wi == &rpc->srpc_wi);
+
+	spin_lock(&scd->scd_lock);
+
+	if (sv->sv_shuttingdown || rpc->srpc_aborted) {
+		spin_unlock(&scd->scd_lock);
+
+		if (rpc->srpc_bulk != NULL)
+			LNetMDUnlink(rpc->srpc_bulk->bk_mdh);
+		LNetMDUnlink(rpc->srpc_replymdh);
+
+		if (ev->ev_fired) { /* no more event, OK to finish */
+			srpc_server_rpc_done(rpc, -ESHUTDOWN);
+			return 1;
+		}
+		return 0;
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN: {
+		srpc_msg_t	   *msg;
+		srpc_generic_reply_t *reply;
+
+		msg = &rpc->srpc_reqstbuf->buf_msg;
+		reply = &rpc->srpc_replymsg.msg_body.reply;
+
+		if (msg->msg_magic == 0) {
+			/* moaned already in srpc_lnet_ev_handler */
+			srpc_server_rpc_done(rpc, EBADMSG);
+			return 1;
+		}
+
+		srpc_unpack_msg_hdr(msg);
+		if (msg->msg_version != SRPC_MSG_VERSION) {
+			CWARN("Version mismatch: %u, %u expected, from %s\n",
+			      msg->msg_version, SRPC_MSG_VERSION,
+			      libcfs_id2str(rpc->srpc_peer));
+			reply->status = EPROTO;
+			/* drop through and send reply */
+		} else {
+			reply->status = 0;
+			rc = (*sv->sv_handler)(rpc);
+			LASSERT(reply->status == 0 || !rpc->srpc_bulk);
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_BULK_STARTED;
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = srpc_do_bulk(rpc);
+			if (rc == 0)
+				return 0; /* wait for bulk */
+
+			LASSERT (ev->ev_fired);
+			ev->ev_status = rc;
+		}
+	}
+	case SWI_STATE_BULK_STARTED:
+		LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired);
+
+		if (rpc->srpc_bulk != NULL) {
+			rc = ev->ev_status;
+
+			if (sv->sv_bulk_ready != NULL)
+				rc = (*sv->sv_bulk_ready) (rpc, rc);
+
+			if (rc != 0) {
+				srpc_server_rpc_done(rpc, rc);
+				return 1;
+			}
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_SUBMITTED;
+		rc = srpc_send_reply(rpc);
+		if (rc == 0)
+			return 0; /* wait for reply */
+		srpc_server_rpc_done(rpc, rc);
+		return 1;
+
+	case SWI_STATE_REPLY_SUBMITTED:
+		if (!ev->ev_fired) {
+			CERROR("RPC %p: bulk %p, service %d\n",
+			       rpc, rpc->srpc_bulk, sv->sv_id);
+			CERROR("Event: status %d, type %d, lnet %d\n",
+			       ev->ev_status, ev->ev_type, ev->ev_lnet);
+			LASSERT (ev->ev_fired);
+		}
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_server_rpc_done(rpc, ev->ev_status);
+		return 1;
+	}
+
+	return 0;
+}
+
+void
+srpc_client_rpc_expired (void *data)
+{
+	srpc_client_rpc_t *rpc = data;
+
+	CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n",
+	       rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+	       rpc->crpc_timeout);
+
+	spin_lock(&rpc->crpc_lock);
+
+	rpc->crpc_timeout = 0;
+	srpc_abort_rpc(rpc, -ETIMEDOUT);
+
+	spin_unlock(&rpc->crpc_lock);
+
+	spin_lock(&srpc_data.rpc_glock);
+	srpc_data.rpc_counters.rpcs_expired++;
+	spin_unlock(&srpc_data.rpc_glock);
+}
+
+inline void
+srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	stt_timer_t *timer = &rpc->crpc_timer;
+
+	if (rpc->crpc_timeout == 0) return;
+
+	INIT_LIST_HEAD(&timer->stt_list);
+	timer->stt_data    = rpc;
+	timer->stt_func    = srpc_client_rpc_expired;
+	timer->stt_expires = cfs_time_add(rpc->crpc_timeout,
+					  cfs_time_current_sec());
+	stt_add_timer(timer);
+	return;
+}
+
+/*
+ * Called with rpc->crpc_lock held.
+ *
+ * Upon exit the RPC expiry timer is not queued and the handler is not
+ * running on any CPU. */
+void
+srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc)
+{
+	/* timer not planted or already exploded */
+	if (rpc->crpc_timeout == 0)
+		return;
+
+	/* timer sucessfully defused */
+	if (stt_del_timer(&rpc->crpc_timer))
+		return;
+
+	/* timer detonated, wait for it to explode */
+	while (rpc->crpc_timeout != 0) {
+		spin_unlock(&rpc->crpc_lock);
+
+		schedule();
+
+		spin_lock(&rpc->crpc_lock);
+	}
+}
+
+void
+srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status)
+{
+	swi_workitem_t *wi = &rpc->crpc_wi;
+
+	LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE);
+
+	spin_lock(&rpc->crpc_lock);
+
+	rpc->crpc_closed = 1;
+	if (rpc->crpc_status == 0)
+		rpc->crpc_status = status;
+
+	srpc_del_client_rpc_timer(rpc);
+
+	CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR,
+		"Client RPC done: service %d, peer %s, status %s:%d:%d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(wi->swi_state), rpc->crpc_aborted, status);
+
+	/*
+	 * No one can schedule me now since:
+	 * - RPC timer has been defused.
+	 * - all LNet events have been fired.
+	 * - crpc_closed has been set, preventing srpc_abort_rpc from
+	 *   scheduling me.
+	 * Cancel pending schedules and prevent future schedule attempts:
+	 */
+	LASSERT (!srpc_event_pending(rpc));
+	swi_exit_workitem(wi);
+
+	spin_unlock(&rpc->crpc_lock);
+
+	(*rpc->crpc_done)(rpc);
+	return;
+}
+
+/* sends an outgoing RPC */
+int
+srpc_send_rpc (swi_workitem_t *wi)
+{
+	int		rc = 0;
+	srpc_client_rpc_t *rpc;
+	srpc_msg_t	*reply;
+	int		do_bulk;
+
+	LASSERT(wi != NULL);
+
+	rpc = wi->swi_workitem.wi_data;
+
+	LASSERT (rpc != NULL);
+	LASSERT (wi == &rpc->crpc_wi);
+
+	reply = &rpc->crpc_replymsg;
+	do_bulk = rpc->crpc_bulk.bk_niov > 0;
+
+	spin_lock(&rpc->crpc_lock);
+
+	if (rpc->crpc_aborted) {
+		spin_unlock(&rpc->crpc_lock);
+		goto abort;
+	}
+
+	spin_unlock(&rpc->crpc_lock);
+
+	switch (wi->swi_state) {
+	default:
+		LBUG ();
+	case SWI_STATE_NEWBORN:
+		LASSERT (!srpc_event_pending(rpc));
+
+		rc = srpc_prepare_reply(rpc);
+		if (rc != 0) {
+			srpc_client_rpc_done(rpc, rc);
+			return 1;
+		}
+
+		rc = srpc_prepare_bulk(rpc);
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SUBMITTED;
+		rc = srpc_send_request(rpc);
+		break;
+
+	case SWI_STATE_REQUEST_SUBMITTED:
+		/* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any
+		 * order; however, they're processed in a strict order:
+		 * rqt, rpy, and bulk. */
+		if (!rpc->crpc_reqstev.ev_fired) break;
+
+		rc = rpc->crpc_reqstev.ev_status;
+		if (rc != 0) break;
+
+		wi->swi_state = SWI_STATE_REQUEST_SENT;
+		/* perhaps more events, fall thru */
+	case SWI_STATE_REQUEST_SENT: {
+		srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service);
+
+		if (!rpc->crpc_replyev.ev_fired) break;
+
+		rc = rpc->crpc_replyev.ev_status;
+		if (rc != 0) break;
+
+		srpc_unpack_msg_hdr(reply);
+		if (reply->msg_type != type ||
+		    (reply->msg_magic != SRPC_MSG_MAGIC &&
+		     reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			CWARN ("Bad message from %s: type %u (%d expected),"
+			       " magic %u (%d expected).\n",
+			       libcfs_id2str(rpc->crpc_dest),
+			       reply->msg_type, type,
+			       reply->msg_magic, SRPC_MSG_MAGIC);
+			rc = -EBADMSG;
+			break;
+		}
+
+		if (do_bulk && reply->msg_body.reply.status != 0) {
+			CWARN ("Remote error %d at %s, unlink bulk buffer in "
+			       "case peer didn't initiate bulk transfer\n",
+			       reply->msg_body.reply.status,
+			       libcfs_id2str(rpc->crpc_dest));
+			LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+		}
+
+		wi->swi_state = SWI_STATE_REPLY_RECEIVED;
+	}
+	case SWI_STATE_REPLY_RECEIVED:
+		if (do_bulk && !rpc->crpc_bulkev.ev_fired) break;
+
+		rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0;
+
+		/* Bulk buffer was unlinked due to remote error. Clear error
+		 * since reply buffer still contains valid data.
+		 * NB rpc->crpc_done shouldn't look into bulk data in case of
+		 * remote error. */
+		if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK &&
+		    rpc->crpc_status == 0 && reply->msg_body.reply.status != 0)
+			rc = 0;
+
+		wi->swi_state = SWI_STATE_DONE;
+		srpc_client_rpc_done(rpc, rc);
+		return 1;
+	}
+
+	if (rc != 0) {
+		spin_lock(&rpc->crpc_lock);
+		srpc_abort_rpc(rpc, rc);
+		spin_unlock(&rpc->crpc_lock);
+	}
+
+abort:
+	if (rpc->crpc_aborted) {
+		LNetMDUnlink(rpc->crpc_reqstmdh);
+		LNetMDUnlink(rpc->crpc_replymdh);
+		LNetMDUnlink(rpc->crpc_bulk.bk_mdh);
+
+		if (!srpc_event_pending(rpc)) {
+			srpc_client_rpc_done(rpc, -EINTR);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+srpc_client_rpc_t *
+srpc_create_client_rpc (lnet_process_id_t peer, int service,
+			int nbulkiov, int bulklen,
+			void (*rpc_done)(srpc_client_rpc_t *),
+			void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	srpc_client_rpc_t *rpc;
+
+	LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t,
+				   crpc_bulk.bk_iovs[nbulkiov]));
+	if (rpc == NULL)
+		return NULL;
+
+	srpc_init_client_rpc(rpc, peer, service, nbulkiov,
+			     bulklen, rpc_done, rpc_fini, priv);
+	return rpc;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_abort_rpc (srpc_client_rpc_t *rpc, int why)
+{
+	LASSERT (why != 0);
+
+	if (rpc->crpc_aborted || /* already aborted */
+	    rpc->crpc_closed)    /* callback imminent */
+		return;
+
+	CDEBUG (D_NET,
+		"Aborting RPC: service %d, peer %s, state %s, why %d\n",
+		rpc->crpc_service, libcfs_id2str(rpc->crpc_dest),
+		swi_state2str(rpc->crpc_wi.swi_state), why);
+
+	rpc->crpc_aborted = 1;
+	rpc->crpc_status  = why;
+	swi_schedule_workitem(&rpc->crpc_wi);
+	return;
+}
+
+/* called with rpc->crpc_lock held */
+void
+srpc_post_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT (!rpc->crpc_aborted);
+	LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING);
+
+	CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n",
+		libcfs_id2str(rpc->crpc_dest), rpc->crpc_service,
+		rpc->crpc_timeout);
+
+	srpc_add_client_rpc_timer(rpc);
+	swi_schedule_workitem(&rpc->crpc_wi);
+	return;
+}
+
+
+int
+srpc_send_reply(struct srpc_server_rpc *rpc)
+{
+	srpc_event_t		*ev = &rpc->srpc_ev;
+	struct srpc_msg		*msg = &rpc->srpc_replymsg;
+	struct srpc_buffer	*buffer = rpc->srpc_reqstbuf;
+	struct srpc_service_cd	*scd = rpc->srpc_scd;
+	struct srpc_service	*sv = scd->scd_svc;
+	__u64			rpyid;
+	int			rc;
+
+	LASSERT(buffer != NULL);
+	rpyid = buffer->buf_msg.msg_body.reqst.rpyid;
+
+	spin_lock(&scd->scd_lock);
+
+	if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) {
+		/* Repost buffer before replying since test client
+		 * might send me another RPC once it gets the reply */
+		if (srpc_service_post_buffer(scd, buffer) != 0)
+			CWARN("Failed to repost %s buffer\n", sv->sv_name);
+		rpc->srpc_reqstbuf = NULL;
+	}
+
+	spin_unlock(&scd->scd_lock);
+
+	ev->ev_fired = 0;
+	ev->ev_data  = rpc;
+	ev->ev_type  = SRPC_REPLY_SENT;
+
+	msg->msg_magic   = SRPC_MSG_MAGIC;
+	msg->msg_version = SRPC_MSG_VERSION;
+	msg->msg_type    = srpc_service2reply(sv->sv_id);
+
+	rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg,
+				   sizeof(*msg), LNET_MD_OP_PUT,
+				   rpc->srpc_peer, rpc->srpc_self,
+				   &rpc->srpc_replymdh, ev);
+	if (rc != 0)
+		ev->ev_fired = 1;  /* no more event expected */
+	return rc;
+}
+
+/* when in kernel always called with LNET_LOCK() held, and in thread context */
+void
+srpc_lnet_ev_handler(lnet_event_t *ev)
+{
+	struct srpc_service_cd	*scd;
+	srpc_event_t      *rpcev = ev->md.user_ptr;
+	srpc_client_rpc_t *crpc;
+	srpc_server_rpc_t *srpc;
+	srpc_buffer_t     *buffer;
+	srpc_service_t    *sv;
+	srpc_msg_t	*msg;
+	srpc_msg_type_t    type;
+
+	LASSERT (!in_interrupt());
+
+	if (ev->status != 0) {
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.errors++;
+		spin_unlock(&srpc_data.rpc_glock);
+	}
+
+	rpcev->ev_lnet = ev->type;
+
+	switch (rpcev->ev_type) {
+	default:
+		CERROR("Unknown event: status %d, type %d, lnet %d\n",
+		       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+		LBUG ();
+	case SRPC_REQUEST_SENT:
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+			srpc_data.rpc_counters.rpcs_sent++;
+			spin_unlock(&srpc_data.rpc_glock);
+		}
+	case SRPC_REPLY_RCVD:
+	case SRPC_BULK_REQ_RCVD:
+		crpc = rpcev->ev_data;
+
+		if (rpcev != &crpc->crpc_reqstev &&
+		    rpcev != &crpc->crpc_replyev &&
+		    rpcev != &crpc->crpc_bulkev) {
+			CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n",
+			       rpcev, crpc, &crpc->crpc_reqstev,
+			       &crpc->crpc_replyev, &crpc->crpc_bulkev);
+			CERROR("Bad event: status %d, type %d, lnet %d\n",
+			       rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet);
+			LBUG ();
+		}
+
+		spin_lock(&crpc->crpc_lock);
+
+		LASSERT(rpcev->ev_fired == 0);
+		rpcev->ev_fired  = 1;
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+						-EINTR : ev->status;
+		swi_schedule_workitem(&crpc->crpc_wi);
+
+		spin_unlock(&crpc->crpc_lock);
+		break;
+
+	case SRPC_REQUEST_RCVD:
+		scd = rpcev->ev_data;
+		sv = scd->scd_svc;
+
+		LASSERT(rpcev == &scd->scd_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		LASSERT (ev->unlinked);
+		LASSERT (ev->type == LNET_EVENT_PUT ||
+			 ev->type == LNET_EVENT_UNLINK);
+		LASSERT (ev->type != LNET_EVENT_UNLINK ||
+			 sv->sv_shuttingdown);
+
+		buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg);
+		buffer->buf_peer = ev->initiator;
+		buffer->buf_self = ev->target.nid;
+
+		LASSERT(scd->scd_buf_nposted > 0);
+		scd->scd_buf_nposted--;
+
+		if (sv->sv_shuttingdown) {
+			/* Leave buffer on scd->scd_buf_nposted since
+			 * srpc_finish_service needs to traverse it. */
+			spin_unlock(&scd->scd_lock);
+			break;
+		}
+
+		if (scd->scd_buf_err_stamp != 0 &&
+		    scd->scd_buf_err_stamp < cfs_time_current_sec()) {
+			/* re-enable adding buffer */
+			scd->scd_buf_err_stamp = 0;
+			scd->scd_buf_err = 0;
+		}
+
+		if (scd->scd_buf_err == 0 && /* adding buffer is enabled */
+		    scd->scd_buf_adjust == 0 &&
+		    scd->scd_buf_nposted < scd->scd_buf_low) {
+			scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2,
+						  SFW_TEST_WI_MIN);
+			swi_schedule_workitem(&scd->scd_buf_wi);
+		}
+
+		list_del(&buffer->buf_list); /* from scd->scd_buf_posted */
+		msg = &buffer->buf_msg;
+		type = srpc_service2request(sv->sv_id);
+
+		if (ev->status != 0 || ev->mlength != sizeof(*msg) ||
+		    (msg->msg_type != type &&
+		     msg->msg_type != __swab32(type)) ||
+		    (msg->msg_magic != SRPC_MSG_MAGIC &&
+		     msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) {
+			CERROR ("Dropping RPC (%s) from %s: "
+				"status %d mlength %d type %u magic %u.\n",
+				sv->sv_name, libcfs_id2str(ev->initiator),
+				ev->status, ev->mlength,
+				msg->msg_type, msg->msg_magic);
+
+			/* NB can't call srpc_service_recycle_buffer here since
+			 * it may call LNetM[DE]Attach. The invalid magic tells
+			 * srpc_handle_rpc to drop this RPC */
+			msg->msg_magic = 0;
+		}
+
+		if (!list_empty(&scd->scd_rpc_free)) {
+			srpc = list_entry(scd->scd_rpc_free.next,
+					      struct srpc_server_rpc,
+					      srpc_list);
+			list_del(&srpc->srpc_list);
+
+			srpc_init_server_rpc(srpc, scd, buffer);
+			list_add_tail(&srpc->srpc_list,
+					  &scd->scd_rpc_active);
+			swi_schedule_workitem(&srpc->srpc_wi);
+		} else {
+			list_add_tail(&buffer->buf_list,
+					  &scd->scd_buf_blocked);
+		}
+
+		spin_unlock(&scd->scd_lock);
+
+		spin_lock(&srpc_data.rpc_glock);
+		srpc_data.rpc_counters.rpcs_rcvd++;
+		spin_unlock(&srpc_data.rpc_glock);
+		break;
+
+	case SRPC_BULK_GET_RPLD:
+		LASSERT (ev->type == LNET_EVENT_SEND ||
+			 ev->type == LNET_EVENT_REPLY ||
+			 ev->type == LNET_EVENT_UNLINK);
+
+		if (!ev->unlinked)
+			break; /* wait for final event */
+
+	case SRPC_BULK_PUT_SENT:
+		if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) {
+			spin_lock(&srpc_data.rpc_glock);
+
+			if (rpcev->ev_type == SRPC_BULK_GET_RPLD)
+				srpc_data.rpc_counters.bulk_get += ev->mlength;
+			else
+				srpc_data.rpc_counters.bulk_put += ev->mlength;
+
+			spin_unlock(&srpc_data.rpc_glock);
+		}
+	case SRPC_REPLY_SENT:
+		srpc = rpcev->ev_data;
+		scd  = srpc->srpc_scd;
+
+		LASSERT(rpcev == &srpc->srpc_ev);
+
+		spin_lock(&scd->scd_lock);
+
+		rpcev->ev_fired  = 1;
+		rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ?
+				   -EINTR : ev->status;
+		swi_schedule_workitem(&srpc->srpc_wi);
+
+		spin_unlock(&scd->scd_lock);
+		break;
+	}
+}
+
+
+int
+srpc_startup (void)
+{
+	int rc;
+
+	memset(&srpc_data, 0, sizeof(struct smoketest_rpc));
+	spin_lock_init(&srpc_data.rpc_glock);
+
+	/* 1 second pause to avoid timestamp reuse */
+	cfs_pause(cfs_time_seconds(1));
+	srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48;
+
+	srpc_data.rpc_state = SRPC_STATE_NONE;
+
+	rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+	if (rc < 0) {
+		CERROR ("LNetNIInit() has failed: %d\n", rc);
+		return rc;
+	}
+
+	srpc_data.rpc_state = SRPC_STATE_NI_INIT;
+
+	LNetInvalidateHandle(&srpc_data.rpc_lnet_eq);
+	rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq);
+	if (rc != 0) {
+		CERROR("LNetEQAlloc() has failed: %d\n", rc);
+		goto bail;
+	}
+
+	rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+	rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL);
+	LASSERT(rc == 0);
+
+	srpc_data.rpc_state = SRPC_STATE_EQ_INIT;
+
+	rc = stt_startup();
+
+bail:
+	if (rc != 0)
+		srpc_shutdown();
+	else
+		srpc_data.rpc_state = SRPC_STATE_RUNNING;
+
+	return rc;
+}
+
+void
+srpc_shutdown (void)
+{
+	int i;
+	int rc;
+	int state;
+
+	state = srpc_data.rpc_state;
+	srpc_data.rpc_state = SRPC_STATE_STOPPING;
+
+	switch (state) {
+	default:
+		LBUG ();
+	case SRPC_STATE_RUNNING:
+		spin_lock(&srpc_data.rpc_glock);
+
+		for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) {
+			srpc_service_t *sv = srpc_data.rpc_services[i];
+
+			LASSERTF (sv == NULL,
+				  "service not empty: id %d, name %s\n",
+				  i, sv->sv_name);
+		}
+
+		spin_unlock(&srpc_data.rpc_glock);
+
+		stt_shutdown();
+
+	case SRPC_STATE_EQ_INIT:
+		rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
+		rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
+		LASSERT (rc == 0);
+		rc = LNetEQFree(srpc_data.rpc_lnet_eq);
+		LASSERT (rc == 0); /* the EQ should have no user by now */
+
+	case SRPC_STATE_NI_INIT:
+		LNetNIFini();
+	}
+
+	return;
+}
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h
new file mode 100644
index 000000000000..b905d49a351f
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/rpc.h
@@ -0,0 +1,302 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __SELFTEST_RPC_H__
+#define __SELFTEST_RPC_H__
+
+#include <linux/lnet/lnetst.h>
+
+/*
+ * LST wired structures
+ *
+ * XXX: *REPLY == *REQST + 1
+ */
+typedef enum {
+	SRPC_MSG_MKSN_REQST     = 0,
+	SRPC_MSG_MKSN_REPLY     = 1,
+	SRPC_MSG_RMSN_REQST     = 2,
+	SRPC_MSG_RMSN_REPLY     = 3,
+	SRPC_MSG_BATCH_REQST    = 4,
+	SRPC_MSG_BATCH_REPLY    = 5,
+	SRPC_MSG_STAT_REQST     = 6,
+	SRPC_MSG_STAT_REPLY     = 7,
+	SRPC_MSG_TEST_REQST     = 8,
+	SRPC_MSG_TEST_REPLY     = 9,
+	SRPC_MSG_DEBUG_REQST    = 10,
+	SRPC_MSG_DEBUG_REPLY    = 11,
+	SRPC_MSG_BRW_REQST      = 12,
+	SRPC_MSG_BRW_REPLY      = 13,
+	SRPC_MSG_PING_REQST     = 14,
+	SRPC_MSG_PING_REPLY     = 15,
+	SRPC_MSG_JOIN_REQST     = 16,
+	SRPC_MSG_JOIN_REPLY     = 17,
+} srpc_msg_type_t;
+
+
+/* CAVEAT EMPTOR:
+ * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer,
+ * and 2nd field matchbits of bulk buffer if any.
+ *
+ * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field
+ * session id if needed.
+ */
+typedef struct {
+	__u64			rpyid;		/* reply buffer matchbits */
+	__u64			bulkid;		/* bulk buffer matchbits */
+} WIRE_ATTR srpc_generic_reqst_t;
+
+typedef struct {
+	__u32		   status;
+	lst_sid_t	       sid;
+} WIRE_ATTR srpc_generic_reply_t;
+
+/* FRAMEWORK RPCs */
+typedef struct {
+	__u64			mksn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       mksn_sid;	/* session id */
+	__u32			mksn_force;      /* use brute force */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reqst_t;			/* make session request */
+
+typedef struct {
+	__u32		   mksn_status;      /* session status */
+	lst_sid_t	       mksn_sid;	 /* session id */
+	__u32		   mksn_timeout;     /* session timeout */
+	char			mksn_name[LST_NAME_SIZE];
+} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */
+
+typedef struct {
+	__u64			rmsn_rpyid;      /* reply buffer matchbits */
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */
+
+typedef struct {
+	__u32			rmsn_status;
+	lst_sid_t		rmsn_sid;	/* session id */
+} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */
+
+typedef struct {
+	__u64			join_rpyid;     /* reply buffer matchbits */
+	lst_sid_t	       join_sid;       /* session id to join */
+	char		    join_group[LST_NAME_SIZE]; /* group name */
+} WIRE_ATTR srpc_join_reqst_t;
+
+typedef struct {
+	__u32		   join_status;    /* returned status */
+	lst_sid_t	       join_sid;       /* session id */
+	__u32			join_timeout;   /* # seconds' inactivity to expire */
+	char		    join_session[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_join_reply_t;
+
+typedef struct {
+	__u64		   dbg_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_flags;      /* bitmap of debug */
+} WIRE_ATTR srpc_debug_reqst_t;
+
+typedef struct {
+	__u32		   dbg_status;     /* returned code */
+	lst_sid_t	       dbg_sid;	/* session id */
+	__u32		   dbg_timeout;    /* session timeout */
+	__u32		   dbg_nbatch;     /* # of batches in the node */
+	char		    dbg_name[LST_NAME_SIZE]; /* session name */
+} WIRE_ATTR srpc_debug_reply_t;
+
+#define SRPC_BATCH_OPC_RUN      1
+#define SRPC_BATCH_OPC_STOP     2
+#define SRPC_BATCH_OPC_QUERY    3
+
+typedef struct {
+	__u64		   bar_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       bar_sid;	/* session id */
+	lst_bid_t	       bar_bid;	/* batch id */
+	__u32		   bar_opc;	/* create/start/stop batch */
+	__u32		   bar_testidx;    /* index of test */
+	__u32		   bar_arg;	/* parameters */
+} WIRE_ATTR srpc_batch_reqst_t;
+
+typedef struct {
+	__u32		   bar_status;     /* status of request */
+	lst_sid_t	       bar_sid;	/* session id */
+	__u32		   bar_active;     /* # of active tests in batch/test */
+	__u32		   bar_time;       /* remained time */
+} WIRE_ATTR srpc_batch_reply_t;
+
+typedef struct {
+	__u64		   str_rpyid;      /* reply buffer matchbits */
+	lst_sid_t	       str_sid;	/* session id */
+	__u32		   str_type;       /* type of stat */
+} WIRE_ATTR srpc_stat_reqst_t;
+
+typedef struct {
+	__u32		   str_status;
+	lst_sid_t	       str_sid;
+	sfw_counters_t	  str_fw;
+	srpc_counters_t	 str_rpc;
+	lnet_counters_t	 str_lnet;
+} WIRE_ATTR srpc_stat_reply_t;
+
+typedef struct {
+	__u32		   blk_opc;	/* bulk operation code */
+	__u32		   blk_npg;	/* # of pages */
+	__u32		   blk_flags;      /* reserved flags */
+} WIRE_ATTR test_bulk_req_t;
+
+typedef struct {
+	/** bulk operation code */
+	__u16			blk_opc;
+	/** data check flags */
+	__u16			blk_flags;
+	/** data length */
+	__u32			blk_len;
+	/** reserved: offset */
+	__u32		   blk_offset;
+} WIRE_ATTR test_bulk_req_v1_t;
+
+typedef struct {
+	__u32			png_size;       /* size of ping message */
+	__u32			png_flags;      /* reserved flags */
+} WIRE_ATTR test_ping_req_t;
+
+typedef struct {
+	__u64			tsr_rpyid;      /* reply buffer matchbits */
+	__u64			tsr_bulkid;     /* bulk buffer matchbits */
+	lst_sid_t		tsr_sid;	/* session id */
+	lst_bid_t		tsr_bid;	/* batch id */
+	__u32			tsr_service;    /* test type: bulk|ping|... */
+	/* test client loop count or # server buffers needed */
+	__u32			tsr_loop;
+	__u32			tsr_concur;     /* concurrency of test */
+	__u8			tsr_is_client;  /* is test client or not */
+	__u8			tsr_stop_onerr; /* stop on error */
+	__u32			tsr_ndest;      /* # of dest nodes */
+
+	union {
+		test_ping_req_t		ping;
+		test_bulk_req_t		bulk_v0;
+		test_bulk_req_v1_t	bulk_v1;
+	}		tsr_u;
+} WIRE_ATTR srpc_test_reqst_t;
+
+typedef struct {
+	__u32			tsr_status;     /* returned code */
+	lst_sid_t		tsr_sid;
+} WIRE_ATTR srpc_test_reply_t;
+
+/* TEST RPCs */
+typedef struct {
+	__u64		   pnr_rpyid;
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+	__u64		   pnr_time_sec;
+	__u64		   pnr_time_usec;
+} WIRE_ATTR srpc_ping_reqst_t;
+
+typedef struct {
+	__u32		   pnr_status;
+	__u32		   pnr_magic;
+	__u32		   pnr_seq;
+} WIRE_ATTR srpc_ping_reply_t;
+
+typedef struct {
+	__u64		   brw_rpyid;      /* reply buffer matchbits */
+	__u64		   brw_bulkid;     /* bulk buffer matchbits */
+	__u32		   brw_rw;	 /* read or write */
+	__u32		   brw_len;	/* bulk data len */
+	__u32		   brw_flags;      /* bulk data patterns */
+} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */
+
+typedef struct {
+	__u32		   brw_status;
+} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */
+
+#define SRPC_MSG_MAGIC		  0xeeb0f00d
+#define SRPC_MSG_VERSION		1
+
+typedef struct srpc_msg {
+	/** magic number */
+	__u32	msg_magic;
+	/** message version number */
+	__u32	msg_version;
+	/** type of message body: srpc_msg_type_t */
+	__u32	msg_type;
+	__u32	msg_reserved0;
+	__u32	msg_reserved1;
+	/** test session features */
+	__u32	msg_ses_feats;
+	union {
+		srpc_generic_reqst_t reqst;
+		srpc_generic_reply_t reply;
+
+		srpc_mksn_reqst_t    mksn_reqst;
+		srpc_mksn_reply_t    mksn_reply;
+		srpc_rmsn_reqst_t    rmsn_reqst;
+		srpc_rmsn_reply_t    rmsn_reply;
+		srpc_debug_reqst_t   dbg_reqst;
+		srpc_debug_reply_t   dbg_reply;
+		srpc_batch_reqst_t   bat_reqst;
+		srpc_batch_reply_t   bat_reply;
+		srpc_stat_reqst_t    stat_reqst;
+		srpc_stat_reply_t    stat_reply;
+		srpc_test_reqst_t    tes_reqst;
+		srpc_test_reply_t    tes_reply;
+		srpc_join_reqst_t    join_reqst;
+		srpc_join_reply_t    join_reply;
+
+		srpc_ping_reqst_t    ping_reqst;
+		srpc_ping_reply_t    ping_reply;
+		srpc_brw_reqst_t     brw_reqst;
+		srpc_brw_reply_t     brw_reply;
+	}     msg_body;
+} WIRE_ATTR srpc_msg_t;
+
+static inline void
+srpc_unpack_msg_hdr(srpc_msg_t *msg)
+{
+	if (msg->msg_magic == SRPC_MSG_MAGIC)
+		return; /* no flipping needed */
+
+	/* We do not swap the magic number here as it is needed to
+	   determine whether the body needs to be swapped. */
+	/* __swab32s(&msg->msg_magic); */
+	__swab32s(&msg->msg_type);
+	__swab32s(&msg->msg_version);
+	__swab32s(&msg->msg_ses_feats);
+	__swab32s(&msg->msg_reserved0);
+	__swab32s(&msg->msg_reserved1);
+}
+
+#endif /* __SELFTEST_RPC_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
new file mode 100644
index 000000000000..8053b0563ff3
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/selftest.h
@@ -0,0 +1,611 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/selftest.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_SELFTEST_H__
+#define __SELFTEST_SELFTEST_H__
+
+#define LNET_ONLY
+
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lnet.h>
+#include <linux/lnet/lib-lnet.h>
+#include <linux/lnet/lib-types.h>
+#include <linux/lnet/lnetst.h>
+
+#include "rpc.h"
+#include "timer.h"
+
+#ifndef MADE_WITHOUT_COMPROMISE
+#define MADE_WITHOUT_COMPROMISE
+#endif
+
+
+#define SWI_STATE_NEWBORN		  0
+#define SWI_STATE_REPLY_SUBMITTED	  1
+#define SWI_STATE_REPLY_SENT	       2
+#define SWI_STATE_REQUEST_SUBMITTED	3
+#define SWI_STATE_REQUEST_SENT	     4
+#define SWI_STATE_REPLY_RECEIVED	   5
+#define SWI_STATE_BULK_STARTED	     6
+#define SWI_STATE_DONE		     10
+
+/* forward refs */
+struct srpc_service;
+struct srpc_service_cd;
+struct sfw_test_unit;
+struct sfw_test_instance;
+
+/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework
+ * services, e.g. create/modify session.
+ */
+#define SRPC_SERVICE_DEBUG	      0
+#define SRPC_SERVICE_MAKE_SESSION       1
+#define SRPC_SERVICE_REMOVE_SESSION     2
+#define SRPC_SERVICE_BATCH	      3
+#define SRPC_SERVICE_TEST	       4
+#define SRPC_SERVICE_QUERY_STAT	 5
+#define SRPC_SERVICE_JOIN	       6
+#define SRPC_FRAMEWORK_SERVICE_MAX_ID   10
+/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */
+#define SRPC_SERVICE_BRW		11
+#define SRPC_SERVICE_PING	       12
+#define SRPC_SERVICE_MAX_ID	     12
+
+#define SRPC_REQUEST_PORTAL	     50
+/* a lazy portal for framework RPC requests */
+#define SRPC_FRAMEWORK_REQUEST_PORTAL   51
+/* all reply/bulk RDMAs go to this portal */
+#define SRPC_RDMA_PORTAL		52
+
+static inline srpc_msg_type_t
+srpc_service2request (int service)
+{
+	switch (service) {
+	default:
+		LBUG ();
+	case SRPC_SERVICE_DEBUG:
+		return SRPC_MSG_DEBUG_REQST;
+
+	case SRPC_SERVICE_MAKE_SESSION:
+		return SRPC_MSG_MKSN_REQST;
+
+	case SRPC_SERVICE_REMOVE_SESSION:
+		return SRPC_MSG_RMSN_REQST;
+
+	case SRPC_SERVICE_BATCH:
+		return SRPC_MSG_BATCH_REQST;
+
+	case SRPC_SERVICE_TEST:
+		return SRPC_MSG_TEST_REQST;
+
+	case SRPC_SERVICE_QUERY_STAT:
+		return SRPC_MSG_STAT_REQST;
+
+	case SRPC_SERVICE_BRW:
+		return SRPC_MSG_BRW_REQST;
+
+	case SRPC_SERVICE_PING:
+		return SRPC_MSG_PING_REQST;
+
+	case SRPC_SERVICE_JOIN:
+		return SRPC_MSG_JOIN_REQST;
+	}
+}
+
+static inline srpc_msg_type_t
+srpc_service2reply (int service)
+{
+	return srpc_service2request(service) + 1;
+}
+
+typedef enum {
+	SRPC_BULK_REQ_RCVD   = 1, /* passive bulk request(PUT sink/GET source) received */
+	SRPC_BULK_PUT_SENT   = 2, /* active bulk PUT sent (source) */
+	SRPC_BULK_GET_RPLD   = 3, /* active bulk GET replied (sink) */
+	SRPC_REPLY_RCVD      = 4, /* incoming reply received */
+	SRPC_REPLY_SENT      = 5, /* outgoing reply sent */
+	SRPC_REQUEST_RCVD    = 6, /* incoming request received */
+	SRPC_REQUEST_SENT    = 7, /* outgoing request sent */
+} srpc_event_type_t;
+
+/* RPC event */
+typedef struct {
+	srpc_event_type_t ev_type;   /* what's up */
+	lnet_event_kind_t ev_lnet;   /* LNet event type */
+	int	       ev_fired;  /* LNet event fired? */
+	int	       ev_status; /* LNet event status */
+	void	     *ev_data;   /* owning server/client RPC */
+} srpc_event_t;
+
+typedef struct {
+	int	      bk_len;  /* len of bulk data */
+	lnet_handle_md_t bk_mdh;
+	int	      bk_sink; /* sink/source */
+	int	      bk_niov; /* # iov in bk_iovs */
+	lnet_kiov_t      bk_iovs[0];
+} srpc_bulk_t; /* bulk descriptor */
+
+/* message buffer descriptor */
+typedef struct srpc_buffer {
+	struct list_head	   buf_list; /* chain on srpc_service::*_msgq */
+	srpc_msg_t	   buf_msg;
+	lnet_handle_md_t     buf_mdh;
+	lnet_nid_t	   buf_self;
+	lnet_process_id_t    buf_peer;
+} srpc_buffer_t;
+
+struct swi_workitem;
+typedef int (*swi_action_t) (struct swi_workitem *);
+
+typedef struct swi_workitem {
+	struct cfs_wi_sched	*swi_sched;
+	cfs_workitem_t       swi_workitem;
+	swi_action_t	 swi_action;
+	int		  swi_state;
+} swi_workitem_t;
+
+/* server-side state of a RPC */
+typedef struct srpc_server_rpc {
+	/* chain on srpc_service::*_rpcq */
+	struct list_head		srpc_list;
+	struct srpc_service_cd *srpc_scd;
+	swi_workitem_t       srpc_wi;
+	srpc_event_t	 srpc_ev;      /* bulk/reply event */
+	lnet_nid_t	   srpc_self;
+	lnet_process_id_t    srpc_peer;
+	srpc_msg_t	   srpc_replymsg;
+	lnet_handle_md_t     srpc_replymdh;
+	srpc_buffer_t       *srpc_reqstbuf;
+	srpc_bulk_t	 *srpc_bulk;
+
+	unsigned int	 srpc_aborted; /* being given up */
+	int		  srpc_status;
+	void	       (*srpc_done)(struct srpc_server_rpc *);
+} srpc_server_rpc_t;
+
+/* client-side state of a RPC */
+typedef struct srpc_client_rpc {
+	struct list_head		crpc_list;	/* chain on user's lists */
+	spinlock_t		crpc_lock;	/* serialize */
+	int		  crpc_service;
+	atomic_t	 crpc_refcount;
+	int		  crpc_timeout; /* # seconds to wait for reply */
+	stt_timer_t	  crpc_timer;
+	swi_workitem_t       crpc_wi;
+	lnet_process_id_t    crpc_dest;
+
+	void	       (*crpc_done)(struct srpc_client_rpc *);
+	void	       (*crpc_fini)(struct srpc_client_rpc *);
+	int		  crpc_status;    /* completion status */
+	void		*crpc_priv;      /* caller data */
+
+	/* state flags */
+	unsigned int	 crpc_aborted:1; /* being given up */
+	unsigned int	 crpc_closed:1;  /* completed */
+
+	/* RPC events */
+	srpc_event_t	 crpc_bulkev;    /* bulk event */
+	srpc_event_t	 crpc_reqstev;   /* request event */
+	srpc_event_t	 crpc_replyev;   /* reply event */
+
+	/* bulk, request(reqst), and reply exchanged on wire */
+	srpc_msg_t	   crpc_reqstmsg;
+	srpc_msg_t	   crpc_replymsg;
+	lnet_handle_md_t     crpc_reqstmdh;
+	lnet_handle_md_t     crpc_replymdh;
+	srpc_bulk_t	  crpc_bulk;
+} srpc_client_rpc_t;
+
+#define srpc_client_rpc_size(rpc)				       \
+offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov])
+
+#define srpc_client_rpc_addref(rpc)				     \
+do {								    \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	atomic_inc(&(rpc)->crpc_refcount);			  \
+} while (0)
+
+#define srpc_client_rpc_decref(rpc)				     \
+do {								    \
+	CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n",			 \
+	       (rpc), libcfs_id2str((rpc)->crpc_dest),		  \
+	       atomic_read(&(rpc)->crpc_refcount));		 \
+	LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0);	    \
+	if (atomic_dec_and_test(&(rpc)->crpc_refcount))	     \
+		srpc_destroy_client_rpc(rpc);			   \
+} while (0)
+
+#define srpc_event_pending(rpc)   ((rpc)->crpc_bulkev.ev_fired == 0 ||  \
+				   (rpc)->crpc_reqstev.ev_fired == 0 || \
+				   (rpc)->crpc_replyev.ev_fired == 0)
+
+/* CPU partition data of srpc service */
+struct srpc_service_cd {
+	/** serialize */
+	spinlock_t		scd_lock;
+	/** backref to service */
+	struct srpc_service	*scd_svc;
+	/** event buffer */
+	srpc_event_t		scd_ev;
+	/** free RPC descriptors */
+	struct list_head		scd_rpc_free;
+	/** in-flight RPCs */
+	struct list_head		scd_rpc_active;
+	/** workitem for posting buffer */
+	swi_workitem_t		scd_buf_wi;
+	/** CPT id */
+	int			scd_cpt;
+	/** error code for scd_buf_wi */
+	int			scd_buf_err;
+	/** timestamp for scd_buf_err */
+	unsigned long	   scd_buf_err_stamp;
+	/** total # request buffers */
+	int			scd_buf_total;
+	/** # posted request buffers */
+	int			scd_buf_nposted;
+	/** in progress of buffer posting */
+	int			scd_buf_posting;
+	/** allocate more buffers if scd_buf_nposted < scd_buf_low */
+	int			scd_buf_low;
+	/** increase/decrease some buffers */
+	int			scd_buf_adjust;
+	/** posted message buffers */
+	struct list_head		scd_buf_posted;
+	/** blocked for RPC descriptor */
+	struct list_head		scd_buf_blocked;
+};
+
+/* number of server workitems (mini-thread) for testing service */
+#define SFW_TEST_WI_MIN		256
+#define SFW_TEST_WI_MAX		2048
+/* extra buffers for tolerating buggy peers, or unbalanced number
+ * of peers between partitions  */
+#define SFW_TEST_WI_EXTRA	64
+
+/* number of server workitems (mini-thread) for framework service */
+#define SFW_FRWK_WI_MIN		16
+#define SFW_FRWK_WI_MAX		256
+
+typedef struct srpc_service {
+	int			sv_id;		/* service id */
+	const char		*sv_name;	/* human readable name */
+	int			sv_wi_total;	/* total server workitems */
+	int			sv_shuttingdown;
+	int			sv_ncpts;
+	/* percpt data for srpc_service */
+	struct srpc_service_cd	**sv_cpt_data;
+	/* Service callbacks:
+	 * - sv_handler: process incoming RPC request
+	 * - sv_bulk_ready: notify bulk data
+	 */
+	int	      (*sv_handler) (srpc_server_rpc_t *);
+	int	      (*sv_bulk_ready) (srpc_server_rpc_t *, int);
+} srpc_service_t;
+
+typedef struct {
+	struct list_head	sn_list;    /* chain on fw_zombie_sessions */
+	lst_sid_t	 sn_id;      /* unique identifier */
+	unsigned int      sn_timeout; /* # seconds' inactivity to expire */
+	int	       sn_timer_active;
+	unsigned int	  sn_features;
+	stt_timer_t       sn_timer;
+	struct list_head	sn_batches; /* list of batches */
+	char	      sn_name[LST_NAME_SIZE];
+	atomic_t      sn_refcount;
+	atomic_t      sn_brw_errors;
+	atomic_t      sn_ping_errors;
+	cfs_time_t	sn_started;
+} sfw_session_t;
+
+#define sfw_sid_equal(sid0, sid1)     ((sid0).ses_nid == (sid1).ses_nid && \
+				       (sid0).ses_stamp == (sid1).ses_stamp)
+
+typedef struct {
+	struct list_head	bat_list;      /* chain on sn_batches */
+	lst_bid_t	 bat_id;	/* batch id */
+	int	       bat_error;     /* error code of batch */
+	sfw_session_t    *bat_session;   /* batch's session */
+	atomic_t      bat_nactive;   /* # of active tests */
+	struct list_head	bat_tests;     /* test instances */
+} sfw_batch_t;
+
+typedef struct {
+	int  (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */
+	void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */
+	int  (*tso_prep_rpc)(struct sfw_test_unit *tsu,
+			     lnet_process_id_t dest,
+			     srpc_client_rpc_t **rpc);   /* prep a tests rpc */
+	void (*tso_done_rpc)(struct sfw_test_unit *tsu,
+			     srpc_client_rpc_t *rpc);    /* done a test rpc */
+} sfw_test_client_ops_t;
+
+typedef struct sfw_test_instance {
+	struct list_head	      tsi_list;	 /* chain on batch */
+	int		     tsi_service;      /* test type */
+	sfw_batch_t	    *tsi_batch;	/* batch */
+	sfw_test_client_ops_t  *tsi_ops;	  /* test client operations */
+
+	/* public parameter for all test units */
+	unsigned int		tsi_is_client:1;     /* is test client */
+	unsigned int		tsi_stoptsu_onerr:1; /* stop tsu on error */
+	int		     tsi_concur;	  /* concurrency */
+	int		     tsi_loop;	    /* loop count */
+
+	/* status of test instance */
+	spinlock_t		tsi_lock;	  /* serialize */
+	unsigned int		tsi_stopping:1;   /* test is stopping */
+	atomic_t	    tsi_nactive;      /* # of active test unit */
+	struct list_head	      tsi_units;	/* test units */
+	struct list_head	      tsi_free_rpcs;    /* free rpcs */
+	struct list_head	      tsi_active_rpcs;  /* active rpcs */
+
+	union {
+		test_ping_req_t		ping;	  /* ping parameter */
+		test_bulk_req_t		bulk_v0;  /* bulk parameter */
+		test_bulk_req_v1_t	bulk_v1;  /* bulk v1 parameter */
+	} tsi_u;
+} sfw_test_instance_t;
+
+/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at
+ * the end of pages are not used */
+#define SFW_MAX_CONCUR     LST_MAX_CONCUR
+#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_MAX_NDESTS     (LNET_MAX_IOV * SFW_ID_PER_PAGE)
+#define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
+
+typedef struct sfw_test_unit {
+	struct list_head	    tsu_list;	 /* chain on lst_test_instance */
+	lnet_process_id_t     tsu_dest;	 /* id of dest node */
+	int		   tsu_loop;	 /* loop count of the test */
+	sfw_test_instance_t  *tsu_instance;     /* pointer to test instance */
+	void		 *tsu_private;      /* private data */
+	swi_workitem_t	tsu_worker;       /* workitem of the test unit */
+} sfw_test_unit_t;
+
+typedef struct sfw_test_case {
+	struct list_head	      tsc_list;	 /* chain on fw_tests */
+	srpc_service_t	 *tsc_srv_service;  /* test service */
+	sfw_test_client_ops_t  *tsc_cli_ops;      /* ops of test client */
+} sfw_test_case_t;
+
+srpc_client_rpc_t *
+sfw_create_rpc(lnet_process_id_t peer, int service,
+	       unsigned features, int nbulkiov, int bulklen,
+	       void (*done) (srpc_client_rpc_t *), void *priv);
+int sfw_create_test_rpc(sfw_test_unit_t *tsu,
+			lnet_process_id_t peer, unsigned features,
+			int nblk, int blklen, srpc_client_rpc_t **rpc);
+void sfw_abort_rpc(srpc_client_rpc_t *rpc);
+void sfw_post_rpc(srpc_client_rpc_t *rpc);
+void sfw_client_rpc_done(srpc_client_rpc_t *rpc);
+void sfw_unpack_message(srpc_msg_t *msg);
+void sfw_free_pages(srpc_server_rpc_t *rpc);
+void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i);
+int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len,
+		    int sink);
+int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply);
+
+srpc_client_rpc_t *
+srpc_create_client_rpc(lnet_process_id_t peer, int service,
+		       int nbulkiov, int bulklen,
+		       void (*rpc_done)(srpc_client_rpc_t *),
+		       void (*rpc_fini)(srpc_client_rpc_t *), void *priv);
+void srpc_post_rpc(srpc_client_rpc_t *rpc);
+void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why);
+void srpc_free_bulk(srpc_bulk_t *bk);
+srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len,
+			     int sink);
+int srpc_send_rpc(swi_workitem_t *wi);
+int srpc_send_reply(srpc_server_rpc_t *rpc);
+int srpc_add_service(srpc_service_t *sv);
+int srpc_remove_service(srpc_service_t *sv);
+void srpc_shutdown_service(srpc_service_t *sv);
+void srpc_abort_service(srpc_service_t *sv);
+int srpc_finish_service(srpc_service_t *sv);
+int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer);
+void srpc_get_counters(srpc_counters_t *cnt);
+void srpc_set_counters(const srpc_counters_t *cnt);
+
+extern struct cfs_wi_sched *lst_sched_serial;
+extern struct cfs_wi_sched **lst_sched_test;
+
+static inline int
+srpc_serv_is_framework(struct srpc_service *svc)
+{
+	return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID;
+}
+
+static inline int
+swi_wi_action(cfs_workitem_t *wi)
+{
+	swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem);
+
+	return swi->swi_action(swi);
+}
+
+static inline void
+swi_init_workitem(swi_workitem_t *swi, void *data,
+		  swi_action_t action, struct cfs_wi_sched *sched)
+{
+	swi->swi_sched  = sched;
+	swi->swi_action = action;
+	swi->swi_state  = SWI_STATE_NEWBORN;
+	cfs_wi_init(&swi->swi_workitem, data, swi_wi_action);
+}
+
+static inline void
+swi_schedule_workitem(swi_workitem_t *wi)
+{
+	cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem);
+}
+
+static inline void
+swi_exit_workitem(swi_workitem_t *swi)
+{
+	cfs_wi_exit(swi->swi_sched, &swi->swi_workitem);
+}
+
+static inline int
+swi_deschedule_workitem(swi_workitem_t *swi)
+{
+	return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem);
+}
+
+
+int sfw_startup(void);
+int srpc_startup(void);
+void sfw_shutdown(void);
+void srpc_shutdown(void);
+
+static inline void
+srpc_destroy_client_rpc (srpc_client_rpc_t *rpc)
+{
+	LASSERT (rpc != NULL);
+	LASSERT (!srpc_event_pending(rpc));
+	LASSERT (atomic_read(&rpc->crpc_refcount) == 0);
+
+	if (rpc->crpc_fini == NULL) {
+		LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc));
+	} else {
+		(*rpc->crpc_fini) (rpc);
+	}
+
+	return;
+}
+
+static inline void
+srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer,
+		      int service, int nbulkiov, int bulklen,
+		      void (*rpc_done)(srpc_client_rpc_t *),
+		      void (*rpc_fini)(srpc_client_rpc_t *), void *priv)
+{
+	LASSERT (nbulkiov <= LNET_MAX_IOV);
+
+	memset(rpc, 0, offsetof(srpc_client_rpc_t,
+				crpc_bulk.bk_iovs[nbulkiov]));
+
+	INIT_LIST_HEAD(&rpc->crpc_list);
+	swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc,
+			  lst_sched_test[lnet_cpt_of_nid(peer.nid)]);
+	spin_lock_init(&rpc->crpc_lock);
+	atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */
+
+	rpc->crpc_dest	 = peer;
+	rpc->crpc_priv	 = priv;
+	rpc->crpc_service      = service;
+	rpc->crpc_bulk.bk_len  = bulklen;
+	rpc->crpc_bulk.bk_niov = nbulkiov;
+	rpc->crpc_done	 = rpc_done;
+	rpc->crpc_fini	 = rpc_fini;
+	LNetInvalidateHandle(&rpc->crpc_reqstmdh);
+	LNetInvalidateHandle(&rpc->crpc_replymdh);
+	LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh);
+
+	/* no event is expected at this point */
+	rpc->crpc_bulkev.ev_fired  =
+	rpc->crpc_reqstev.ev_fired =
+	rpc->crpc_replyev.ev_fired = 1;
+
+	rpc->crpc_reqstmsg.msg_magic   = SRPC_MSG_MAGIC;
+	rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION;
+	rpc->crpc_reqstmsg.msg_type    = srpc_service2request(service);
+	return;
+}
+
+static inline const char *
+swi_state2str (int state)
+{
+#define STATE2STR(x) case x: return #x
+	switch(state) {
+		default:
+			LBUG();
+		STATE2STR(SWI_STATE_NEWBORN);
+		STATE2STR(SWI_STATE_REPLY_SUBMITTED);
+		STATE2STR(SWI_STATE_REPLY_SENT);
+		STATE2STR(SWI_STATE_REQUEST_SUBMITTED);
+		STATE2STR(SWI_STATE_REQUEST_SENT);
+		STATE2STR(SWI_STATE_REPLY_RECEIVED);
+		STATE2STR(SWI_STATE_BULK_STARTED);
+		STATE2STR(SWI_STATE_DONE);
+	}
+#undef STATE2STR
+}
+
+#define UNUSED(x)       ( (void)(x) )
+
+
+#define selftest_wait_events()	cfs_pause(cfs_time_seconds(1) / 10)
+
+
+#define lst_wait_until(cond, lock, fmt, ...)				\
+do {									\
+	int __I = 2;							\
+	while (!(cond)) {						\
+		CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET,		\
+		       fmt, ## __VA_ARGS__);				\
+		spin_unlock(&(lock));					\
+									\
+		selftest_wait_events();					\
+									\
+		spin_lock(&(lock));					\
+	}								\
+} while (0)
+
+static inline void
+srpc_wait_service_shutdown(srpc_service_t *sv)
+{
+	int i = 2;
+
+	LASSERT(sv->sv_shuttingdown);
+
+	while (srpc_finish_service(sv) == 0) {
+		i++;
+		CDEBUG (((i & -i) == i) ? D_WARNING : D_NET,
+			"Waiting for %s service to shutdown...\n",
+			sv->sv_name);
+		selftest_wait_events();
+	}
+}
+
+#endif /* __SELFTEST_SELFTEST_H__ */
diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c
new file mode 100644
index 000000000000..2c078550277b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.c
@@ -0,0 +1,253 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.c
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include "selftest.h"
+
+
+/*
+ * Timers are implemented as a sorted queue of expiry times. The queue
+ * is slotted, with each slot holding timers which expire in a
+ * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are
+ * sorted by increasing expiry time. The number of slots is 2**7 (128),
+ * to cover a time period of 1024 seconds into the future before wrapping.
+ */
+#define STTIMER_MINPOLL	3   /* log2 min poll interval (8 s) */
+#define STTIMER_SLOTTIME       (1 << STTIMER_MINPOLL)
+#define STTIMER_SLOTTIMEMASK   (~(STTIMER_SLOTTIME - 1))
+#define STTIMER_NSLOTS	       (1 << 7)
+#define STTIMER_SLOT(t)	       (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \
+						    (STTIMER_NSLOTS - 1))])
+
+struct st_timer_data {
+	spinlock_t	 stt_lock;
+	/* start time of the slot processed previously */
+	cfs_time_t       stt_prev_slot;
+	struct list_head       stt_hash[STTIMER_NSLOTS];
+	int	      stt_shuttingdown;
+	wait_queue_head_t      stt_waitq;
+	int	      stt_nthreads;
+} stt_data;
+
+void
+stt_add_timer(stt_timer_t *timer)
+{
+	struct list_head *pos;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+	LASSERT (timer->stt_func != NULL);
+	LASSERT (list_empty(&timer->stt_list));
+	LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec()));
+
+	/* a simple insertion sort */
+	list_for_each_prev (pos, STTIMER_SLOT(timer->stt_expires)) {
+		stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list);
+
+		if (cfs_time_aftereq(timer->stt_expires, old->stt_expires))
+			break;
+	}
+	list_add(&timer->stt_list, pos);
+
+	spin_unlock(&stt_data.stt_lock);
+}
+
+/*
+ * The function returns whether it has deactivated a pending timer or not.
+ * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
+ * active timer returns 1.)
+ *
+ * CAVEAT EMPTOR:
+ * When 0 is returned, it is possible that timer->stt_func _is_ running on
+ * another CPU.
+ */
+int
+stt_del_timer (stt_timer_t *timer)
+{
+	int ret = 0;
+
+	spin_lock(&stt_data.stt_lock);
+
+	LASSERT (stt_data.stt_nthreads > 0);
+	LASSERT (!stt_data.stt_shuttingdown);
+
+	if (!list_empty(&timer->stt_list)) {
+		ret = 1;
+		list_del_init(&timer->stt_list);
+	}
+
+	spin_unlock(&stt_data.stt_lock);
+	return ret;
+}
+
+/* called with stt_data.stt_lock held */
+int
+stt_expire_list (struct list_head *slot, cfs_time_t now)
+{
+	int	  expired = 0;
+	stt_timer_t *timer;
+
+	while (!list_empty(slot)) {
+		timer = list_entry(slot->next, stt_timer_t, stt_list);
+
+		if (cfs_time_after(timer->stt_expires, now))
+			break;
+
+		list_del_init(&timer->stt_list);
+		spin_unlock(&stt_data.stt_lock);
+
+		expired++;
+		(*timer->stt_func) (timer->stt_data);
+
+		spin_lock(&stt_data.stt_lock);
+	}
+
+	return expired;
+}
+
+int
+stt_check_timers (cfs_time_t *last)
+{
+	int	expired = 0;
+	cfs_time_t now;
+	cfs_time_t this_slot;
+
+	now = cfs_time_current_sec();
+	this_slot = now & STTIMER_SLOTTIMEMASK;
+
+	spin_lock(&stt_data.stt_lock);
+
+	while (cfs_time_aftereq(this_slot, *last)) {
+		expired += stt_expire_list(STTIMER_SLOT(this_slot), now);
+		this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME);
+	}
+
+	*last = now & STTIMER_SLOTTIMEMASK;
+	spin_unlock(&stt_data.stt_lock);
+	return expired;
+}
+
+
+int
+stt_timer_main (void *arg)
+{
+	int rc = 0;
+	UNUSED(arg);
+
+	SET_BUT_UNUSED(rc);
+
+	cfs_block_allsigs();
+
+	while (!stt_data.stt_shuttingdown) {
+		stt_check_timers(&stt_data.stt_prev_slot);
+
+		rc = wait_event_timeout(stt_data.stt_waitq,
+					stt_data.stt_shuttingdown,
+					cfs_time_seconds(STTIMER_SLOTTIME));
+	}
+
+	spin_lock(&stt_data.stt_lock);
+	stt_data.stt_nthreads--;
+	spin_unlock(&stt_data.stt_lock);
+	return 0;
+}
+
+int
+stt_start_timer_thread (void)
+{
+	task_t *task;
+
+	LASSERT(!stt_data.stt_shuttingdown);
+
+	task = kthread_run(stt_timer_main, NULL, "st_timer");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	spin_lock(&stt_data.stt_lock);
+	stt_data.stt_nthreads++;
+	spin_unlock(&stt_data.stt_lock);
+	return 0;
+}
+
+
+int
+stt_startup (void)
+{
+	int rc = 0;
+	int i;
+
+	stt_data.stt_shuttingdown = 0;
+	stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK;
+
+	spin_lock_init(&stt_data.stt_lock);
+	for (i = 0; i < STTIMER_NSLOTS; i++)
+		INIT_LIST_HEAD(&stt_data.stt_hash[i]);
+
+	stt_data.stt_nthreads = 0;
+	init_waitqueue_head(&stt_data.stt_waitq);
+	rc = stt_start_timer_thread();
+	if (rc != 0)
+		CERROR ("Can't spawn timer thread: %d\n", rc);
+
+	return rc;
+}
+
+void
+stt_shutdown (void)
+{
+	int i;
+
+	spin_lock(&stt_data.stt_lock);
+
+	for (i = 0; i < STTIMER_NSLOTS; i++)
+		LASSERT (list_empty(&stt_data.stt_hash[i]));
+
+	stt_data.stt_shuttingdown = 1;
+
+	wake_up(&stt_data.stt_waitq);
+	lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock,
+		       "waiting for %d threads to terminate\n",
+		       stt_data.stt_nthreads);
+
+	spin_unlock(&stt_data.stt_lock);
+}
diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h
new file mode 100644
index 000000000000..56dbfe5ea1e5
--- /dev/null
+++ b/drivers/staging/lustre/lnet/selftest/timer.h
@@ -0,0 +1,53 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/selftest/timer.h
+ *
+ * Author: Isaac Huang <isaac@clusterfs.com>
+ */
+#ifndef __SELFTEST_TIMER_H__
+#define __SELFTEST_TIMER_H__
+
+typedef struct {
+	struct list_head	stt_list;
+	cfs_time_t	stt_expires;
+	void	    (*stt_func) (void *);
+	void	     *stt_data;
+} stt_timer_t;
+
+void stt_add_timer (stt_timer_t *timer);
+int stt_del_timer (stt_timer_t *timer);
+int stt_startup (void);
+void stt_shutdown (void);
+
+#endif /* __SELFTEST_TIMER_H__ */
author	Peng Tao <bergwolf@gmail.com>	2013-05-02 16:46:55 +0800
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2013-05-14 13:54:50 -0400
commit	d7e09d0397e84eefbabfd9cb353221f3c6448d83 (patch)
tree	caeeaffa2967acfe0988dc65cf04089c0a27c7aa /drivers/staging/lustre/lnet
parent	staging: vt6656: remove unused definitions (diff)
download	linux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.tar.xz linux-dev-d7e09d0397e84eefbabfd9cb353221f3c6448d83.zip