aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/staging/lustre/lnet/lnet
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging/lustre/lnet/lnet')
-rw-r--r--drivers/staging/lustre/lnet/lnet/Makefile8
-rw-r--r--drivers/staging/lustre/lnet/lnet/acceptor.c527
-rw-r--r--drivers/staging/lustre/lnet/lnet/api-errno.c39
-rw-r--r--drivers/staging/lustre/lnet/lnet/api-ni.c1941
-rw-r--r--drivers/staging/lustre/lnet/lnet/config.c1264
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-eq.c447
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-md.c451
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-me.c297
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-move.c2441
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-msg.c650
-rw-r--r--drivers/staging/lustre/lnet/lnet/lib-ptl.c938
-rw-r--r--drivers/staging/lustre/lnet/lnet/lo.c120
-rw-r--r--drivers/staging/lustre/lnet/lnet/module.c154
-rw-r--r--drivers/staging/lustre/lnet/lnet/peer.c337
-rw-r--r--drivers/staging/lustre/lnet/lnet/router.c1694
-rw-r--r--drivers/staging/lustre/lnet/lnet/router_proc.c950
16 files changed, 12258 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile
new file mode 100644
index 000000000000..1bd9ef774208
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_LNET) += lnet.o
+
+lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \
+ lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o \
+ router_proc.o acceptor.o peer.o
+
+
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c
new file mode 100644
index 000000000000..81ef28bbcba0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/acceptor.c
@@ -0,0 +1,527 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+
+static int accept_port = 988;
+static int accept_backlog = 127;
+static int accept_timeout = 5;
+
+struct {
+ int pta_shutdown;
+ socket_t *pta_sock;
+ struct completion pta_signal;
+} lnet_acceptor_state;
+
+int
+lnet_acceptor_port(void)
+{
+ return accept_port;
+}
+
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+ return (magic == constant ||
+ magic == __swab32(constant));
+}
+
+
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+static char *accept = "secure";
+
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+ "Accept connections (secure|all|none)");
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+ "Acceptor's port (same on all nodes)");
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+ "Acceptor's listen backlog");
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+ "Acceptor's timeout (seconds)");
+
+static char *accept_type = NULL;
+
+int
+lnet_acceptor_get_tunables(void)
+{
+ /* Userland acceptor uses 'accept_type' instead of 'accept', due to
+ * conflict with 'accept(2)', but kernel acceptor still uses 'accept'
+ * for compatibility. Hence the trick. */
+ accept_type = accept;
+ return 0;
+}
+
+int
+lnet_acceptor_timeout(void)
+{
+ return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid,
+ __u32 peer_ip, int peer_port)
+{
+ switch (rc) {
+ /* "normal" errors */
+ case -ECONNREFUSED:
+ CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was "
+ "refused: check that Lustre is running on that node.\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ CNETERR("Connection to %s at host %u.%u.%u.%u "
+ "was unreachable: the network or that node may "
+ "be down, or Lustre may be misconfigured.\n",
+ libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+ break;
+ case -ETIMEDOUT:
+ CNETERR("Connection to %s at host %u.%u.%u.%u on "
+ "port %d took too long: that node may be hung "
+ "or experiencing high load.\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -ECONNRESET:
+ LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u"
+ " on port %d was reset: "
+ "is it running a compatible version of "
+ "Lustre and is %s one of its NIDs?\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port,
+ libcfs_nid2str(peer_nid));
+ break;
+ case -EPROTO:
+ LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at "
+ "host %u.%u.%u.%u on port %d: is it running "
+ "a compatible version of Lustre?\n",
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ case -EADDRINUSE:
+ LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to "
+ "connect to %s at host %u.%u.%u.%u on port "
+ "%d\n", libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ default:
+ LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s"
+ " at host %u.%u.%u.%u on port %d\n", rc,
+ libcfs_nid2str(peer_nid),
+ HIPQUAD(peer_ip), peer_port);
+ break;
+ }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+int
+lnet_connect(socket_t **sockp, lnet_nid_t peer_nid,
+ __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+ lnet_acceptor_connreq_t cr;
+ socket_t *sock;
+ int rc;
+ int port;
+ int fatal;
+
+ CLASSERT (sizeof(cr) <= 16); /* not too big to be on the stack */
+
+ for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT;
+ port >= LNET_ACCEPTOR_MIN_RESERVED_PORT;
+ --port) {
+ /* Iterate through reserved ports. */
+
+ rc = libcfs_sock_connect(&sock, &fatal,
+ local_ip, port,
+ peer_ip, peer_port);
+ if (rc != 0) {
+ if (fatal)
+ goto failed;
+ continue;
+ }
+
+ CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ cr.acr_nid = peer_nid;
+
+ if (the_lnet.ln_testprotocompat != 0) {
+ /* single-shot proto check */
+ lnet_net_lock(LNET_LOCK_EX);
+ if ((the_lnet.ln_testprotocompat & 4) != 0) {
+ cr.acr_version++;
+ the_lnet.ln_testprotocompat &= ~4;
+ }
+ if ((the_lnet.ln_testprotocompat & 8) != 0) {
+ cr.acr_magic = LNET_PROTO_MAGIC;
+ the_lnet.ln_testprotocompat &= ~8;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+ if (rc != 0)
+ goto failed_sock;
+
+ *sockp = sock;
+ return 0;
+ }
+
+ rc = -EADDRINUSE;
+ goto failed;
+
+ failed_sock:
+ libcfs_sock_release(sock);
+ failed:
+ lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+ return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+
+/* Below is the code common for both kernel and MT user-space */
+
+int
+lnet_accept(socket_t *sock, __u32 magic)
+{
+ lnet_acceptor_connreq_t cr;
+ __u32 peer_ip;
+ int peer_port;
+ int rc;
+ int flip;
+ lnet_ni_t *ni;
+ char *str;
+
+ LASSERT (sizeof(cr) <= 16); /* not too big for the stack */
+
+ rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+ LASSERT (rc == 0); /* we succeeded before */
+
+ if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+ if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+ /* future version compatibility!
+ * When LNET unifies protocols over all LNDs, the first
+ * thing sent will be a version query. I send back
+ * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+ memset (&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response"
+ "to LNET magic from %u.%u.%u.%u: %d\n",
+ HIPQUAD(peer_ip), rc);
+ return -EPROTO;
+ }
+
+ if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+ str = "'old' socknal/tcpnal";
+ else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+ str = "'old' ranal";
+ else
+ str = "unrecognised";
+
+ LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u"
+ " magic %08x: %s acceptor protocol\n",
+ HIPQUAD(peer_ip), magic, str);
+ return -EPROTO;
+ }
+
+ flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+ rc = libcfs_sock_read(sock, &cr.acr_version,
+ sizeof(cr.acr_version),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request version from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ return -EIO;
+ }
+
+ if (flip)
+ __swab32s(&cr.acr_version);
+
+ if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+ /* future version compatibility!
+ * An acceptor-specific protocol rev will first send a version
+ * query. I send back my current version to tell her I'm
+ * "old". */
+ int peer_version = cr.acr_version;
+
+ memset (&cr, 0, sizeof(cr));
+ cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+ cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+ rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+ accept_timeout);
+
+ if (rc != 0)
+ CERROR("Error sending magic+version in response"
+ "to version %d from %u.%u.%u.%u: %d\n",
+ peer_version, HIPQUAD(peer_ip), rc);
+ return -EPROTO;
+ }
+
+ rc = libcfs_sock_read(sock, &cr.acr_nid,
+ sizeof(cr) -
+ offsetof(lnet_acceptor_connreq_t, acr_nid),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ return -EIO;
+ }
+
+ if (flip)
+ __swab64s(&cr.acr_nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+ if (ni == NULL || /* no matching net */
+ ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+ if (ni != NULL)
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u"
+ " for %s: No matching NI\n",
+ HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ if (ni->ni_lnd->lnd_accept == NULL) {
+ /* This catches a request for the loopback LND */
+ lnet_ni_decref(ni);
+ LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u"
+ " for %s: NI doesn not accept IP connections\n",
+ HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+ return -EPERM;
+ }
+
+ CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n",
+ libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip));
+
+ rc = ni->ni_lnd->lnd_accept(ni, sock);
+
+ lnet_ni_decref(ni);
+ return rc;
+}
+
+int
+lnet_acceptor(void *arg)
+{
+ socket_t *newsock;
+ int rc;
+ __u32 magic;
+ __u32 peer_ip;
+ int peer_port;
+ int secure = (int)((long_ptr_t)arg);
+
+ LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+ cfs_block_allsigs();
+
+ rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+ 0, accept_port, accept_backlog);
+ if (rc != 0) {
+ if (rc == -EADDRINUSE)
+ LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port"
+ " %d: port already in use\n",
+ accept_port);
+ else
+ LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port "
+ "%d: unexpected error %d\n",
+ accept_port, rc);
+
+ lnet_acceptor_state.pta_sock = NULL;
+ } else {
+ LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port);
+ }
+
+ /* set init status and unblock parent */
+ lnet_acceptor_state.pta_shutdown = rc;
+ complete(&lnet_acceptor_state.pta_signal);
+
+ if (rc != 0)
+ return rc;
+
+ while (!lnet_acceptor_state.pta_shutdown) {
+
+ rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+ if (rc != 0) {
+ if (rc != -EAGAIN) {
+ CWARN("Accept error %d: pausing...\n", rc);
+ cfs_pause(cfs_time_seconds(1));
+ }
+ continue;
+ }
+
+ /* maybe we're waken up with libcfs_sock_abort_accept() */
+ if (lnet_acceptor_state.pta_shutdown) {
+ libcfs_sock_release(newsock);
+ break;
+ }
+
+ rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+ if (rc != 0) {
+ CERROR("Can't determine new connection's address\n");
+ goto failed;
+ }
+
+ if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+ CERROR("Refusing connection from %u.%u.%u.%u: "
+ "insecure port %d\n",
+ HIPQUAD(peer_ip), peer_port);
+ goto failed;
+ }
+
+ rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+ accept_timeout);
+ if (rc != 0) {
+ CERROR("Error %d reading connection request from "
+ "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+ goto failed;
+ }
+
+ rc = lnet_accept(newsock, magic);
+ if (rc != 0)
+ goto failed;
+
+ continue;
+
+ failed:
+ libcfs_sock_release(newsock);
+ }
+
+ libcfs_sock_release(lnet_acceptor_state.pta_sock);
+ lnet_acceptor_state.pta_sock = NULL;
+
+ CDEBUG(D_NET, "Acceptor stopping\n");
+
+ /* unblock lnet_acceptor_stop() */
+ complete(&lnet_acceptor_state.pta_signal);
+ return 0;
+}
+
+static inline int
+accept2secure(const char *acc, long *sec)
+{
+ if (!strcmp(acc, "secure")) {
+ *sec = 1;
+ return 1;
+ } else if (!strcmp(acc, "all")) {
+ *sec = 0;
+ return 1;
+ } else if (!strcmp(acc, "none")) {
+ return 0;
+ } else {
+ LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n",
+ acc);
+ return -EINVAL;
+ }
+}
+
+int
+lnet_acceptor_start(void)
+{
+ int rc;
+ long rc2;
+ long secure;
+
+ LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+ rc = lnet_acceptor_get_tunables();
+ if (rc != 0)
+ return rc;
+
+
+ init_completion(&lnet_acceptor_state.pta_signal);
+ rc = accept2secure(accept_type, &secure);
+ if (rc <= 0) {
+ fini_completion(&lnet_acceptor_state.pta_signal);
+ return rc;
+ }
+
+ if (lnet_count_acceptor_nis() == 0) /* not required */
+ return 0;
+
+ rc2 = PTR_ERR(kthread_run(lnet_acceptor,
+ (void *)(ulong_ptr_t)secure,
+ "acceptor_%03ld", secure));
+ if (IS_ERR_VALUE(rc2)) {
+ CERROR("Can't start acceptor thread: %ld\n", rc2);
+ fini_completion(&lnet_acceptor_state.pta_signal);
+
+ return -ESRCH;
+ }
+
+ /* wait for acceptor to startup */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+ if (!lnet_acceptor_state.pta_shutdown) {
+ /* started OK */
+ LASSERT(lnet_acceptor_state.pta_sock != NULL);
+ return 0;
+ }
+
+ LASSERT(lnet_acceptor_state.pta_sock == NULL);
+ fini_completion(&lnet_acceptor_state.pta_signal);
+
+ return -ENETDOWN;
+}
+
+void
+lnet_acceptor_stop(void)
+{
+ if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+ return;
+
+ lnet_acceptor_state.pta_shutdown = 1;
+ libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+ /* block until acceptor signals exit */
+ wait_for_completion(&lnet_acceptor_state.pta_signal);
+
+ fini_completion(&lnet_acceptor_state.pta_signal);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c
new file mode 100644
index 000000000000..695b27265e23
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-errno.c
@@ -0,0 +1,39 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/api-errno.c
+ *
+ * Instantiate the string table of errors
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c
new file mode 100644
index 000000000000..e88bee362497
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/api-ni.c
@@ -0,0 +1,1941 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+#include <linux/log2.h>
+
+#define D_LNI D_CONSOLE
+
+lnet_t the_lnet; /* THE state of the network */
+EXPORT_SYMBOL(the_lnet);
+
+
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+ "LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+ "local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+ "routes to non-local networks");
+
+static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444,
+ "size of remote network hash table");
+
+char *
+lnet_get_routes(void)
+{
+ return routes;
+}
+
+char *
+lnet_get_networks(void)
+{
+ char *nets;
+ int rc;
+
+ if (*networks != 0 && *ip2nets != 0) {
+ LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or "
+ "'ip2nets' but not both at once\n");
+ return NULL;
+ }
+
+ if (*ip2nets != 0) {
+ rc = lnet_parse_ip2nets(&nets, ip2nets);
+ return (rc == 0) ? nets : NULL;
+ }
+
+ if (*networks != 0)
+ return networks;
+
+ return "tcp";
+}
+
+void
+lnet_init_locks(void)
+{
+ spin_lock_init(&the_lnet.ln_eq_wait_lock);
+ init_waitqueue_head(&the_lnet.ln_eq_waitq);
+ mutex_init(&the_lnet.ln_lnd_mutex);
+ mutex_init(&the_lnet.ln_api_mutex);
+}
+
+void
+lnet_fini_locks(void)
+{
+}
+
+
+static int
+lnet_create_remote_nets_table(void)
+{
+ int i;
+ struct list_head *hash;
+
+ LASSERT(the_lnet.ln_remote_nets_hash == NULL);
+ LASSERT(the_lnet.ln_remote_nets_hbits > 0);
+ LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create remote nets hash table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&hash[i]);
+ the_lnet.ln_remote_nets_hash = hash;
+ return 0;
+}
+
+static void
+lnet_destroy_remote_nets_table(void)
+{
+ int i;
+ struct list_head *hash;
+
+ if (the_lnet.ln_remote_nets_hash == NULL)
+ return;
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++)
+ LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i]));
+
+ LIBCFS_FREE(the_lnet.ln_remote_nets_hash,
+ LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash));
+ the_lnet.ln_remote_nets_hash = NULL;
+}
+
+static void
+lnet_destroy_locks(void)
+{
+ if (the_lnet.ln_res_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_res_lock);
+ the_lnet.ln_res_lock = NULL;
+ }
+
+ if (the_lnet.ln_net_lock != NULL) {
+ cfs_percpt_lock_free(the_lnet.ln_net_lock);
+ the_lnet.ln_net_lock = NULL;
+ }
+
+ lnet_fini_locks();
+}
+
+static int
+lnet_create_locks(void)
+{
+ lnet_init_locks();
+
+ the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_res_lock == NULL)
+ goto failed;
+
+ the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table());
+ if (the_lnet.ln_net_lock == NULL)
+ goto failed;
+
+ return 0;
+
+ failed:
+ lnet_destroy_locks();
+ return -ENOMEM;
+}
+
+void lnet_assert_wire_constants (void)
+{
+ /* Wire protocol assertions generated by 'wirecheck'
+ * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+ * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+ * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
+
+ /* Constants... */
+ CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+ CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+ CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+ CLASSERT (LNET_MSG_ACK == 0);
+ CLASSERT (LNET_MSG_PUT == 1);
+ CLASSERT (LNET_MSG_GET == 2);
+ CLASSERT (LNET_MSG_REPLY == 3);
+ CLASSERT (LNET_MSG_HELLO == 4);
+
+ /* Checks for struct ptl_handle_wire_t */
+ CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+ CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+ CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+ CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+ CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+ /* Checks for struct lnet_magicversion_t */
+ CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+ CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+ CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+ /* Checks for struct lnet_hdr_t */
+ CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+ CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
+
+ /* Ack */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+ /* Put */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+ /* Get */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+ /* Reply */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+ /* Hello */
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+ CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+ CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+ lnd_t *lnd;
+ struct list_head *tmp;
+
+ /* holding lnd mutex */
+ list_for_each (tmp, &the_lnet.ln_lnds) {
+ lnd = list_entry(tmp, lnd_t, lnd_list);
+
+ if ((int)lnd->lnd_type == type)
+ return lnd;
+ }
+
+ return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+ LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+ list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+ lnd->lnd_refcount = 0;
+
+ CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_register_lnd);
+
+void
+lnet_unregister_lnd (lnd_t *lnd)
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+ LASSERT (lnd->lnd_refcount == 0);
+
+ list_del (&lnd->lnd_list);
+ CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+}
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+void
+lnet_counters_get(lnet_counters_t *counters)
+{
+ lnet_counters_t *ctr;
+ int i;
+
+ memset(counters, 0, sizeof(*counters));
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+ counters->msgs_max += ctr->msgs_max;
+ counters->msgs_alloc += ctr->msgs_alloc;
+ counters->errors += ctr->errors;
+ counters->send_count += ctr->send_count;
+ counters->recv_count += ctr->recv_count;
+ counters->route_count += ctr->route_count;
+ counters->drop_length += ctr->drop_length;
+ counters->send_length += ctr->send_length;
+ counters->recv_length += ctr->recv_length;
+ counters->route_length += ctr->route_length;
+ counters->drop_length += ctr->drop_length;
+
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get);
+
+void
+lnet_counters_reset(void)
+{
+ lnet_counters_t *counters;
+ int i;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ cfs_percpt_for_each(counters, i, the_lnet.ln_counters)
+ memset(counters, 0, sizeof(lnet_counters_t));
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_reset);
+
+#ifdef LNET_USE_LIB_FREELIST
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+ char *space;
+
+ LASSERT (n > 0);
+
+ size += offsetof (lnet_freeobj_t, fo_contents);
+
+ LIBCFS_ALLOC(space, n * size);
+ if (space == NULL)
+ return (-ENOMEM);
+
+ INIT_LIST_HEAD (&fl->fl_list);
+ fl->fl_objs = space;
+ fl->fl_nobjs = n;
+ fl->fl_objsize = size;
+
+ do
+ {
+ memset (space, 0, size);
+ list_add ((struct list_head *)space, &fl->fl_list);
+ space += size;
+ } while (--n != 0);
+
+ return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+ struct list_head *el;
+ int count;
+
+ if (fl->fl_nobjs == 0)
+ return;
+
+ count = 0;
+ for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+ count++;
+
+ LASSERT (count == fl->fl_nobjs);
+
+ LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+ memset (fl, 0, sizeof (*fl));
+}
+
+#endif /* LNET_USE_LIB_FREELIST */
+
+__u64
+lnet_create_interface_cookie (void)
+{
+ /* NB the interface cookie in wire handles guards against delayed
+ * replies and ACKs appearing valid after reboot. Initialisation time,
+ * even if it's only implemented to millisecond resolution is probably
+ * easily good enough. */
+ struct timeval tv;
+ __u64 cookie;
+ do_gettimeofday(&tv);
+ cookie = tv.tv_sec;
+ cookie *= 1000000;
+ cookie += tv.tv_usec;
+ return cookie;
+}
+
+static char *
+lnet_res_type2str(int type)
+{
+ switch (type) {
+ default:
+ LBUG();
+ case LNET_COOKIE_TYPE_MD:
+ return "MD";
+ case LNET_COOKIE_TYPE_ME:
+ return "ME";
+ case LNET_COOKIE_TYPE_EQ:
+ return "EQ";
+ }
+}
+
+void
+lnet_res_container_cleanup(struct lnet_res_container *rec)
+{
+ int count = 0;
+
+ if (rec->rec_type == 0) /* not set yet, it's uninitialized */
+ return;
+
+ while (!list_empty(&rec->rec_active)) {
+ struct list_head *e = rec->rec_active.next;
+
+ list_del_init(e);
+ if (rec->rec_type == LNET_COOKIE_TYPE_EQ) {
+ lnet_eq_free(list_entry(e, lnet_eq_t, eq_list));
+
+ } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) {
+ lnet_md_free(list_entry(e, lnet_libmd_t, md_list));
+
+ } else { /* NB: Active MEs should be attached on portals */
+ LBUG();
+ }
+ count++;
+ }
+
+ if (count > 0) {
+ /* Found alive MD/ME/EQ, user really should unlink/free
+ * all of them before finalize LNet, but if someone didn't,
+ * we have to recycle garbage for him */
+ CERROR("%d active elements on exit of %s container\n",
+ count, lnet_res_type2str(rec->rec_type));
+ }
+
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&rec->rec_freelist);
+#endif
+ if (rec->rec_lh_hash != NULL) {
+ LIBCFS_FREE(rec->rec_lh_hash,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ rec->rec_lh_hash = NULL;
+ }
+
+ rec->rec_type = 0; /* mark it as finalized */
+}
+
+int
+lnet_res_container_setup(struct lnet_res_container *rec,
+ int cpt, int type, int objnum, int objsz)
+{
+ int rc = 0;
+ int i;
+
+ LASSERT(rec->rec_type == 0);
+
+ rec->rec_type = type;
+ INIT_LIST_HEAD(&rec->rec_active);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist));
+ rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz);
+ if (rc != 0)
+ goto out;
+#endif
+ rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type;
+
+ /* Arbitrary choice of hash table size */
+ LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt,
+ LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]));
+ if (rec->rec_lh_hash == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < LNET_LH_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&rec->rec_lh_hash[i]);
+
+ return 0;
+
+out:
+ CERROR("Failed to setup %s resource container\n",
+ lnet_res_type2str(type));
+ lnet_res_container_cleanup(rec);
+ return rc;
+}
+
+static void
+lnet_res_containers_destroy(struct lnet_res_container **recs)
+{
+ struct lnet_res_container *rec;
+ int i;
+
+ cfs_percpt_for_each(rec, i, recs)
+ lnet_res_container_cleanup(rec);
+
+ cfs_percpt_free(recs);
+}
+
+static struct lnet_res_container **
+lnet_res_containers_create(int type, int objnum, int objsz)
+{
+ struct lnet_res_container **recs;
+ struct lnet_res_container *rec;
+ int rc;
+ int i;
+
+ recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec));
+ if (recs == NULL) {
+ CERROR("Failed to allocate %s resource containers\n",
+ lnet_res_type2str(type));
+ return NULL;
+ }
+
+ cfs_percpt_for_each(rec, i, recs) {
+ rc = lnet_res_container_setup(rec, i, type, objnum, objsz);
+ if (rc != 0) {
+ lnet_res_containers_destroy(recs);
+ return NULL;
+ }
+ }
+
+ return recs;
+}
+
+lnet_libhandle_t *
+lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ struct list_head *head;
+ lnet_libhandle_t *lh;
+ unsigned int hash;
+
+ if ((cookie & LNET_COOKIE_MASK) != rec->rec_type)
+ return NULL;
+
+ hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS);
+ head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK];
+
+ list_for_each_entry(lh, head, lh_hash_chain) {
+ if (lh->lh_cookie == cookie)
+ return lh;
+ }
+
+ return NULL;
+}
+
+void
+lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh)
+{
+ /* ALWAYS called with lnet_res_lock held */
+ unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS;
+ unsigned int hash;
+
+ lh->lh_cookie = rec->rec_lh_cookie;
+ rec->rec_lh_cookie += 1 << ibits;
+
+ hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK;
+
+ list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]);
+}
+
+
+int lnet_unprepare(void);
+
+int
+lnet_prepare(lnet_pid_t requested_pid)
+{
+ /* Prepare to bring up the network */
+ struct lnet_res_container **recs;
+ int rc = 0;
+
+ LASSERT (the_lnet.ln_refcount == 0);
+
+ the_lnet.ln_routing = 0;
+
+ LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+ the_lnet.ln_pid = requested_pid;
+
+ INIT_LIST_HEAD(&the_lnet.ln_test_peers);
+ INIT_LIST_HEAD(&the_lnet.ln_nis);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_cpt);
+ INIT_LIST_HEAD(&the_lnet.ln_nis_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_routers);
+
+ rc = lnet_create_remote_nets_table();
+ if (rc != 0)
+ goto failed;
+
+ the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+ the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(lnet_counters_t));
+ if (the_lnet.ln_counters == NULL) {
+ CERROR("Failed to allocate counters for LNet\n");
+ rc = -ENOMEM;
+ goto failed;
+ }
+
+ rc = lnet_peer_tables_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_msg_containers_create();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0,
+ LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS,
+ sizeof(lnet_eq_t));
+ if (rc != 0)
+ goto failed;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES,
+ sizeof(lnet_me_t));
+ if (recs == NULL)
+ goto failed;
+
+ the_lnet.ln_me_containers = recs;
+
+ recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS,
+ sizeof(lnet_libmd_t));
+ if (recs == NULL)
+ goto failed;
+
+ the_lnet.ln_md_containers = recs;
+
+ rc = lnet_portals_create();
+ if (rc != 0) {
+ CERROR("Failed to create portals for LNet: %d\n", rc);
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_unprepare();
+ return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+ /* NB no LNET_LOCK since this is the last reference. All LND instances
+ * have shut down already, so it is safe to unlink and free all
+ * descriptors, even those that appear committed to a network op (eg MD
+ * with non-zero pending count) */
+
+ lnet_fail_nid(LNET_NID_ANY, 0);
+
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_test_peers));
+ LASSERT(list_empty(&the_lnet.ln_nis));
+ LASSERT(list_empty(&the_lnet.ln_nis_cpt));
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_portals_destroy();
+
+ if (the_lnet.ln_md_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_md_containers);
+ the_lnet.ln_md_containers = NULL;
+ }
+
+ if (the_lnet.ln_me_containers != NULL) {
+ lnet_res_containers_destroy(the_lnet.ln_me_containers);
+ the_lnet.ln_me_containers = NULL;
+ }
+
+ lnet_res_container_cleanup(&the_lnet.ln_eq_container);
+
+ lnet_msg_containers_destroy();
+ lnet_peer_tables_destroy();
+ lnet_rtrpools_free();
+
+ if (the_lnet.ln_counters != NULL) {
+ cfs_percpt_free(the_lnet.ln_counters);
+ the_lnet.ln_counters = NULL;
+ }
+ lnet_destroy_remote_nets_table();
+
+ return 0;
+}
+
+lnet_ni_t *
+lnet_net2ni_locked(__u32 net, int cpt)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+lnet_ni_t *
+lnet_net2ni(__u32 net)
+{
+ lnet_ni_t *ni;
+
+ lnet_net_lock(0);
+ ni = lnet_net2ni_locked(net, 0);
+ lnet_net_unlock(0);
+
+ return ni;
+}
+EXPORT_SYMBOL(lnet_net2ni);
+
+static unsigned int
+lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
+{
+ __u64 key = nid;
+ unsigned int val;
+
+ LASSERT(number >= 1 && number <= LNET_CPT_NUMBER);
+
+ if (number == 1)
+ return 0;
+
+ val = cfs_hash_long(key, LNET_CPT_BITS);
+ /* NB: LNET_CP_NUMBER doesn't have to be PO2 */
+ if (val < number)
+ return val;
+
+ return (unsigned int)(key + val + (val >> 1)) % number;
+}
+
+int
+lnet_cpt_of_nid_locked(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+
+ /* must called with hold of lnet_net_lock */
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ /* take lnet_net_lock(any) would be OK */
+ if (!list_empty(&the_lnet.ln_nis_cpt)) {
+ list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) {
+ if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid))
+ continue;
+
+ LASSERT(ni->ni_cpts != NULL);
+ return ni->ni_cpts[lnet_nid_cpt_hash
+ (nid, ni->ni_ncpts)];
+ }
+ }
+
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+}
+
+int
+lnet_cpt_of_nid(lnet_nid_t nid)
+{
+ int cpt;
+ int cpt2;
+
+ if (LNET_CPT_NUMBER == 1)
+ return 0; /* the only one */
+
+ if (list_empty(&the_lnet.ln_nis_cpt))
+ return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
+
+ cpt = lnet_net_lock_current();
+ cpt2 = lnet_cpt_of_nid_locked(nid);
+ lnet_net_unlock(cpt);
+
+ return cpt2;
+}
+EXPORT_SYMBOL(lnet_cpt_of_nid);
+
+int
+lnet_islocalnet(__u32 net)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+
+ ni = lnet_net2ni_locked(net, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+lnet_ni_t *
+lnet_nid2ni_locked(lnet_nid_t nid, int cpt)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+
+ LASSERT(cpt != LNET_LOCK_EX);
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == nid) {
+ lnet_ni_addref_locked(ni, cpt);
+ return ni;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_islocalnid(lnet_nid_t nid)
+{
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ ni = lnet_nid2ni_locked(nid, cpt);
+ if (ni != NULL)
+ lnet_ni_decref_locked(ni, cpt);
+ lnet_net_unlock(cpt);
+
+ return ni != NULL;
+}
+
+int
+lnet_count_acceptor_nis (void)
+{
+ /* Return the # of NIs that need the acceptor. */
+ int count = 0;
+ struct list_head *tmp;
+ struct lnet_ni *ni;
+ int cpt;
+
+ cpt = lnet_net_lock_current();
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (ni->ni_lnd->lnd_accept != NULL)
+ count++;
+ }
+
+ lnet_net_unlock(cpt);
+
+ return count;
+}
+
+static int
+lnet_ni_tq_credits(lnet_ni_t *ni)
+{
+ int credits;
+
+ LASSERT(ni->ni_ncpts >= 1);
+
+ if (ni->ni_ncpts == 1)
+ return ni->ni_maxtxcredits;
+
+ credits = ni->ni_maxtxcredits / ni->ni_ncpts;
+ credits = max(credits, 8 * ni->ni_peertxcredits);
+ credits = min(credits, ni->ni_maxtxcredits);
+
+ return credits;
+}
+
+void
+lnet_shutdown_lndnis (void)
+{
+ int i;
+ int islo;
+ lnet_ni_t *ni;
+
+ /* NB called holding the global mutex */
+
+ /* All quiet on the API front */
+ LASSERT(!the_lnet.ln_shutdown);
+ LASSERT(the_lnet.ln_refcount == 0);
+ LASSERT(list_empty(&the_lnet.ln_nis_zombie));
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_shutdown = 1; /* flag shutdown */
+
+ /* Unlink NIs from the global table */
+ while (!list_empty(&the_lnet.ln_nis)) {
+ ni = list_entry(the_lnet.ln_nis.next,
+ lnet_ni_t, ni_list);
+ /* move it to zombie list and nobody can find it anymore */
+ list_move(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */
+
+ if (!list_empty(&ni->ni_cptlist)) {
+ list_del_init(&ni->ni_cptlist);
+ lnet_ni_decref_locked(ni, 0);
+ }
+ }
+
+ /* Drop the cached eqwait NI. */
+ if (the_lnet.ln_eq_waitni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0);
+ the_lnet.ln_eq_waitni = NULL;
+ }
+
+ /* Drop the cached loopback NI. */
+ if (the_lnet.ln_loni != NULL) {
+ lnet_ni_decref_locked(the_lnet.ln_loni, 0);
+ the_lnet.ln_loni = NULL;
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* Clear lazy portals and drop delayed messages which hold refs
+ * on their lnet_msg_t::msg_rxpeer */
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ LNetClearLazyPortal(i);
+
+ /* Clear the peer table and wait for all peers to go (they hold refs on
+ * their NIs) */
+ lnet_peer_tables_cleanup();
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* Now wait for the NI's I just nuked to show up on ln_zombie_nis
+ * and shut them down in guaranteed thread context */
+ i = 2;
+ while (!list_empty(&the_lnet.ln_nis_zombie)) {
+ int *ref;
+ int j;
+
+ ni = list_entry(the_lnet.ln_nis_zombie.next,
+ lnet_ni_t, ni_list);
+ list_del_init(&ni->ni_list);
+ cfs_percpt_for_each(ref, j, ni->ni_refs) {
+ if (*ref == 0)
+ continue;
+ /* still busy, add it back to zombie list */
+ list_add(&ni->ni_list, &the_lnet.ln_nis_zombie);
+ break;
+ }
+
+ while (!list_empty(&ni->ni_list)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+ ++i;
+ if ((i & (-i)) == i) {
+ CDEBUG(D_WARNING,
+ "Waiting for zombie LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+ }
+ cfs_pause(cfs_time_seconds(1));
+ lnet_net_lock(LNET_LOCK_EX);
+ continue;
+ }
+
+ ni->ni_lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ islo = ni->ni_lnd->lnd_type == LOLND;
+
+ LASSERT (!in_interrupt ());
+ (ni->ni_lnd->lnd_shutdown)(ni);
+
+ /* can't deref lnd anymore now; it might have unregistered
+ * itself... */
+
+ if (!islo)
+ CDEBUG(D_LNI, "Removed LNI %s\n",
+ libcfs_nid2str(ni->ni_nid));
+
+ lnet_ni_free(ni);
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ the_lnet.ln_shutdown = 0;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_network_tokens != NULL) {
+ LIBCFS_FREE(the_lnet.ln_network_tokens,
+ the_lnet.ln_network_tokens_nob);
+ the_lnet.ln_network_tokens = NULL;
+ }
+}
+
+int
+lnet_startup_lndnis (void)
+{
+ lnd_t *lnd;
+ struct lnet_ni *ni;
+ struct lnet_tx_queue *tq;
+ struct list_head nilist;
+ int i;
+ int rc = 0;
+ int lnd_type;
+ int nicount = 0;
+ char *nets = lnet_get_networks();
+
+ INIT_LIST_HEAD(&nilist);
+
+ if (nets == NULL)
+ goto failed;
+
+ rc = lnet_parse_networks(&nilist, nets);
+ if (rc != 0)
+ goto failed;
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ LASSERT (libcfs_isknown_lnd(lnd_type));
+
+ if (lnd_type == CIBLND ||
+ lnd_type == OPENIBLND ||
+ lnd_type == IIBLND ||
+ lnd_type == VIBLND) {
+ CERROR("LND %s obsoleted\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+ lnd = lnet_find_lnd_by_type(lnd_type);
+
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ rc = request_module("%s",
+ libcfs_lnd2modname(lnd_type));
+ LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex);
+
+ lnd = lnet_find_lnd_by_type(lnd_type);
+ if (lnd == NULL) {
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+ CERROR("Can't load LND %s, module %s, rc=%d\n",
+ libcfs_lnd2str(lnd_type),
+ libcfs_lnd2modname(lnd_type), rc);
+ goto failed;
+ }
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount++;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ ni->ni_lnd = lnd;
+
+ rc = (lnd->lnd_startup)(ni);
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex);
+
+ if (rc != 0) {
+ LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s"
+ "\n",
+ rc, libcfs_lnd2str(lnd->lnd_type));
+ lnet_net_lock(LNET_LOCK_EX);
+ lnd->lnd_refcount--;
+ lnet_net_unlock(LNET_LOCK_EX);
+ goto failed;
+ }
+
+ LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL);
+
+ list_del(&ni->ni_list);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ /* refcount for ln_nis */
+ lnet_ni_addref_locked(ni, 0);
+ list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+ if (ni->ni_cpts != NULL) {
+ list_add_tail(&ni->ni_cptlist,
+ &the_lnet.ln_nis_cpt);
+ lnet_ni_addref_locked(ni, 0);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (lnd->lnd_type == LOLND) {
+ lnet_ni_addref(ni);
+ LASSERT (the_lnet.ln_loni == NULL);
+ the_lnet.ln_loni = ni;
+ continue;
+ }
+
+ if (ni->ni_peertxcredits == 0 ||
+ ni->ni_maxtxcredits == 0) {
+ LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n",
+ libcfs_lnd2str(lnd->lnd_type),
+ ni->ni_peertxcredits == 0 ?
+ "" : "per-peer ");
+ goto failed;
+ }
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ tq->tq_credits_min =
+ tq->tq_credits_max =
+ tq->tq_credits = lnet_ni_tq_credits(ni);
+ }
+
+ CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
+ libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits,
+ lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER,
+ ni->ni_peerrtrcredits, ni->ni_peertimeout);
+
+ nicount++;
+ }
+
+ if (the_lnet.ln_eq_waitni != NULL && nicount > 1) {
+ lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type;
+ LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network"
+ "\n",
+ libcfs_lnd2str(lnd_type));
+ goto failed;
+ }
+
+ return 0;
+
+ failed:
+ lnet_shutdown_lndnis();
+
+ while (!list_empty(&nilist)) {
+ ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ return -ENETDOWN;
+}
+
+/**
+ * Initialize LNet library.
+ *
+ * Only userspace program needs to call this function - it's automatically
+ * called in the kernel at module loading time. Caller has to call LNetFini()
+ * after a call to LNetInit(), if and only if the latter returned 0. It must
+ * be called exactly once.
+ *
+ * \return 0 on success, and -ve on failures.
+ */
+int
+LNetInit(void)
+{
+ int rc;
+
+ lnet_assert_wire_constants();
+ LASSERT(!the_lnet.ln_init);
+
+ memset(&the_lnet, 0, sizeof(the_lnet));
+
+ /* refer to global cfs_cpt_table for now */
+ the_lnet.ln_cpt_table = cfs_cpt_table;
+ the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table);
+
+ LASSERT(the_lnet.ln_cpt_number > 0);
+ if (the_lnet.ln_cpt_number > LNET_CPT_MAX) {
+ /* we are under risk of consuming all lh_cookie */
+ CERROR("Can't have %d CPTs for LNet (max allowed is %d), "
+ "please change setting of CPT-table and retry\n",
+ the_lnet.ln_cpt_number, LNET_CPT_MAX);
+ return -1;
+ }
+
+ while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number)
+ the_lnet.ln_cpt_bits++;
+
+ rc = lnet_create_locks();
+ if (rc != 0) {
+ CERROR("Can't create LNet global locks: %d\n", rc);
+ return -1;
+ }
+
+ the_lnet.ln_refcount = 0;
+ the_lnet.ln_init = 1;
+ LNetInvalidateHandle(&the_lnet.ln_rc_eqh);
+ INIT_LIST_HEAD(&the_lnet.ln_lnds);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
+ INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
+
+ /* The hash table size is the number of bits it takes to express the set
+ * ln_num_routes, minus 1 (better to under estimate than over so we
+ * don't waste memory). */
+ if (rnet_htable_size <= 0)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT;
+ else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX)
+ rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX;
+ the_lnet.ln_remote_nets_hbits = max_t(int, 1,
+ order_base_2(rnet_htable_size) - 1);
+
+ /* All LNDs apart from the LOLND are in separate modules. They
+ * register themselves when their module loads, and unregister
+ * themselves when their module is unloaded. */
+ lnet_register_lnd(&the_lolnd);
+ return 0;
+}
+EXPORT_SYMBOL(LNetInit);
+
+/**
+ * Finalize LNet library.
+ *
+ * Only userspace program needs to call this function. It can be called
+ * at most once.
+ *
+ * \pre LNetInit() called with success.
+ * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls.
+ */
+void
+LNetFini(void)
+{
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount == 0);
+
+ while (!list_empty(&the_lnet.ln_lnds))
+ lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+ lnd_t, lnd_list));
+ lnet_destroy_locks();
+
+ the_lnet.ln_init = 0;
+}
+EXPORT_SYMBOL(LNetFini);
+
+/**
+ * Set LNet PID and start LNet interfaces, routing, and forwarding.
+ *
+ * Userspace program should call this after a successful call to LNetInit().
+ * Users must call this function at least once before any other functions.
+ * For each successful call there must be a corresponding call to
+ * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is
+ * ignored.
+ *
+ * The PID used by LNet may be different from the one requested.
+ * See LNetGetId().
+ *
+ * \param requested_pid PID requested by the caller.
+ *
+ * \return >= 0 on success, and < 0 error code on failures.
+ */
+int
+LNetNIInit(lnet_pid_t requested_pid)
+{
+ int im_a_router = 0;
+ int rc;
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+ if (the_lnet.ln_refcount > 0) {
+ rc = the_lnet.ln_refcount++;
+ goto out;
+ }
+
+ lnet_get_tunables();
+
+ if (requested_pid == LNET_PID_ANY) {
+ /* Don't instantiate LNET just for me */
+ rc = -ENETDOWN;
+ goto failed0;
+ }
+
+ rc = lnet_prepare(requested_pid);
+ if (rc != 0)
+ goto failed0;
+
+ rc = lnet_startup_lndnis();
+ if (rc != 0)
+ goto failed1;
+
+ rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_check_routes();
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_rtrpools_alloc(im_a_router);
+ if (rc != 0)
+ goto failed2;
+
+ rc = lnet_acceptor_start();
+ if (rc != 0)
+ goto failed2;
+
+ the_lnet.ln_refcount = 1;
+ /* Now I may use my own API functions... */
+
+ /* NB router checker needs the_lnet.ln_ping_info in
+ * lnet_router_checker -> lnet_update_ni_status_locked */
+ rc = lnet_ping_target_init();
+ if (rc != 0)
+ goto failed3;
+
+ rc = lnet_router_checker_start();
+ if (rc != 0)
+ goto failed4;
+
+ lnet_proc_init();
+ goto out;
+
+ failed4:
+ lnet_ping_target_fini();
+ failed3:
+ the_lnet.ln_refcount = 0;
+ lnet_acceptor_stop();
+ failed2:
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ failed1:
+ lnet_unprepare();
+ failed0:
+ LASSERT (rc < 0);
+ out:
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return rc;
+}
+EXPORT_SYMBOL(LNetNIInit);
+
+/**
+ * Stop LNet interfaces, routing, and forwarding.
+ *
+ * Users must call this function once for each successful call to LNetNIInit().
+ * Once the LNetNIFini() operation has been started, the results of pending
+ * API operations are undefined.
+ *
+ * \return always 0 for current implementation.
+ */
+int
+LNetNIFini()
+{
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (the_lnet.ln_refcount != 1) {
+ the_lnet.ln_refcount--;
+ } else {
+ LASSERT (!the_lnet.ln_niinit_self);
+
+ lnet_proc_fini();
+ lnet_router_checker_stop();
+ lnet_ping_target_fini();
+
+ /* Teardown fns that use my own API functions BEFORE here */
+ the_lnet.ln_refcount = 0;
+
+ lnet_acceptor_stop();
+ lnet_destroy_routes();
+ lnet_shutdown_lndnis();
+ lnet_unprepare();
+ }
+
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(LNetNIFini);
+
+/**
+ * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet
+ * internal ioctl handler.
+ *
+ * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it.
+ *
+ * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer.
+ * The data will be printed to system console. Don't use it excessively.
+ * \param arg A pointer to lnet_process_id_t, process ID of the peer.
+ *
+ * \return Always return 0 when called by users directly (i.e., not via ioctl).
+ */
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+ struct libcfs_ioctl_data *data = arg;
+ lnet_process_id_t id = {0};
+ lnet_ni_t *ni;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ switch (cmd) {
+ case IOC_LIBCFS_GET_NI:
+ rc = LNetGetId(data->ioc_count, &id);
+ data->ioc_nid = id.nid;
+ return rc;
+
+ case IOC_LIBCFS_FAIL_NID:
+ return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+
+ case IOC_LIBCFS_ADD_ROUTE:
+ rc = lnet_add_route(data->ioc_net, data->ioc_count,
+ data->ioc_nid);
+ return (rc != 0) ? rc : lnet_check_routes();
+
+ case IOC_LIBCFS_DEL_ROUTE:
+ return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+ case IOC_LIBCFS_GET_ROUTE:
+ return lnet_get_route(data->ioc_count,
+ &data->ioc_net, &data->ioc_count,
+ &data->ioc_nid, &data->ioc_flags);
+ case IOC_LIBCFS_NOTIFY_ROUTER:
+ return lnet_notify(NULL, data->ioc_nid, data->ioc_flags,
+ cfs_time_current() -
+ cfs_time_seconds(cfs_time_current_sec() -
+ (time_t)data->ioc_u64[0]));
+
+ case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+ /* This can be removed once lustre stops calling it */
+ return 0;
+
+ case IOC_LIBCFS_LNET_DIST:
+ rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+ if (rc < 0 && rc != -EHOSTUNREACH)
+ return rc;
+
+ data->ioc_u32[0] = rc;
+ return 0;
+
+ case IOC_LIBCFS_TESTPROTOCOMPAT:
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_testprotocompat = data->ioc_flags;
+ lnet_net_unlock(LNET_LOCK_EX);
+ return 0;
+
+ case IOC_LIBCFS_PING:
+ id.nid = data->ioc_nid;
+ id.pid = data->ioc_u32[0];
+ rc = lnet_ping(id, data->ioc_u32[1], /* timeout */
+ (lnet_process_id_t *)data->ioc_pbuf1,
+ data->ioc_plen1/sizeof(lnet_process_id_t));
+ if (rc < 0)
+ return rc;
+ data->ioc_count = rc;
+ return 0;
+
+ case IOC_LIBCFS_DEBUG_PEER: {
+ /* CAVEAT EMPTOR: this one designed for calling directly; not
+ * via an ioctl */
+ id = *((lnet_process_id_t *) arg);
+
+ lnet_debug_peer(id.nid);
+
+ ni = lnet_net2ni(LNET_NIDNET(id.nid));
+ if (ni == NULL) {
+ CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id));
+ } else {
+ if (ni->ni_lnd->lnd_ctl == NULL) {
+ CDEBUG(D_WARNING, "No ctl for %s\n",
+ libcfs_id2str(id));
+ } else {
+ (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+ }
+
+ lnet_ni_decref(ni);
+ }
+ return 0;
+ }
+
+ default:
+ ni = lnet_net2ni(data->ioc_net);
+ if (ni == NULL)
+ return -EINVAL;
+
+ if (ni->ni_lnd->lnd_ctl == NULL)
+ rc = -EINVAL;
+ else
+ rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+ lnet_ni_decref(ni);
+ return rc;
+ }
+ /* not reached */
+}
+EXPORT_SYMBOL(LNetCtl);
+
+/**
+ * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that
+ * all interfaces share a same PID, as requested by LNetNIInit().
+ *
+ * \param index Index of the interface to look up.
+ * \param id On successful return, this location will hold the
+ * lnet_process_id_t ID of the interface.
+ *
+ * \retval 0 If an interface exists at \a index.
+ * \retval -ENOENT If no interface has been found.
+ */
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id)
+{
+ struct lnet_ni *ni;
+ struct list_head *tmp;
+ int cpt;
+ int rc = -ENOENT;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ if (index-- != 0)
+ continue;
+
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ id->nid = ni->ni_nid;
+ id->pid = the_lnet.ln_pid;
+ rc = 0;
+ break;
+ }
+
+ lnet_net_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetGetId);
+
+/**
+ * Print a string representation of handle \a h into buffer \a str of
+ * \a len bytes.
+ */
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h)
+{
+ snprintf(str, len, LPX64, h.cookie);
+}
+EXPORT_SYMBOL(LNetSnprintHandle);
+
+static int
+lnet_create_ping_info(void)
+{
+ int i;
+ int n;
+ int rc;
+ unsigned int infosz;
+ lnet_ni_t *ni;
+ lnet_process_id_t id;
+ lnet_ping_info_t *pinfo;
+
+ for (n = 0; ; n++) {
+ rc = LNetGetId(n, &id);
+ if (rc == -ENOENT)
+ break;
+
+ LASSERT (rc == 0);
+ }
+
+ infosz = offsetof(lnet_ping_info_t, pi_ni[n]);
+ LIBCFS_ALLOC(pinfo, infosz);
+ if (pinfo == NULL) {
+ CERROR("Can't allocate ping info[%d]\n", n);
+ return -ENOMEM;
+ }
+
+ pinfo->pi_nnis = n;
+ pinfo->pi_pid = the_lnet.ln_pid;
+ pinfo->pi_magic = LNET_PROTO_PING_MAGIC;
+ pinfo->pi_features = LNET_PING_FEAT_NI_STATUS;
+
+ for (i = 0; i < n; i++) {
+ lnet_ni_status_t *ns = &pinfo->pi_ni[i];
+
+ rc = LNetGetId(i, &id);
+ LASSERT (rc == 0);
+
+ ns->ns_nid = id.nid;
+ ns->ns_status = LNET_NI_STATUS_UP;
+
+ lnet_net_lock(0);
+
+ ni = lnet_nid2ni_locked(id.nid, 0);
+ LASSERT(ni != NULL);
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status == NULL);
+ ni->ni_status = ns;
+ lnet_ni_unlock(ni);
+
+ lnet_ni_decref_locked(ni, 0);
+ lnet_net_unlock(0);
+ }
+
+ the_lnet.ln_ping_info = pinfo;
+ return 0;
+}
+
+static void
+lnet_destroy_ping_info(void)
+{
+ struct lnet_ni *ni;
+
+ lnet_net_lock(0);
+
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ lnet_ni_lock(ni);
+ ni->ni_status = NULL;
+ lnet_ni_unlock(ni);
+ }
+
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(the_lnet.ln_ping_info,
+ offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]));
+ the_lnet.ln_ping_info = NULL;
+ return;
+}
+
+int
+lnet_ping_target_init(void)
+{
+ lnet_md_t md = {0};
+ lnet_handle_me_t meh;
+ lnet_process_id_t id;
+ int rc;
+ int rc2;
+ int infosz;
+
+ rc = lnet_create_ping_info();
+ if (rc != 0)
+ return rc;
+
+ /* We can have a tiny EQ since we only need to see the unlink event on
+ * teardown, which by definition is the last one! */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+ if (rc != 0) {
+ CERROR("Can't allocate ping EQ: %d\n", rc);
+ goto failed_0;
+ }
+
+ memset(&id, 0, sizeof(lnet_process_id_t));
+ id.nid = LNET_NID_ANY;
+ id.pid = LNET_PID_ANY;
+
+ rc = LNetMEAttach(LNET_RESERVED_PORTAL, id,
+ LNET_PROTO_PING_MATCHBITS, 0,
+ LNET_UNLINK, LNET_INS_AFTER,
+ &meh);
+ if (rc != 0) {
+ CERROR("Can't create ping ME: %d\n", rc);
+ goto failed_1;
+ }
+
+ /* initialize md content */
+ infosz = offsetof(lnet_ping_info_t,
+ pi_ni[the_lnet.ln_ping_info->pi_nnis]);
+ md.start = the_lnet.ln_ping_info;
+ md.length = infosz;
+ md.threshold = LNET_MD_THRESH_INF;
+ md.max_size = 0;
+ md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE |
+ LNET_MD_MANAGE_REMOTE;
+ md.user_ptr = NULL;
+ md.eq_handle = the_lnet.ln_ping_target_eq;
+
+ rc = LNetMDAttach(meh, md,
+ LNET_RETAIN,
+ &the_lnet.ln_ping_target_md);
+ if (rc != 0) {
+ CERROR("Can't attach ping MD: %d\n", rc);
+ goto failed_2;
+ }
+
+ return 0;
+
+ failed_2:
+ rc2 = LNetMEUnlink(meh);
+ LASSERT (rc2 == 0);
+ failed_1:
+ rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT (rc2 == 0);
+ failed_0:
+ lnet_destroy_ping_info();
+ return rc;
+}
+
+void
+lnet_ping_target_fini(void)
+{
+ lnet_event_t event;
+ int rc;
+ int which;
+ int timeout_ms = 1000;
+ sigset_t blocked = cfs_block_allsigs();
+
+ LNetMDUnlink(the_lnet.ln_ping_target_md);
+ /* NB md could be busy; this just starts the unlink */
+
+ for (;;) {
+ rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+ timeout_ms, &event, &which);
+
+ /* I expect overflow... */
+ LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+ if (rc == 0) {
+ /* timed out: provide a diagnostic */
+ CWARN("Still waiting for ping MD to unlink\n");
+ timeout_ms *= 2;
+ continue;
+ }
+
+ /* Got a valid event */
+ if (event.unlinked)
+ break;
+ }
+
+ rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+ LASSERT (rc == 0);
+ lnet_destroy_ping_info();
+ cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+ lnet_handle_eq_t eqh;
+ lnet_handle_md_t mdh;
+ lnet_event_t event;
+ lnet_md_t md = {0};
+ int which;
+ int unlinked = 0;
+ int replied = 0;
+ const int a_long_time = 60000; /* mS */
+ int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]);
+ lnet_ping_info_t *info;
+ lnet_process_id_t tmpid;
+ int i;
+ int nob;
+ int rc;
+ int rc2;
+ sigset_t blocked;
+
+ if (n_ids <= 0 ||
+ id.nid == LNET_NID_ANY ||
+ timeout_ms > 500000 || /* arbitrary limit! */
+ n_ids > 20) /* arbitrary limit! */
+ return -EINVAL;
+
+ if (id.pid == LNET_PID_ANY)
+ id.pid = LUSTRE_SRV_LNET_PID;
+
+ LIBCFS_ALLOC(info, infosz);
+ if (info == NULL)
+ return -ENOMEM;
+
+ /* NB 2 events max (including any unlink event) */
+ rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ: %d\n", rc);
+ goto out_0;
+ }
+
+ /* initialize md content */
+ md.start = info;
+ md.length = infosz;
+ md.threshold = 2; /*GET/REPLY*/
+ md.max_size = 0;
+ md.options = LNET_MD_TRUNCATE;
+ md.user_ptr = NULL;
+ md.eq_handle = eqh;
+
+ rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+ if (rc != 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out_1;
+ }
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id,
+ LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ if (rc != 0) {
+ /* Don't CERROR; this could be deliberate! */
+
+ rc2 = LNetMDUnlink(mdh);
+ LASSERT (rc2 == 0);
+
+ /* NB must wait for the UNLINK event below... */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ }
+
+ do {
+ /* MUST block for unlink to complete */
+ if (unlinked)
+ blocked = cfs_block_allsigs();
+
+ rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+ if (unlinked)
+ cfs_restore_sigs(blocked);
+
+ CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+ (rc2 <= 0) ? -1 : event.type,
+ (rc2 <= 0) ? -1 : event.status,
+ (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+ LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */
+
+ if (rc2 <= 0 || event.status != 0) {
+ /* timeout or error */
+ if (!replied && rc == 0)
+ rc = (rc2 < 0) ? rc2 :
+ (rc2 == 0) ? -ETIMEDOUT :
+ event.status;
+
+ if (!unlinked) {
+ /* Ensure completion in finite time... */
+ LNetMDUnlink(mdh);
+ /* No assertion (racing with network) */
+ unlinked = 1;
+ timeout_ms = a_long_time;
+ } else if (rc2 == 0) {
+ /* timed out waiting for unlink */
+ CWARN("ping %s: late network completion\n",
+ libcfs_id2str(id));
+ }
+ } else if (event.type == LNET_EVENT_REPLY) {
+ replied = 1;
+ rc = event.mlength;
+ }
+
+ } while (rc2 <= 0 || !event.unlinked);
+
+ if (!replied) {
+ if (rc >= 0)
+ CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+ libcfs_id2str(id));
+ rc = -EIO;
+ goto out_1;
+ }
+
+ nob = rc;
+ LASSERT (nob >= 0 && nob <= infosz);
+
+ rc = -EPROTO; /* if I can't parse... */
+
+ if (nob < 8) {
+ /* can't check magic/version */
+ CERROR("%s: ping info too short %d\n",
+ libcfs_id2str(id), nob);
+ goto out_1;
+ }
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+ lnet_swap_pinginfo(info);
+ } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CERROR("%s: Unexpected magic %08x\n",
+ libcfs_id2str(id), info->pi_magic);
+ goto out_1;
+ }
+
+ if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) {
+ CERROR("%s: ping w/o NI status: 0x%x\n",
+ libcfs_id2str(id), info->pi_features);
+ goto out_1;
+ }
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) {
+ CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[0]));
+ goto out_1;
+ }
+
+ if (info->pi_nnis < n_ids)
+ n_ids = info->pi_nnis;
+
+ if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) {
+ CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id),
+ nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids]));
+ goto out_1;
+ }
+
+ rc = -EFAULT; /* If I SEGV... */
+
+ for (i = 0; i < n_ids; i++) {
+ tmpid.pid = info->pi_pid;
+ tmpid.nid = info->pi_ni[i].ns_nid;
+ if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+ goto out_1;
+ }
+ rc = info->pi_nnis;
+
+ out_1:
+ rc2 = LNetEQFree(eqh);
+ if (rc2 != 0)
+ CERROR("rc2 %d\n", rc2);
+ LASSERT (rc2 == 0);
+
+ out_0:
+ LIBCFS_FREE(info, infosz);
+ return rc;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c
new file mode 100644
index 000000000000..28711e6e8b03
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/config.c
@@ -0,0 +1,1264 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+typedef struct { /* tmp struct for parsing routes */
+ struct list_head ltb_list; /* stash on lists */
+ int ltb_size; /* allocated size */
+ char ltb_text[0]; /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0; /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB (4<<10)
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+ static char dots[LNET_SINGLE_TEXTBUF_NOB];
+ static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+
+ memset(dots, '.', sizeof(dots));
+ dots[sizeof(dots)-1] = 0;
+ memset(dashes, '-', sizeof(dashes));
+ dashes[sizeof(dashes)-1] = 0;
+
+ LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str);
+ LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n",
+ (int)strlen(name), dots, offset, dots,
+ (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+ switch (c) {
+ case '\n':
+ case '\r':
+ case ';':
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+ struct list_head *tmp;
+ lnet_ni_t *ni;
+
+ list_for_each (tmp, nilist) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+ if (LNET_NIDNET(ni->ni_nid) == net)
+ return 0;
+ }
+
+ return 1;
+}
+
+void
+lnet_ni_free(struct lnet_ni *ni)
+{
+ if (ni->ni_refs != NULL)
+ cfs_percpt_free(ni->ni_refs);
+
+ if (ni->ni_tx_queues != NULL)
+ cfs_percpt_free(ni->ni_tx_queues);
+
+ if (ni->ni_cpts != NULL)
+ cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts);
+
+ LIBCFS_FREE(ni, sizeof(*ni));
+}
+
+lnet_ni_t *
+lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist)
+{
+ struct lnet_tx_queue *tq;
+ struct lnet_ni *ni;
+ int rc;
+ int i;
+
+ if (!lnet_net_unique(net, nilist)) {
+ LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ni, sizeof(*ni));
+ if (ni == NULL) {
+ CERROR("Out of memory creating network %s\n",
+ libcfs_net2str(net));
+ return NULL;
+ }
+
+ spin_lock_init(&ni->ni_lock);
+ INIT_LIST_HEAD(&ni->ni_cptlist);
+ ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_refs[0]));
+ if (ni->ni_refs == NULL)
+ goto failed;
+
+ ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ni->ni_tx_queues[0]));
+ if (ni->ni_tx_queues == NULL)
+ goto failed;
+
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues)
+ INIT_LIST_HEAD(&tq->tq_delayed);
+
+ if (el == NULL) {
+ ni->ni_cpts = NULL;
+ ni->ni_ncpts = LNET_CPT_NUMBER;
+ } else {
+ rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts);
+ if (rc <= 0) {
+ CERROR("Failed to set CPTs for NI %s: %d\n",
+ libcfs_net2str(net), rc);
+ goto failed;
+ }
+
+ LASSERT(rc <= LNET_CPT_NUMBER);
+ if (rc == LNET_CPT_NUMBER) {
+ LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0]));
+ ni->ni_cpts = NULL;
+ }
+
+ ni->ni_ncpts = rc;
+ }
+
+ /* LND will fill in the address part of the NID */
+ ni->ni_nid = LNET_MKNID(net, 0);
+ ni->ni_last_alive = cfs_time_current_sec();
+ list_add_tail(&ni->ni_list, nilist);
+ return ni;
+ failed:
+ lnet_ni_free(ni);
+ return NULL;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+ struct cfs_expr_list *el = NULL;
+ int tokensize = strlen(networks) + 1;
+ char *tokens;
+ char *str;
+ char *tmp;
+ struct lnet_ni *ni;
+ __u32 net;
+ int nnets = 0;
+
+ if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _WAY_ conservative */
+ LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too "
+ "long\n");
+ return -EINVAL;
+ }
+
+ LIBCFS_ALLOC(tokens, tokensize);
+ if (tokens == NULL) {
+ CERROR("Can't allocate net tokens\n");
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_network_tokens = tokens;
+ the_lnet.ln_network_tokens_nob = tokensize;
+ memcpy (tokens, networks, tokensize);
+ str = tmp = tokens;
+
+ /* Add in the loopback network */
+ ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ while (str != NULL && *str != 0) {
+ char *comma = strchr(str, ',');
+ char *bracket = strchr(str, '(');
+ char *square = strchr(str, '[');
+ char *iface;
+ int niface;
+ int rc;
+
+ /* NB we don't check interface conflicts here; it's the LNDs
+ * responsibility (if it cares at all) */
+
+ if (square != NULL && (comma == NULL || square < comma)) {
+ /* i.e: o2ib0(ib0)[1,2], number between square
+ * brackets are CPTs this NI needs to be bond */
+ if (bracket != NULL && bracket > square) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ tmp = strchr(square, ']');
+ if (tmp == NULL) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ rc = cfs_expr_list_parse(square, tmp - square + 1,
+ 0, LNET_CPT_NUMBER - 1, &el);
+ if (rc != 0) {
+ tmp = square;
+ goto failed_syntax;
+ }
+
+ while (square <= tmp)
+ *square++ = ' ';
+ }
+
+ if (bracket == NULL ||
+ (comma != NULL && comma < bracket)) {
+
+ /* no interface list specified */
+
+ if (comma != NULL)
+ *comma++ = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ LCONSOLE_ERROR_MSG(0x113, "Unrecognised network"
+ " type\n");
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
+ lnet_ni_alloc(net, el, nilist) == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ str = comma;
+ continue;
+ }
+
+ *bracket = 0;
+ net = libcfs_str2net(cfs_trimwhite(str));
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ tmp = str;
+ goto failed_syntax;
+ }
+
+ nnets++;
+ ni = lnet_ni_alloc(net, el, nilist);
+ if (ni == NULL)
+ goto failed;
+
+ if (el != NULL) {
+ cfs_expr_list_free(el);
+ el = NULL;
+ }
+
+ niface = 0;
+ iface = bracket + 1;
+
+ bracket = strchr(iface, ')');
+ if (bracket == NULL) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ *bracket = 0;
+ do {
+ comma = strchr(iface, ',');
+ if (comma != NULL)
+ *comma++ = 0;
+
+ iface = cfs_trimwhite(iface);
+ if (*iface == 0) {
+ tmp = iface;
+ goto failed_syntax;
+ }
+
+ if (niface == LNET_MAX_INTERFACES) {
+ LCONSOLE_ERROR_MSG(0x115, "Too many interfaces "
+ "for net %s\n",
+ libcfs_net2str(net));
+ goto failed;
+ }
+
+ ni->ni_interfaces[niface++] = iface;
+ iface = comma;
+ } while (iface != NULL);
+
+ str = bracket + 1;
+ comma = strchr(bracket + 1, ',');
+ if (comma != NULL) {
+ *comma = 0;
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ str = comma + 1;
+ continue;
+ }
+
+ str = cfs_trimwhite(str);
+ if (*str != 0) {
+ tmp = str;
+ goto failed_syntax;
+ }
+ }
+
+ LASSERT(!list_empty(nilist));
+ return 0;
+
+ failed_syntax:
+ lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp));
+ failed:
+ while (!list_empty(nilist)) {
+ ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+ list_del(&ni->ni_list);
+ lnet_ni_free(ni);
+ }
+
+ if (el != NULL)
+ cfs_expr_list_free(el);
+
+ LIBCFS_FREE(tokens, tokensize);
+ the_lnet.ln_network_tokens = NULL;
+
+ return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+ lnet_text_buf_t *ltb;
+ int nob;
+
+ /* NB allocate space for the terminating 0 */
+ nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+ if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+ /* _way_ conservative for "route net gateway..." */
+ CERROR("text buffer too big\n");
+ return NULL;
+ }
+
+ if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+ CERROR("Too many text buffers\n");
+ return NULL;
+ }
+
+ LIBCFS_ALLOC(ltb, nob);
+ if (ltb == NULL)
+ return NULL;
+
+ ltb->ltb_size = nob;
+ ltb->ltb_text[0] = 0;
+ lnet_tbnob += nob;
+ return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+ lnet_tbnob -= ltb->ltb_size;
+ LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+ lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+ struct list_head *tmp;
+ lnet_text_buf_t *ltb;
+
+ list_for_each (tmp, tbs) {
+ ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+ CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+ }
+
+ CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+ struct list_head pending;
+ char *sep;
+ int nob;
+ int i;
+ lnet_text_buf_t *ltb;
+
+ INIT_LIST_HEAD(&pending);
+
+ /* Split 'str' into separate commands */
+ for (;;) {
+ /* skip leading whitespace */
+ while (cfs_iswhite(*str))
+ str++;
+
+ /* scan for separator or comment */
+ for (sep = str; *sep != 0; sep++)
+ if (lnet_issep(*sep) || *sep == '#')
+ break;
+
+ nob = (int)(sep - str);
+ if (nob > 0) {
+ ltb = lnet_new_text_buf(nob);
+ if (ltb == NULL) {
+ lnet_free_text_bufs(&pending);
+ return -1;
+ }
+
+ for (i = 0; i < nob; i++)
+ if (cfs_iswhite(str[i]))
+ ltb->ltb_text[i] = ' ';
+ else
+ ltb->ltb_text[i] = str[i];
+
+ ltb->ltb_text[nob] = 0;
+
+ list_add_tail(&ltb->ltb_list, &pending);
+ }
+
+ if (*sep == '#') {
+ /* scan for separator */
+ do {
+ sep++;
+ } while (*sep != 0 && !lnet_issep(*sep));
+ }
+
+ if (*sep == 0)
+ break;
+
+ str = sep + 1;
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list,
+ char *str, char *sep1, char *sep2,
+ char *item, int itemlen)
+{
+ int len1 = (int)(sep1 - str);
+ int len2 = strlen(sep2 + 1);
+ lnet_text_buf_t *ltb;
+
+ LASSERT (*sep1 == '[');
+ LASSERT (*sep2 == ']');
+
+ ltb = lnet_new_text_buf(len1 + itemlen + len2);
+ if (ltb == NULL)
+ return -ENOMEM;
+
+ memcpy(ltb->ltb_text, str, len1);
+ memcpy(&ltb->ltb_text[len1], item, itemlen);
+ memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+ ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+ list_add_tail(&ltb->ltb_list, list);
+ return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+ char num[16];
+ struct list_head pending;
+ char *sep;
+ char *sep2;
+ char *parsed;
+ char *enditem;
+ int lo;
+ int hi;
+ int stride;
+ int i;
+ int nob;
+ int scanned;
+
+ INIT_LIST_HEAD(&pending);
+
+ sep = strchr(str, '[');
+ if (sep == NULL) /* nothing to expand */
+ return 0;
+
+ sep2 = strchr(sep, ']');
+ if (sep2 == NULL)
+ goto failed;
+
+ for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+ enditem = ++parsed;
+ while (enditem < sep2 && *enditem != ',')
+ enditem++;
+
+ if (enditem == parsed) /* no empty items */
+ goto failed;
+
+ if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+ if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+ /* simple string enumeration */
+ if (lnet_expand1tb(&pending, str, sep, sep2,
+ parsed, (int)(enditem - parsed)) != 0)
+ goto failed;
+
+ continue;
+ }
+
+ stride = 1;
+ }
+
+ /* range expansion */
+
+ if (enditem != parsed + scanned) /* no trailing junk */
+ goto failed;
+
+ if (hi < 0 || lo < 0 || stride < 0 || hi < lo ||
+ (hi - lo) % stride != 0)
+ goto failed;
+
+ for (i = lo; i <= hi; i += stride) {
+
+ snprintf(num, sizeof(num), "%d", i);
+ nob = strlen(num);
+ if (nob + 1 == sizeof(num))
+ goto failed;
+
+ if (lnet_expand1tb(&pending, str, sep, sep2,
+ num, nob) != 0)
+ goto failed;
+ }
+ }
+
+ list_splice(&pending, tbs->prev);
+ return 1;
+
+ failed:
+ lnet_free_text_bufs(&pending);
+ return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+ int len = strlen(str);
+ int nob = len;
+
+ return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+ nob == len &&
+ *hops > 0 && *hops < 256);
+}
+
+
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+ /* static scratch buffer OK (single threaded) */
+ static char cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head nets;
+ struct list_head gateways;
+ struct list_head *tmp1;
+ struct list_head *tmp2;
+ __u32 net;
+ lnet_nid_t nid;
+ lnet_text_buf_t *ltb;
+ int rc;
+ char *sep;
+ char *token = str;
+ int ntokens = 0;
+ int myrc = -1;
+ unsigned int hops;
+ int got_hops = 0;
+
+ INIT_LIST_HEAD(&gateways);
+ INIT_LIST_HEAD(&nets);
+
+ /* save a copy of the string for error messages */
+ strncpy(cmd, str, sizeof(cmd) - 1);
+ cmd[sizeof(cmd) - 1] = 0;
+
+ sep = str;
+ for (;;) {
+ /* scan for token start */
+ while (cfs_iswhite(*sep))
+ sep++;
+ if (*sep == 0) {
+ if (ntokens < (got_hops ? 3 : 2))
+ goto token_error;
+ break;
+ }
+
+ ntokens++;
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !cfs_iswhite(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens == 1) {
+ tmp2 = &nets; /* expanding nets */
+ } else if (ntokens == 2 &&
+ lnet_parse_hops(token, &hops)) {
+ got_hops = 1; /* got a hop count */
+ continue;
+ } else {
+ tmp2 = &gateways; /* expanding gateways */
+ }
+
+ ltb = lnet_new_text_buf(strlen(token));
+ if (ltb == NULL)
+ goto out;
+
+ strcpy(ltb->ltb_text, token);
+ tmp1 = &ltb->ltb_list;
+ list_add_tail(tmp1, tmp2);
+
+ while (tmp1 != tmp2) {
+ ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+ rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+ if (rc < 0)
+ goto token_error;
+
+ tmp1 = tmp1->next;
+
+ if (rc > 0) { /* expanded! */
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ continue;
+ }
+
+ if (ntokens == 1) {
+ net = libcfs_str2net(ltb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND)
+ goto token_error;
+ } else {
+ nid = libcfs_str2nid(ltb->ltb_text);
+ if (nid == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ goto token_error;
+ }
+ }
+ }
+
+ if (!got_hops)
+ hops = 1;
+
+ LASSERT (!list_empty(&nets));
+ LASSERT (!list_empty(&gateways));
+
+ list_for_each (tmp1, &nets) {
+ ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+ net = libcfs_str2net(ltb->ltb_text);
+ LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each (tmp2, &gateways) {
+ ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+ nid = libcfs_str2nid(ltb->ltb_text);
+ LASSERT (nid != LNET_NID_ANY);
+
+ if (lnet_islocalnid(nid)) {
+ *im_a_router = 1;
+ continue;
+ }
+
+ rc = lnet_add_route (net, hops, nid);
+ if (rc != 0) {
+ CERROR("Can't create route "
+ "to %s via %s\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid));
+ goto out;
+ }
+ }
+ }
+
+ myrc = 0;
+ goto out;
+
+ token_error:
+ lnet_syntax("routes", cmd, (int)(token - str), strlen(token));
+ out:
+ lnet_free_text_bufs(&nets);
+ lnet_free_text_bufs(&gateways);
+ return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+ lnet_text_buf_t *ltb;
+
+ while (!list_empty(tbs)) {
+ ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+ if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+ lnet_free_text_bufs(tbs);
+ return -EINVAL;
+ }
+
+ list_del(&ltb->ltb_list);
+ lnet_free_text_buf(ltb);
+ }
+
+ return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+ struct list_head tbs;
+ int rc = 0;
+
+ *im_a_router = 0;
+
+ INIT_LIST_HEAD(&tbs);
+
+ if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+ CERROR("Error parsing routes\n");
+ rc = -EINVAL;
+ } else {
+ rc = lnet_parse_route_tbs(&tbs, im_a_router);
+ }
+
+ LASSERT (lnet_tbnob == 0);
+ return rc;
+}
+
+int
+lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip)
+{
+ LIST_HEAD (list);
+ int rc;
+ int i;
+
+ rc = cfs_ip_addr_parse(token, len, &list);
+ if (rc != 0)
+ return rc;
+
+ for (rc = i = 0; !rc && i < nip; i++)
+ rc = cfs_ip_addr_match(ipaddrs[i], &list);
+
+ cfs_ip_addr_free(&list);
+
+ return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+ static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+ int matched = 0;
+ int ntokens = 0;
+ int len;
+ char *net = NULL;
+ char *sep;
+ char *token;
+ int rc;
+
+ LASSERT (strlen(net_entry) < sizeof(tokens));
+
+ /* work on a copy of the string */
+ strcpy(tokens, net_entry);
+ sep = tokens;
+ for (;;) {
+ /* scan for token start */
+ while (cfs_iswhite(*sep))
+ sep++;
+ if (*sep == 0)
+ break;
+
+ token = sep++;
+
+ /* scan for token end */
+ while (*sep != 0 && !cfs_iswhite(*sep))
+ sep++;
+ if (*sep != 0)
+ *sep++ = 0;
+
+ if (ntokens++ == 0) {
+ net = token;
+ continue;
+ }
+
+ len = strlen(token);
+
+ rc = lnet_match_network_token(token, len, ipaddrs, nip);
+ if (rc < 0) {
+ lnet_syntax("ip2nets", net_entry,
+ (int)(token - tokens), len);
+ return rc;
+ }
+
+ matched |= (rc != 0);
+ }
+
+ if (!matched)
+ return 0;
+
+ strcpy(net_entry, net); /* replace with matched net */
+ return 1;
+}
+
+__u32
+lnet_netspec2net(char *netspec)
+{
+ char *bracket = strchr(netspec, '(');
+ __u32 net;
+
+ if (bracket != NULL)
+ *bracket = 0;
+
+ net = libcfs_str2net(netspec);
+
+ if (bracket != NULL)
+ *bracket = '(';
+
+ return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+ int offset = 0;
+ int offset2;
+ int len;
+ lnet_text_buf_t *tb;
+ lnet_text_buf_t *tb2;
+ struct list_head *t;
+ char *sep;
+ char *bracket;
+ __u32 net;
+
+ LASSERT (!list_empty(nets));
+ LASSERT (nets->next == nets->prev); /* single entry */
+
+ tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+ for (;;) {
+ sep = strchr(tb->ltb_text, ',');
+ bracket = strchr(tb->ltb_text, '(');
+
+ if (sep != NULL &&
+ bracket != NULL &&
+ bracket < sep) {
+ /* netspec lists interfaces... */
+
+ offset2 = offset + (int)(bracket - tb->ltb_text);
+ len = strlen(bracket);
+
+ bracket = strchr(bracket + 1, ')');
+
+ if (bracket == NULL ||
+ !(bracket[1] == ',' || bracket[1] == 0)) {
+ lnet_syntax("ip2nets", source, offset2, len);
+ return -EINVAL;
+ }
+
+ sep = (bracket[1] == 0) ? NULL : bracket + 1;
+ }
+
+ if (sep != NULL)
+ *sep++ = 0;
+
+ net = lnet_netspec2net(tb->ltb_text);
+ if (net == LNET_NIDNET(LNET_NID_ANY)) {
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+
+ list_for_each(t, nets) {
+ tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+ if (tb2 == tb)
+ continue;
+
+ if (net == lnet_netspec2net(tb2->ltb_text)) {
+ /* duplicate network */
+ lnet_syntax("ip2nets", source, offset,
+ strlen(tb->ltb_text));
+ return -EINVAL;
+ }
+ }
+
+ if (sep == NULL)
+ return 0;
+
+ offset += (int)(sep - tb->ltb_text);
+ tb2 = lnet_new_text_buf(strlen(sep));
+ if (tb2 == NULL)
+ return -ENOMEM;
+
+ strcpy(tb2->ltb_text, sep);
+ list_add_tail(&tb2->ltb_list, nets);
+
+ tb = tb2;
+ }
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+ static char networks[LNET_SINGLE_TEXTBUF_NOB];
+ static char source[LNET_SINGLE_TEXTBUF_NOB];
+
+ struct list_head raw_entries;
+ struct list_head matched_nets;
+ struct list_head current_nets;
+ struct list_head *t;
+ struct list_head *t2;
+ lnet_text_buf_t *tb;
+ lnet_text_buf_t *tb2;
+ __u32 net1;
+ __u32 net2;
+ int len;
+ int count;
+ int dup;
+ int rc;
+
+ INIT_LIST_HEAD(&raw_entries);
+ if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+ CERROR("Error parsing ip2nets\n");
+ LASSERT (lnet_tbnob == 0);
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&matched_nets);
+ INIT_LIST_HEAD(&current_nets);
+ networks[0] = 0;
+ count = 0;
+ len = 0;
+ rc = 0;
+
+ while (!list_empty(&raw_entries)) {
+ tb = list_entry(raw_entries.next, lnet_text_buf_t,
+ ltb_list);
+
+ strncpy(source, tb->ltb_text, sizeof(source)-1);
+ source[sizeof(source)-1] = 0;
+
+ /* replace ltb_text with the network(s) add on match */
+ rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+ if (rc < 0)
+ break;
+
+ list_del(&tb->ltb_list);
+
+ if (rc == 0) { /* no match */
+ lnet_free_text_buf(tb);
+ continue;
+ }
+
+ /* split into separate networks */
+ INIT_LIST_HEAD(&current_nets);
+ list_add(&tb->ltb_list, &current_nets);
+ rc = lnet_splitnets(source, &current_nets);
+ if (rc < 0)
+ break;
+
+ dup = 0;
+ list_for_each (t, &current_nets) {
+ tb = list_entry(t, lnet_text_buf_t, ltb_list);
+ net1 = lnet_netspec2net(tb->ltb_text);
+ LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+ list_for_each(t2, &matched_nets) {
+ tb2 = list_entry(t2, lnet_text_buf_t,
+ ltb_list);
+ net2 = lnet_netspec2net(tb2->ltb_text);
+ LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+ if (net1 == net2) {
+ dup = 1;
+ break;
+ }
+ }
+
+ if (dup)
+ break;
+ }
+
+ if (dup) {
+ lnet_free_text_bufs(&current_nets);
+ continue;
+ }
+
+ list_for_each_safe(t, t2, &current_nets) {
+ tb = list_entry(t, lnet_text_buf_t, ltb_list);
+
+ list_del(&tb->ltb_list);
+ list_add_tail(&tb->ltb_list, &matched_nets);
+
+ len += snprintf(networks + len, sizeof(networks) - len,
+ "%s%s", (len == 0) ? "" : ",",
+ tb->ltb_text);
+
+ if (len >= sizeof(networks)) {
+ CERROR("Too many matched networks\n");
+ rc = -E2BIG;
+ goto out;
+ }
+ }
+
+ count++;
+ }
+
+ out:
+ lnet_free_text_bufs(&raw_entries);
+ lnet_free_text_bufs(&matched_nets);
+ lnet_free_text_bufs(&current_nets);
+ LASSERT (lnet_tbnob == 0);
+
+ if (rc < 0)
+ return rc;
+
+ *networksp = networks;
+ return count;
+}
+
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+ LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+ int up;
+ __u32 netmask;
+ __u32 *ipaddrs;
+ __u32 *ipaddrs2;
+ int nip;
+ char **ifnames;
+ int nif = libcfs_ipif_enumerate(&ifnames);
+ int i;
+ int rc;
+
+ if (nif <= 0)
+ return nif;
+
+ LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+ if (ipaddrs == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nif);
+ libcfs_ipif_free_enumeration(ifnames, nif);
+ return -ENOMEM;
+ }
+
+ for (i = nip = 0; i < nif; i++) {
+ if (!strcmp(ifnames[i], "lo"))
+ continue;
+
+ rc = libcfs_ipif_query(ifnames[i], &up,
+ &ipaddrs[nip], &netmask);
+ if (rc != 0) {
+ CWARN("Can't query interface %s: %d\n",
+ ifnames[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Ignoring interface %s: it's down\n",
+ ifnames[i]);
+ continue;
+ }
+
+ nip++;
+ }
+
+ libcfs_ipif_free_enumeration(ifnames, nif);
+
+ if (nip == nif) {
+ *ipaddrsp = ipaddrs;
+ } else {
+ if (nip > 0) {
+ LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+ if (ipaddrs2 == NULL) {
+ CERROR("Can't allocate ipaddrs[%d]\n", nip);
+ nip = -ENOMEM;
+ } else {
+ memcpy(ipaddrs2, ipaddrs,
+ nip * sizeof(*ipaddrs));
+ *ipaddrsp = ipaddrs2;
+ rc = nip;
+ }
+ }
+ lnet_ipaddr_free_enumeration(ipaddrs, nif);
+ }
+ return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+ __u32 *ipaddrs;
+ int nip = lnet_ipaddr_enumerate(&ipaddrs);
+ int rc;
+
+ if (nip < 0) {
+ LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP "
+ "interfaces for ip2nets to match\n", nip);
+ return nip;
+ }
+
+ if (nip == 0) {
+ LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces "
+ "for ip2nets to match\n");
+ return -ENOENT;
+ }
+
+ rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+ lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+ if (rc < 0) {
+ LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc);
+ return rc;
+ }
+
+ if (rc == 0) {
+ LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match "
+ "any local IP interfaces\n");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni)
+{
+ __u32 net = LNET_NIDNET(ni->ni_nid);
+ char **names;
+ int n;
+ __u32 ip;
+ __u32 netmask;
+ int up;
+ int i;
+ int rc;
+
+ /* Convenience for LNDs that use the IP address of a local interface as
+ * the local address part of their NID */
+
+ if (ni->ni_interfaces[0] != NULL) {
+
+ CLASSERT (LNET_MAX_INTERFACES > 1);
+
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Net %s doesn't support multiple interfaces\n",
+ libcfs_net2str(net));
+ return -EPERM;
+ }
+
+ rc = libcfs_ipif_query(ni->ni_interfaces[0],
+ &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), ni->ni_interfaces[0], rc);
+ return -EPERM;
+ }
+
+ if (!up) {
+ CERROR("Net %s can't use interface %s: it's down\n",
+ libcfs_net2str(net), ni->ni_interfaces[0]);
+ return -ENETDOWN;
+ }
+
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ n = libcfs_ipif_enumerate(&names);
+ if (n <= 0) {
+ CERROR("Net %s can't enumerate interfaces: %d\n",
+ libcfs_net2str(net), n);
+ return 0;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+ continue;
+
+ rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+
+ if (rc != 0) {
+ CWARN("Net %s can't query interface %s: %d\n",
+ libcfs_net2str(net), names[i], rc);
+ continue;
+ }
+
+ if (!up) {
+ CWARN("Net %s ignoring interface %s (down)\n",
+ libcfs_net2str(net), names[i]);
+ continue;
+ }
+
+ libcfs_ipif_free_enumeration(names, n);
+ ni->ni_nid = LNET_MKNID(net, ip);
+ return 0;
+ }
+
+ CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+ libcfs_ipif_free_enumeration(names, n);
+ return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
new file mode 100644
index 000000000000..78297a7d94e8
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -0,0 +1,447 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-eq.c
+ *
+ * Library level Event queue management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create an event queue that has room for \a count number of events.
+ *
+ * The event queue is circular and older events will be overwritten by new
+ * ones if they are not removed in time by the user using the functions
+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
+ * determine the appropriate size of the event queue to prevent this loss
+ * of events. Note that when EQ handler is specified in \a callback, no
+ * event loss can happen, since the handler is run for each event deposited
+ * into the EQ.
+ *
+ * \param count The number of events to be stored in the event queue. It
+ * will be rounded up to the next power of two.
+ * \param callback A handler function that runs when an event is deposited
+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
+ * indicate that no event handler is desired.
+ * \param handle On successful return, this location will hold a handle for
+ * the newly created EQ.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If an parameter is not valid.
+ * \retval -ENOMEM If memory for the EQ can't be allocated.
+ *
+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
+ */
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
+ lnet_handle_eq_t *handle)
+{
+ lnet_eq_t *eq;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
+ * overflow, they don't skip entries, so the queue has the same
+ * apparent capacity at all times */
+
+ count = cfs_power2_roundup(count);
+
+ if (callback != LNET_EQ_HANDLER_NONE && count != 0) {
+ CWARN("EQ callback is guaranteed to get every event, "
+ "do you still want to set eqcount %d for polling "
+ "event which will have locking overhead? "
+ "Please contact with developer to confirm\n", count);
+ }
+
+ /* count can be 0 if only need callback, we can eliminate
+ * overhead of enqueue event */
+ if (count == 0 && callback == LNET_EQ_HANDLER_NONE)
+ return -EINVAL;
+
+ eq = lnet_eq_alloc();
+ if (eq == NULL)
+ return -ENOMEM;
+
+ if (count != 0) {
+ LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
+ if (eq->eq_events == NULL)
+ goto failed;
+ /* NB allocator has set all event sequence numbers to 0,
+ * so all them should be earlier than eq_deq_seq */
+ }
+
+ eq->eq_deq_seq = 1;
+ eq->eq_enq_seq = 1;
+ eq->eq_size = count;
+ eq->eq_callback = callback;
+
+ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*eq->eq_refs[0]));
+ if (eq->eq_refs == NULL)
+ goto failed;
+
+ /* MUST hold both exclusive lnet_res_lock */
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
+ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
+
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_eq2handle(handle, eq);
+ return 0;
+
+failed:
+ if (eq->eq_events != NULL)
+ LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t));
+
+ if (eq->eq_refs != NULL)
+ cfs_percpt_free(eq->eq_refs);
+
+ lnet_eq_free(eq);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(LNetEQAlloc);
+
+/**
+ * Release the resources associated with an event queue if it's idle;
+ * otherwise do nothing and it's up to the user to try again.
+ *
+ * \param eqh A handle for the event queue to be released.
+ *
+ * \retval 0 If the EQ is not in use and freed.
+ * \retval -ENOENT If \a eqh does not point to a valid EQ.
+ * \retval -EBUSY If the EQ is still in use by some MDs.
+ */
+int
+LNetEQFree(lnet_handle_eq_t eqh)
+{
+ struct lnet_eq *eq;
+ lnet_event_t *events = NULL;
+ int **refs = NULL;
+ int *ref;
+ int rc = 0;
+ int size = 0;
+ int i;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ lnet_res_lock(LNET_LOCK_EX);
+ /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
+ * both EQ lookup and poll event with only lnet_eq_wait_lock */
+ lnet_eq_wait_lock();
+
+ eq = lnet_handle2eq(&eqh);
+ if (eq == NULL) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ cfs_percpt_for_each(ref, i, eq->eq_refs) {
+ LASSERT(*ref >= 0);
+ if (*ref == 0)
+ continue;
+
+ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n",
+ i, *ref);
+ rc = -EBUSY;
+ goto out;
+ }
+
+ /* stash for free after lock dropped */
+ events = eq->eq_events;
+ size = eq->eq_size;
+ refs = eq->eq_refs;
+
+ lnet_res_lh_invalidate(&eq->eq_lh);
+ list_del(&eq->eq_list);
+ lnet_eq_free_locked(eq);
+ out:
+ lnet_eq_wait_unlock();
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ if (events != NULL)
+ LIBCFS_FREE(events, size * sizeof(lnet_event_t));
+ if (refs != NULL)
+ cfs_percpt_free(refs);
+
+ return rc;
+}
+EXPORT_SYMBOL(LNetEQFree);
+
+void
+lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */
+ int index;
+
+ if (eq->eq_size == 0) {
+ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
+ eq->eq_callback(ev);
+ return;
+ }
+
+ lnet_eq_wait_lock();
+ ev->sequence = eq->eq_enq_seq++;
+
+ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size));
+ index = ev->sequence & (eq->eq_size - 1);
+
+ eq->eq_events[index] = *ev;
+
+ if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
+ eq->eq_callback(ev);
+
+ /* Wake anyone waiting in LNetEQPoll() */
+ if (waitqueue_active(&the_lnet.ln_eq_waitq))
+ wake_up_all(&the_lnet.ln_eq_waitq);
+ lnet_eq_wait_unlock();
+}
+
+int
+lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev)
+{
+ int new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+ lnet_event_t *new_event = &eq->eq_events[new_index];
+ int rc;
+ ENTRY;
+
+ /* must called with lnet_eq_wait_lock hold */
+ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
+ RETURN(0);
+
+ /* We've got a new event... */
+ *ev = *new_event;
+
+ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
+ new_event, eq->eq_deq_seq, eq->eq_size);
+
+ /* ...but did it overwrite an event we've not seen yet? */
+ if (eq->eq_deq_seq == new_event->sequence) {
+ rc = 1;
+ } else {
+ /* don't complain with CERROR: some EQs are sized small
+ * anyway; if it's important, the caller should complain */
+ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
+ eq->eq_deq_seq, new_event->sequence);
+ rc = -EOVERFLOW;
+ }
+
+ eq->eq_deq_seq = new_event->sequence + 1;
+ RETURN(rc);
+}
+
+/**
+ * A nonblocking function that can be used to get the next event in an EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. The event is removed from the queue.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 0 No pending event in the EQ.
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, 0,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQGet);
+
+/**
+ * Block the calling process until there is an event in the EQ.
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully. This function returns the next event
+ * in the EQ and removes it from the EQ.
+ *
+ * \param eventq A handle for the event queue.
+ * \param event On successful return (1 or -EOVERFLOW), this location will
+ * hold the next event in the EQ.
+ *
+ * \retval 1 Indicates success.
+ * \retval -ENOENT If \a eventq does not point to a valid EQ.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ has been dropped due to limited space in the EQ.
+ */
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+ int which;
+
+ return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+ event, &which);
+}
+EXPORT_SYMBOL(LNetEQWait);
+
+
+static int
+lnet_eq_wait_locked(int *timeout_ms)
+{
+ int tms = *timeout_ms;
+ int wait;
+ wait_queue_t wl;
+ cfs_time_t now;
+
+ if (tms == 0)
+ return -1; /* don't want to wait and no new event */
+
+ init_waitqueue_entry_current(&wl);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ lnet_eq_wait_unlock();
+
+ if (tms < 0) {
+ waitq_wait(&wl, TASK_INTERRUPTIBLE);
+
+ } else {
+ struct timeval tv;
+
+ now = cfs_time_current();
+ waitq_timedwait(&wl, TASK_INTERRUPTIBLE,
+ cfs_time_seconds(tms) / 1000);
+ cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv);
+ tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+ if (tms < 0) /* no more wait but may have new event */
+ tms = 0;
+ }
+
+ wait = tms != 0; /* might need to call here again */
+ *timeout_ms = tms;
+
+ lnet_eq_wait_lock();
+ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl);
+
+ return wait;
+}
+
+
+
+/**
+ * Block the calling process until there's an event from a set of EQs or
+ * timeout happens.
+ *
+ * If an event handler is associated with the EQ, the handler will run before
+ * this function returns successfully, in which case the corresponding event
+ * is consumed.
+ *
+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a
+ * fixed period, or block indefinitely.
+ *
+ * \param eventqs,neq An array of EQ handles, and size of the array.
+ * \param timeout_ms Time in milliseconds to wait for an event to occur on
+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an
+ * infinite timeout.
+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will
+ * hold the next event in the EQs, and \a which will contain the index of the
+ * EQ from which the event was taken.
+ *
+ * \retval 0 No pending event in the EQs after timeout.
+ * \retval 1 Indicates success.
+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
+ * at least one event between this event and the last event obtained from the
+ * EQ indicated by \a which has been dropped due to limited space in the EQ.
+ * \retval -ENOENT If there's an invalid handle in \a eventqs.
+ */
+int
+LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+ lnet_event_t *event, int *which)
+{
+ int wait = 1;
+ int rc;
+ int i;
+ ENTRY;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (neq < 1)
+ RETURN(-ENOENT);
+
+ lnet_eq_wait_lock();
+
+ for (;;) {
+ for (i = 0; i < neq; i++) {
+ lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+ if (eq == NULL) {
+ lnet_eq_wait_unlock();
+ RETURN(-ENOENT);
+ }
+
+ rc = lnet_eq_dequeue_event(eq, event);
+ if (rc != 0) {
+ lnet_eq_wait_unlock();
+ *which = i;
+ RETURN(rc);
+ }
+ }
+
+ if (wait == 0)
+ break;
+
+ /*
+ * return value of lnet_eq_wait_locked:
+ * -1 : did nothing and it's sure no new event
+ * 1 : sleep inside and wait until new event
+ * 0 : don't want to wait anymore, but might have new event
+ * so need to call dequeue again
+ */
+ wait = lnet_eq_wait_locked(&timeout_ms);
+ if (wait < 0) /* no new event */
+ break;
+ }
+
+ lnet_eq_wait_unlock();
+ RETURN(0);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
new file mode 100644
index 000000000000..ae643f26933b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -0,0 +1,451 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-md.c
+ *
+ * Memory Descriptor management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_unlink(lnet_libmd_t *md)
+{
+ if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
+ /* first unlink attempt... */
+ lnet_me_t *me = md->md_me;
+
+ md->md_flags |= LNET_MD_FLAG_ZOMBIE;
+
+ /* Disassociate from ME (if any), and unlink it if it was created
+ * with LNET_UNLINK */
+ if (me != NULL) {
+ /* detach MD from portal */
+ lnet_ptl_detach_md(me, md);
+ if (me->me_unlink == LNET_UNLINK)
+ lnet_me_unlink(me);
+ }
+
+ /* ensure all future handle lookups fail */
+ lnet_res_lh_invalidate(&md->md_lh);
+ }
+
+ if (md->md_refcount != 0) {
+ CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+ return;
+ }
+
+ CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+ if (md->md_eq != NULL) {
+ int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+
+ LASSERT(*md->md_eq->eq_refs[cpt] > 0);
+ (*md->md_eq->eq_refs[cpt])--;
+ }
+
+ LASSERT(!list_empty(&md->md_list));
+ list_del_init(&md->md_list);
+ lnet_md_free_locked(md);
+}
+
+static int
+lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
+{
+ int i;
+ unsigned int niov;
+ int total_length = 0;
+
+ lmd->md_me = NULL;
+ lmd->md_start = umd->start;
+ lmd->md_offset = 0;
+ lmd->md_max_size = umd->max_size;
+ lmd->md_options = umd->options;
+ lmd->md_user_ptr = umd->user_ptr;
+ lmd->md_eq = NULL;
+ lmd->md_threshold = umd->threshold;
+ lmd->md_refcount = 0;
+ lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+ if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+ if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+ return -EINVAL;
+
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.iov, umd->start,
+ niov * sizeof (lmd->md_iov.iov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the base address on trust */
+ if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
+ return -EINVAL;
+
+ total_length += lmd->md_iov.iov[i].iov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) // illegal max_size
+ return -EINVAL;
+
+ } else if ((umd->options & LNET_MD_KIOV) != 0) {
+ lmd->md_niov = niov = umd->length;
+ memcpy(lmd->md_iov.kiov, umd->start,
+ niov * sizeof (lmd->md_iov.kiov[0]));
+
+ for (i = 0; i < (int)niov; i++) {
+ /* We take the page pointer on trust */
+ if (lmd->md_iov.kiov[i].kiov_offset +
+ lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE )
+ return -EINVAL; /* invalid length */
+
+ total_length += lmd->md_iov.kiov[i].kiov_len;
+ }
+
+ lmd->md_length = total_length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > total_length)) // illegal max_size
+ return -EINVAL;
+ } else { /* contiguous */
+ lmd->md_length = umd->length;
+ lmd->md_niov = niov = 1;
+ lmd->md_iov.iov[0].iov_base = umd->start;
+ lmd->md_iov.iov[0].iov_len = umd->length;
+
+ if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
+ (umd->max_size < 0 ||
+ umd->max_size > (int)umd->length)) // illegal max_size
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* must be called with resource lock held */
+static int
+lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt)
+{
+ struct lnet_res_container *container = the_lnet.ln_md_containers[cpt];
+
+ /* NB we are passed an allocated, but inactive md.
+ * if we return success, caller may lnet_md_unlink() it.
+ * otherwise caller may only lnet_md_free() it.
+ */
+ /* This implementation doesn't know how to create START events or
+ * disable END events. Best to LASSERT our caller is compliant so
+ * we find out quickly... */
+ /* TODO - reevaluate what should be here in light of
+ * the removal of the start and end events
+ * maybe there we shouldn't even allow LNET_EQ_NONE!)
+ * LASSERT (eq == NULL);
+ */
+ if (!LNetHandleIsInvalid(eq_handle)) {
+ md->md_eq = lnet_handle2eq(&eq_handle);
+
+ if (md->md_eq == NULL)
+ return -ENOENT;
+
+ (*md->md_eq->eq_refs[cpt])++;
+ }
+
+ lnet_res_lh_initialize(container, &md->md_lh);
+
+ LASSERT(list_empty(&md->md_list));
+ list_add(&md->md_list, &container->rec_active);
+
+ return 0;
+}
+
+/* must be called with lnet_res_lock held */
+void
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
+{
+ /* NB this doesn't copy out all the iov entries so when a
+ * discontiguous MD is copied out, the target gets to know the
+ * original iov pointer (in start) and the number of entries it had
+ * and that's all.
+ */
+ umd->start = lmd->md_start;
+ umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+ lmd->md_length : lmd->md_niov;
+ umd->threshold = lmd->md_threshold;
+ umd->max_size = lmd->md_max_size;
+ umd->options = lmd->md_options;
+ umd->user_ptr = lmd->md_user_ptr;
+ lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
+}
+
+int
+lnet_md_validate(lnet_md_t *umd)
+{
+ if (umd->start == NULL && umd->length != 0) {
+ CERROR("MD start pointer can not be NULL with length %u\n",
+ umd->length);
+ return -EINVAL;
+ }
+
+ if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+ umd->length > LNET_MAX_IOV) {
+ CERROR("Invalid option: too many fragments %u, %d max\n",
+ umd->length, LNET_MAX_IOV);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * Create a memory descriptor and attach it to a ME
+ *
+ * \param meh A handle for a ME to associate the new MD with.
+ * \param umd Provides initial values for the user-visible parts of a MD.
+ * Other than its use for initialization, there is no linkage between this
+ * structure and the MD maintained by the LNet.
+ * \param unlink A flag to indicate whether the MD is automatically unlinked
+ * when it becomes inactive, either because the operation threshold drops to
+ * zero or because the available memory becomes less than \a umd.max_size.
+ * (Note that the check for unlinking a MD only occurs after the completion
+ * of a successful operation on the MD.) The value LNET_UNLINK enables auto
+ * unlinking; the value LNET_RETAIN disables it.
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink().
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a
+ * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by
+ * calling LNetInvalidateHandle() on it.
+ * \retval -EBUSY If the ME pointed to by \a meh is already associated with
+ * a MD.
+ */
+int
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+ lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ LIST_HEAD (matches);
+ LIST_HEAD (drops);
+ struct lnet_me *me;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) {
+ CERROR("Invalid option: no MD_OP set\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+
+ lnet_res_lock(cpt);
+ if (rc != 0)
+ goto failed;
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL)
+ rc = -ENOENT;
+ else if (me->me_md != NULL)
+ rc = -EBUSY;
+ else
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+
+ if (rc != 0)
+ goto failed;
+
+ /* attach this MD to portal of ME and check if it matches any
+ * blocked msgs on this portal */
+ lnet_ptl_attach_md(me, md, &matches, &drops);
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+
+ lnet_drop_delayed_msg_list(&drops, "Bad match");
+ lnet_recv_delayed_msg_list(&matches);
+
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDAttach);
+
+/**
+ * Create a "free floating" memory descriptor - a MD that is not associated
+ * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations.
+ *
+ * \param umd,unlink See the discussion for LNetMDAttach().
+ * \param handle On successful returns, a handle to the newly created MD is
+ * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(),
+ * and LNetGet() operations.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a umd is not valid.
+ * \retval -ENOMEM If new MD cannot be allocated.
+ * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that
+ * it's OK to supply a NULL \a umd.eq_handle by calling
+ * LNetInvalidateHandle() on it.
+ */
+int
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
+{
+ lnet_libmd_t *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (lnet_md_validate(&umd) != 0)
+ return -EINVAL;
+
+ if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) {
+ CERROR("Invalid option: GET|PUT illegal on active MDs\n");
+ return -EINVAL;
+ }
+
+ md = lnet_md_alloc(&umd);
+ if (md == NULL)
+ return -ENOMEM;
+
+ rc = lnet_md_build(md, &umd, unlink);
+
+ cpt = lnet_res_lock_current();
+ if (rc != 0)
+ goto failed;
+
+ rc = lnet_md_link(md, umd.eq_handle, cpt);
+ if (rc != 0)
+ goto failed;
+
+ lnet_md2handle(handle, md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+
+ failed:
+ lnet_md_free_locked(md);
+
+ lnet_res_unlock(cpt);
+ return rc;
+}
+EXPORT_SYMBOL(LNetMDBind);
+
+/**
+ * Unlink the memory descriptor from any ME it may be linked to and release
+ * the internal resources associated with it.
+ *
+ * This function does not free the memory region associated with the MD;
+ * i.e., the memory the user allocated for this MD. If the ME associated with
+ * this MD is not NULL and was created with auto unlink enabled, the ME is
+ * unlinked as well (see LNetMEAttach()).
+ *
+ * Explicitly unlinking a MD via this function call has the same behavior as
+ * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK
+ * is generated in the latter case.
+ *
+ * An unlinked event can be reported in two ways:
+ * - If there's no pending operations on the MD, it's unlinked immediately
+ * and an LNET_EVENT_UNLINK event is logged before this function returns.
+ * - Otherwise, the MD is only marked for deletion when this function
+ * returns, and the unlinked event will be piggybacked on the event of
+ * the completion of the last operation by setting the unlinked field of
+ * the event. No dedicated LNET_EVENT_UNLINK event is generated.
+ *
+ * Note that in both cases the unlinked field of the event is always set; no
+ * more event will happen on the MD after such an event is logged.
+ *
+ * \param mdh A handle for the MD to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a mdh does not point to a valid MD object.
+ */
+int
+LNetMDUnlink (lnet_handle_md_t mdh)
+{
+ lnet_event_t ev;
+ lnet_libmd_t *md;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
+ * when the NAL is done, the completion event flags that the MD was
+ * unlinked. Otherwise, we enqueue an event now... */
+
+ if (md->md_eq != NULL &&
+ md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+
+ lnet_md_unlink(md);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMDUnlink);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c
new file mode 100644
index 000000000000..0081075cabee
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-me.c
@@ -0,0 +1,297 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-me.c
+ *
+ * Match Entry management routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/**
+ * Create and attach a match entry to the match list of \a portal. The new
+ * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach()
+ * can be used to attach a MD to an empty ME.
+ *
+ * \param portal The portal table index where the ME should be attached.
+ * \param match_id Specifies the match criteria for the process ID of
+ * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be
+ * used to wildcard either of the identifiers in the lnet_process_id_t
+ * structure.
+ * \param match_bits,ignore_bits Specify the match criteria to apply
+ * to the match bits in the incoming request. The ignore bits are used
+ * to mask out insignificant bits in the incoming match bits. The resulting
+ * bits are then compared to the ME's match bits to determine if the
+ * incoming request meets the match criteria.
+ * \param unlink Indicates whether the ME should be unlinked when the memory
+ * descriptor associated with it is unlinked (Note that the check for
+ * unlinking a ME only occurs when the memory descriptor is unlinked.).
+ * Valid values are LNET_RETAIN and LNET_UNLINK.
+ * \param pos Indicates whether the new ME should be prepended or
+ * appended to the match list. Allowed constants: LNET_INS_BEFORE,
+ * LNET_INS_AFTER.
+ * \param handle On successful returns, a handle to the newly created ME
+ * object is saved here. This handle can be used later in LNetMEInsert(),
+ * LNetMEUnlink(), or LNetMDAttach() functions.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is invalid.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ */
+int
+LNetMEAttach(unsigned int portal,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_me *me;
+ struct list_head *head;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if ((int)portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ mtable = lnet_mt_of_attach(portal, match_id,
+ match_bits, ignore_bits, pos);
+ if (mtable == NULL) /* can't match portal type */
+ return -EPERM;
+
+ me = lnet_me_alloc();
+ if (me == NULL)
+ return -ENOMEM;
+
+ lnet_res_lock(mtable->mt_cpt);
+
+ me->me_portal = portal;
+ me->me_match_id = match_id;
+ me->me_match_bits = match_bits;
+ me->me_ignore_bits = ignore_bits;
+ me->me_unlink = unlink;
+ me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt],
+ &me->me_lh);
+ if (ignore_bits != 0)
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, match_id, match_bits);
+
+ me->me_pos = head - &mtable->mt_mhash[0];
+ if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL)
+ list_add_tail(&me->me_list, head);
+ else
+ list_add(&me->me_list, head);
+
+ lnet_me2handle(handle, me);
+
+ lnet_res_unlock(mtable->mt_cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEAttach);
+
+/**
+ * Create and a match entry and insert it before or after the ME pointed to by
+ * \a current_meh. The new ME is empty, i.e. not associated with a memory
+ * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME.
+ *
+ * This function is identical to LNetMEAttach() except for the position
+ * where the new ME is inserted.
+ *
+ * \param current_meh A handle for a ME. The new ME will be inserted
+ * immediately before or immediately after this ME.
+ * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion
+ * for LNetMEAttach().
+ *
+ * \retval 0 On success.
+ * \retval -ENOMEM If new ME object cannot be allocated.
+ * \retval -ENOENT If \a current_meh does not point to a valid match entry.
+ */
+int
+LNetMEInsert(lnet_handle_me_t current_meh,
+ lnet_process_id_t match_id,
+ __u64 match_bits, __u64 ignore_bits,
+ lnet_unlink_t unlink, lnet_ins_pos_t pos,
+ lnet_handle_me_t *handle)
+{
+ struct lnet_me *current_me;
+ struct lnet_me *new_me;
+ struct lnet_portal *ptl;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ if (pos == LNET_INS_LOCAL)
+ return -EPERM;
+
+ new_me = lnet_me_alloc();
+ if (new_me == NULL)
+ return -ENOMEM;
+
+ cpt = lnet_cpt_of_cookie(current_meh.cookie);
+
+ lnet_res_lock(cpt);
+
+ current_me = lnet_handle2me(&current_meh);
+ if (current_me == NULL) {
+ lnet_me_free_locked(new_me);
+
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ LASSERT(current_me->me_portal < the_lnet.ln_nportals);
+
+ ptl = the_lnet.ln_portals[current_me->me_portal];
+ if (lnet_ptl_is_unique(ptl)) {
+ /* nosense to insertion on unique portal */
+ lnet_me_free_locked(new_me);
+ lnet_res_unlock(cpt);
+ return -EPERM;
+ }
+
+ new_me->me_pos = current_me->me_pos;
+ new_me->me_portal = current_me->me_portal;
+ new_me->me_match_id = match_id;
+ new_me->me_match_bits = match_bits;
+ new_me->me_ignore_bits = ignore_bits;
+ new_me->me_unlink = unlink;
+ new_me->me_md = NULL;
+
+ lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh);
+
+ if (pos == LNET_INS_AFTER)
+ list_add(&new_me->me_list, &current_me->me_list);
+ else
+ list_add_tail(&new_me->me_list, &current_me->me_list);
+
+ lnet_me2handle(handle, new_me);
+
+ lnet_res_unlock(cpt);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEInsert);
+
+/**
+ * Unlink a match entry from its match list.
+ *
+ * This operation also releases any resources associated with the ME. If a
+ * memory descriptor is attached to the ME, then it will be unlinked as well
+ * and an unlink event will be generated. It is an error to use the ME handle
+ * after calling LNetMEUnlink().
+ *
+ * \param meh A handle for the ME to be unlinked.
+ *
+ * \retval 0 On success.
+ * \retval -ENOENT If \a meh does not point to a valid ME.
+ * \see LNetMDUnlink() for the discussion on delivering unlink event.
+ */
+int
+LNetMEUnlink(lnet_handle_me_t meh)
+{
+ lnet_me_t *me;
+ lnet_libmd_t *md;
+ lnet_event_t ev;
+ int cpt;
+
+ LASSERT(the_lnet.ln_init);
+ LASSERT(the_lnet.ln_refcount > 0);
+
+ cpt = lnet_cpt_of_cookie(meh.cookie);
+ lnet_res_lock(cpt);
+
+ me = lnet_handle2me(&meh);
+ if (me == NULL) {
+ lnet_res_unlock(cpt);
+ return -ENOENT;
+ }
+
+ md = me->me_md;
+ if (md != NULL &&
+ md->md_eq != NULL &&
+ md->md_refcount == 0) {
+ lnet_build_unlink_event(md, &ev);
+ lnet_eq_enqueue_event(md->md_eq, &ev);
+ }
+
+ lnet_me_unlink(me);
+
+ lnet_res_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(LNetMEUnlink);
+
+/* call with lnet_res_lock please */
+void
+lnet_me_unlink(lnet_me_t *me)
+{
+ list_del(&me->me_list);
+
+ if (me->me_md != NULL) {
+ lnet_libmd_t *md = me->me_md;
+
+ /* detach MD from portal of this ME */
+ lnet_ptl_detach_md(me, md);
+ lnet_md_unlink(md);
+ }
+
+ lnet_res_lh_invalidate(&me->me_lh);
+ lnet_me_free_locked(me);
+}
+
+#if 0
+static void
+lib_me_dump(lnet_me_t *me)
+{
+ CWARN("Match Entry %p ("LPX64")\n", me,
+ me->me_lh.lh_cookie);
+
+ CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
+ me->me_match_bits, me->me_ignore_bits);
+
+ CWARN("\tMD\t= %p\n", me->md);
+ CWARN("\tprev\t= %p\n",
+ list_entry(me->me_list.prev, lnet_me_t, me_list));
+ CWARN("\tnext\t= %p\n",
+ list_entry(me->me_list.next, lnet_me_t, me_list));
+}
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
new file mode 100644
index 000000000000..49b0f1287a69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -0,0 +1,2441 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-move.c
+ *
+ * Data movement routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+ "Reserved");
+
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+
+ LASSERT (the_lnet.ln_init);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ if (threshold != 0) {
+ /* Adding a new entry */
+ LIBCFS_ALLOC(tp, sizeof(*tp));
+ if (tp == NULL)
+ return -ENOMEM;
+
+ tp->tp_nid = nid;
+ tp->tp_threshold = threshold;
+
+ lnet_net_lock(0);
+ list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers);
+ lnet_net_unlock(0);
+ return 0;
+ }
+
+ /* removing entries */
+ INIT_LIST_HEAD(&cull);
+
+ lnet_net_lock(0);
+
+ list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0 || /* needs culling anyway */
+ nid == LNET_NID_ANY || /* removing all entries */
+ tp->tp_nid == nid) /* matched this one */
+ {
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty (&cull)) {
+ tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+
+ list_del (&tp->tp_list);
+ LIBCFS_FREE(tp, sizeof (*tp));
+ }
+ return 0;
+}
+
+static int
+fail_peer (lnet_nid_t nid, int outgoing)
+{
+ lnet_test_peer_t *tp;
+ struct list_head *el;
+ struct list_head *next;
+ struct list_head cull;
+ int fail = 0;
+
+ INIT_LIST_HEAD (&cull);
+
+ /* NB: use lnet_net_lock(0) to serialize operations on test peers */
+ lnet_net_lock(0);
+
+ list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+ tp = list_entry (el, lnet_test_peer_t, tp_list);
+
+ if (tp->tp_threshold == 0) {
+ /* zombie entry */
+ if (outgoing) {
+ /* only cull zombies on outgoing tests,
+ * since we may be at interrupt priority on
+ * incoming messages. */
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ continue;
+ }
+
+ if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
+ nid == tp->tp_nid) { /* fail this peer */
+ fail = 1;
+
+ if (tp->tp_threshold != LNET_MD_THRESH_INF) {
+ tp->tp_threshold--;
+ if (outgoing &&
+ tp->tp_threshold == 0) {
+ /* see above */
+ list_del (&tp->tp_list);
+ list_add (&tp->tp_list, &cull);
+ }
+ }
+ break;
+ }
+ }
+
+ lnet_net_unlock(0);
+
+ while (!list_empty (&cull)) {
+ tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
+ list_del (&tp->tp_list);
+
+ LIBCFS_FREE(tp, sizeof (*tp));
+ }
+
+ return (fail);
+}
+
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (iov++)->iov_len;
+
+ return (nob);
+}
+EXPORT_SYMBOL(lnet_iov_nob);
+
+void
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+ unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+
+ if (nob == 0)
+ return;
+
+ /* skip complete frags before 'doffset' */
+ LASSERT (ndiov > 0);
+ while (doffset >= diov->iov_len) {
+ doffset -= diov->iov_len;
+ diov++;
+ ndiov--;
+ LASSERT (ndiov > 0);
+ }
+
+ /* skip complete frags before 'soffset' */
+ LASSERT (nsiov > 0);
+ while (soffset >= siov->iov_len) {
+ soffset -= siov->iov_len;
+ siov++;
+ nsiov--;
+ LASSERT (nsiov > 0);
+ }
+
+ do {
+ LASSERT (ndiov > 0);
+ LASSERT (nsiov > 0);
+ this_nob = MIN(diov->iov_len - doffset,
+ siov->iov_len - soffset);
+ this_nob = MIN(this_nob, nob);
+
+ memcpy ((char *)diov->iov_base + doffset,
+ (char *)siov->iov_base + soffset, this_nob);
+ nob -= this_nob;
+
+ if (diov->iov_len > doffset + this_nob) {
+ doffset += this_nob;
+ } else {
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->iov_len > soffset + this_nob) {
+ soffset += this_nob;
+ } else {
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+}
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+
+int
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+ int src_niov, struct iovec *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return (0); /* no frags */
+
+ LASSERT (src_niov > 0);
+ while (offset >= src->iov_len) { /* skip initial frags */
+ offset -= src->iov_len;
+ src_niov--;
+ src++;
+ LASSERT (src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT (src_niov > 0);
+ LASSERT ((int)niov <= dst_niov);
+
+ frag_len = src->iov_len - offset;
+ dst->iov_base = ((char *)src->iov_base) + offset;
+
+ if (len <= frag_len) {
+ dst->iov_len = len;
+ return (niov);
+ }
+
+ dst->iov_len = frag_len;
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_iov);
+
+
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
+{
+ unsigned int nob = 0;
+
+ while (niov-- > 0)
+ nob += (kiov++)->kiov_len;
+
+ return (nob);
+}
+EXPORT_SYMBOL(lnet_kiov_nob);
+
+void
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+ unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+ unsigned int nob)
+{
+ /* NB diov, siov are READ-ONLY */
+ unsigned int this_nob;
+ char *daddr = NULL;
+ char *saddr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (ndiov > 0);
+ while (doffset >= diov->kiov_len) {
+ doffset -= diov->kiov_len;
+ diov++;
+ ndiov--;
+ LASSERT (ndiov > 0);
+ }
+
+ LASSERT (nsiov > 0);
+ while (soffset >= siov->kiov_len) {
+ soffset -= siov->kiov_len;
+ siov++;
+ nsiov--;
+ LASSERT (nsiov > 0);
+ }
+
+ do {
+ LASSERT (ndiov > 0);
+ LASSERT (nsiov > 0);
+ this_nob = MIN(diov->kiov_len - doffset,
+ siov->kiov_len - soffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (daddr == NULL)
+ daddr = ((char *)kmap(diov->kiov_page)) +
+ diov->kiov_offset + doffset;
+ if (saddr == NULL)
+ saddr = ((char *)kmap(siov->kiov_page)) +
+ siov->kiov_offset + soffset;
+
+ /* Vanishing risk of kmap deadlock when mapping 2 pages.
+ * However in practice at least one of the kiovs will be mapped
+ * kernel pages and the map/unmap will be NOOPs */
+
+ memcpy (daddr, saddr, this_nob);
+ nob -= this_nob;
+
+ if (diov->kiov_len > doffset + this_nob) {
+ daddr += this_nob;
+ doffset += this_nob;
+ } else {
+ kunmap(diov->kiov_page);
+ daddr = NULL;
+ diov++;
+ ndiov--;
+ doffset = 0;
+ }
+
+ if (siov->kiov_len > soffset + this_nob) {
+ saddr += this_nob;
+ soffset += this_nob;
+ } else {
+ kunmap(siov->kiov_page);
+ saddr = NULL;
+ siov++;
+ nsiov--;
+ soffset = 0;
+ }
+ } while (nob > 0);
+
+ if (daddr != NULL)
+ kunmap(diov->kiov_page);
+ if (saddr != NULL)
+ kunmap(siov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+ unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+ unsigned int nob)
+{
+ /* NB iov, kiov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ LASSERT (nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+ }
+
+ do {
+ LASSERT (niov > 0);
+ LASSERT (nkiov > 0);
+ this_nob = MIN(iov->iov_len - iovoffset,
+ kiov->kiov_len - kiovoffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+ nob -= this_nob;
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+ unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+ unsigned int nob)
+{
+ /* NB kiov, iov are READ-ONLY */
+ unsigned int this_nob;
+ char *addr = NULL;
+
+ if (nob == 0)
+ return;
+
+ LASSERT (!in_interrupt ());
+
+ LASSERT (nkiov > 0);
+ while (kiovoffset >= kiov->kiov_len) {
+ kiovoffset -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ LASSERT (nkiov > 0);
+ }
+
+ LASSERT (niov > 0);
+ while (iovoffset >= iov->iov_len) {
+ iovoffset -= iov->iov_len;
+ iov++;
+ niov--;
+ LASSERT (niov > 0);
+ }
+
+ do {
+ LASSERT (nkiov > 0);
+ LASSERT (niov > 0);
+ this_nob = MIN(kiov->kiov_len - kiovoffset,
+ iov->iov_len - iovoffset);
+ this_nob = MIN(this_nob, nob);
+
+ if (addr == NULL)
+ addr = ((char *)kmap(kiov->kiov_page)) +
+ kiov->kiov_offset + kiovoffset;
+
+ memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+ nob -= this_nob;
+
+ if (kiov->kiov_len > kiovoffset + this_nob) {
+ addr += this_nob;
+ kiovoffset += this_nob;
+ } else {
+ kunmap(kiov->kiov_page);
+ addr = NULL;
+ kiov++;
+ nkiov--;
+ kiovoffset = 0;
+ }
+
+ if (iov->iov_len > iovoffset + this_nob) {
+ iovoffset += this_nob;
+ } else {
+ iov++;
+ niov--;
+ iovoffset = 0;
+ }
+ } while (nob > 0);
+
+ if (addr != NULL)
+ kunmap(kiov->kiov_page);
+}
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+ int src_niov, lnet_kiov_t *src,
+ unsigned int offset, unsigned int len)
+{
+ /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+ * for exactly 'len' bytes, and return the number of entries.
+ * NB not destructive to 'src' */
+ unsigned int frag_len;
+ unsigned int niov;
+
+ if (len == 0) /* no data => */
+ return (0); /* no frags */
+
+ LASSERT (src_niov > 0);
+ while (offset >= src->kiov_len) { /* skip initial frags */
+ offset -= src->kiov_len;
+ src_niov--;
+ src++;
+ LASSERT (src_niov > 0);
+ }
+
+ niov = 1;
+ for (;;) {
+ LASSERT (src_niov > 0);
+ LASSERT ((int)niov <= dst_niov);
+
+ frag_len = src->kiov_len - offset;
+ dst->kiov_page = src->kiov_page;
+ dst->kiov_offset = src->kiov_offset + offset;
+
+ if (len <= frag_len) {
+ dst->kiov_len = len;
+ LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+ return (niov);
+ }
+
+ dst->kiov_len = frag_len;
+ LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+
+ len -= frag_len;
+ dst++;
+ src++;
+ niov++;
+ src_niov--;
+ offset = 0;
+ }
+}
+EXPORT_SYMBOL(lnet_extract_kiov);
+
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ unsigned int niov = 0;
+ struct iovec *iov = NULL;
+ lnet_kiov_t *kiov = NULL;
+ int rc;
+
+ LASSERT (!in_interrupt ());
+ LASSERT (mlen == 0 || msg != NULL);
+
+ if (msg != NULL) {
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_sending);
+ LASSERT(rlen == msg->msg_len);
+ LASSERT(mlen <= msg->msg_len);
+ LASSERT(msg->msg_offset == offset);
+ LASSERT(msg->msg_wanted == mlen);
+
+ msg->msg_receiving = 0;
+
+ if (mlen != 0) {
+ niov = msg->msg_niov;
+ iov = msg->msg_iov;
+ kiov = msg->msg_kiov;
+
+ LASSERT (niov > 0);
+ LASSERT ((iov == NULL) != (kiov == NULL));
+ }
+ }
+
+ rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+ niov, iov, kiov, offset, mlen, rlen);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+ lnet_libmd_t *md = msg->msg_md;
+
+ LASSERT (msg->msg_len > 0);
+ LASSERT (!msg->msg_routing);
+ LASSERT (md != NULL);
+ LASSERT (msg->msg_niov == 0);
+ LASSERT (msg->msg_iov == NULL);
+ LASSERT (msg->msg_kiov == NULL);
+
+ msg->msg_niov = md->md_niov;
+ if ((md->md_options & LNET_MD_KIOV) != 0)
+ msg->msg_kiov = md->md_iov.kiov;
+ else
+ msg->msg_iov = md->md_iov.iov;
+}
+
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+ unsigned int offset, unsigned int len)
+{
+ msg->msg_type = type;
+ msg->msg_target = target;
+ msg->msg_len = len;
+ msg->msg_offset = offset;
+
+ if (len != 0)
+ lnet_setpayloadbuffer(msg);
+
+ memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+ msg->msg_hdr.type = cpu_to_le32(type);
+ msg->msg_hdr.dest_nid = cpu_to_le64(target.nid);
+ msg->msg_hdr.dest_pid = cpu_to_le32(target.pid);
+ /* src_nid will be set later */
+ msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid);
+ msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
+
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *priv = msg->msg_private;
+ int rc;
+
+ LASSERT (!in_interrupt ());
+ LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+ (msg->msg_txcredit && msg->msg_peertxcredit));
+
+ rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+ if (rc < 0)
+ lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc;
+
+ LASSERT(!msg->msg_sending);
+ LASSERT(msg->msg_receiving);
+ LASSERT(!msg->msg_rx_ready_delay);
+ LASSERT(ni->ni_lnd->lnd_eager_recv != NULL);
+
+ msg->msg_rx_ready_delay = 1;
+ rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg,
+ &msg->msg_private);
+ if (rc != 0) {
+ CERROR("recv from %s / send to %s aborted: "
+ "eager_recv failed %d\n",
+ libcfs_nid2str(msg->msg_rxpeer->lp_nid),
+ libcfs_id2str(msg->msg_target), rc);
+ LASSERT(rc < 0); /* required by my callers */
+ }
+
+ return rc;
+}
+
+/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+void
+lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ cfs_time_t last_alive = 0;
+
+ LASSERT(lnet_peer_aliveness_enabled(lp));
+ LASSERT(ni->ni_lnd->lnd_query != NULL);
+
+ lnet_net_unlock(lp->lp_cpt);
+ (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive);
+ lnet_net_lock(lp->lp_cpt);
+
+ lp->lp_last_query = cfs_time_current();
+
+ if (last_alive != 0) /* NI has updated timestamp */
+ lp->lp_last_alive = last_alive;
+}
+
+/* NB: always called with lnet_net_lock held */
+static inline int
+lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now)
+{
+ int alive;
+ cfs_time_t deadline;
+
+ LASSERT (lnet_peer_aliveness_enabled(lp));
+
+ /* Trust lnet_notify() if it has more recent aliveness news, but
+ * ignore the initial assumed death (see lnet_peers_start_down()).
+ */
+ if (!lp->lp_alive && lp->lp_alive_count > 0 &&
+ cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive))
+ return 0;
+
+ deadline = cfs_time_add(lp->lp_last_alive,
+ cfs_time_seconds(lp->lp_ni->ni_peertimeout));
+ alive = cfs_time_after(deadline, now);
+
+ /* Update obsolete lp_alive except for routers assumed to be dead
+ * initially, because router checker would update aliveness in this
+ * case, and moreover lp_last_alive at peer creation is assumed.
+ */
+ if (alive && !lp->lp_alive &&
+ !(lnet_isrouter(lp) && lp->lp_alive_count == 0))
+ lnet_notify_locked(lp, 0, 1, lp->lp_last_alive);
+
+ return alive;
+}
+
+
+/* NB: returns 1 when alive, 0 when dead, negative when error;
+ * may drop the lnet_net_lock */
+int
+lnet_peer_alive_locked (lnet_peer_t *lp)
+{
+ cfs_time_t now = cfs_time_current();
+
+ if (!lnet_peer_aliveness_enabled(lp))
+ return -ENODEV;
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ /* Peer appears dead, but we should avoid frequent NI queries (at
+ * most once per lnet_queryinterval seconds). */
+ if (lp->lp_last_query != 0) {
+ static const int lnet_queryinterval = 1;
+
+ cfs_time_t next_query =
+ cfs_time_add(lp->lp_last_query,
+ cfs_time_seconds(lnet_queryinterval));
+
+ if (cfs_time_before(now, next_query)) {
+ if (lp->lp_alive)
+ CWARN("Unexpected aliveness of peer %s: "
+ "%d < %d (%d/%d)\n",
+ libcfs_nid2str(lp->lp_nid),
+ (int)now, (int)next_query,
+ lnet_queryinterval,
+ lp->lp_ni->ni_peertimeout);
+ return 0;
+ }
+ }
+
+ /* query NI for latest aliveness news */
+ lnet_ni_query_locked(lp->lp_ni, lp);
+
+ if (lnet_peer_is_alive(lp, now))
+ return 1;
+
+ lnet_notify_locked(lp, 0, 0, lp->lp_last_alive);
+ return 0;
+}
+
+int
+lnet_post_send_locked(lnet_msg_t *msg, int do_send)
+{
+ /* lnet_send is going to lnet_net_unlock immediately after this,
+ * so it sets do_send FALSE and I don't do the unlock/send/lock bit.
+ * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer
+ * appears dead, and 0 if sent or OK to send */
+ struct lnet_peer *lp = msg->msg_txpeer;
+ struct lnet_ni *ni = lp->lp_ni;
+ struct lnet_tx_queue *tq;
+ int cpt;
+
+ /* non-lnet_send() callers have checked before */
+ LASSERT(!do_send || msg->msg_tx_delayed);
+ LASSERT(!msg->msg_receiving);
+ LASSERT(msg->msg_tx_committed);
+
+ cpt = msg->msg_tx_cpt;
+ tq = ni->ni_tx_queues[cpt];
+
+ /* NB 'lp' is always the next hop */
+ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
+ lnet_peer_alive_locked(lp) == 0) {
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
+ lnet_net_unlock(cpt);
+
+ CNETERR("Dropping message for %s: peer not alive\n",
+ libcfs_id2str(msg->msg_target));
+ if (do_send)
+ lnet_finalize(ni, msg, -EHOSTUNREACH);
+
+ lnet_net_lock(cpt);
+ return EHOSTUNREACH;
+ }
+
+ if (!msg->msg_peertxcredit) {
+ LASSERT ((lp->lp_txcredits < 0) ==
+ !list_empty(&lp->lp_txq));
+
+ msg->msg_peertxcredit = 1;
+ lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+ lp->lp_txcredits--;
+
+ if (lp->lp_txcredits < lp->lp_mintxcredits)
+ lp->lp_mintxcredits = lp->lp_txcredits;
+
+ if (lp->lp_txcredits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_txq);
+ return EAGAIN;
+ }
+ }
+
+ if (!msg->msg_txcredit) {
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ msg->msg_txcredit = 1;
+ tq->tq_credits--;
+
+ if (tq->tq_credits < tq->tq_credits_min)
+ tq->tq_credits_min = tq->tq_credits;
+
+ if (tq->tq_credits < 0) {
+ msg->msg_tx_delayed = 1;
+ list_add_tail(&msg->msg_list, &tq->tq_delayed);
+ return EAGAIN;
+ }
+ }
+
+ if (do_send) {
+ lnet_net_unlock(cpt);
+ lnet_ni_send(ni, msg);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg)
+{
+ lnet_rtrbufpool_t *rbp;
+ int cpt;
+
+ LASSERT(msg->msg_rx_committed);
+
+ cpt = msg->msg_rx_cpt;
+ rbp = &the_lnet.ln_rtrpools[cpt][0];
+
+ LASSERT(msg->msg_len <= LNET_MTU);
+ while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+ rbp++;
+ LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
+ }
+
+ return rbp;
+}
+
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
+{
+ /* lnet_parse is going to lnet_net_unlock immediately after this, so it
+ * sets do_recv FALSE and I don't do the unlock/send/lock bit. I
+ * return EAGAIN if msg blocked and 0 if received or OK to receive */
+ lnet_peer_t *lp = msg->msg_rxpeer;
+ lnet_rtrbufpool_t *rbp;
+ lnet_rtrbuf_t *rb;
+
+ LASSERT (msg->msg_iov == NULL);
+ LASSERT (msg->msg_kiov == NULL);
+ LASSERT (msg->msg_niov == 0);
+ LASSERT (msg->msg_routing);
+ LASSERT (msg->msg_receiving);
+ LASSERT (!msg->msg_sending);
+
+ /* non-lnet_parse callers only receive delayed messages */
+ LASSERT(!do_recv || msg->msg_rx_delayed);
+
+ if (!msg->msg_peerrtrcredit) {
+ LASSERT ((lp->lp_rtrcredits < 0) ==
+ !list_empty(&lp->lp_rtrq));
+
+ msg->msg_peerrtrcredit = 1;
+ lp->lp_rtrcredits--;
+ if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+ lp->lp_minrtrcredits = lp->lp_rtrcredits;
+
+ if (lp->lp_rtrcredits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+ return EAGAIN;
+ }
+ }
+
+ rbp = lnet_msg2bufpool(msg);
+
+ if (!msg->msg_rtrcredit) {
+ LASSERT ((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+
+ msg->msg_rtrcredit = 1;
+ rbp->rbp_credits--;
+ if (rbp->rbp_credits < rbp->rbp_mincredits)
+ rbp->rbp_mincredits = rbp->rbp_credits;
+
+ if (rbp->rbp_credits < 0) {
+ /* must have checked eager_recv before here */
+ LASSERT(msg->msg_rx_ready_delay);
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+ return EAGAIN;
+ }
+ }
+
+ LASSERT (!list_empty(&rbp->rbp_bufs));
+ rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+
+ msg->msg_niov = rbp->rbp_npages;
+ msg->msg_kiov = &rb->rb_kiov[0];
+
+ if (do_recv) {
+ int cpt = msg->msg_rx_cpt;
+
+ lnet_net_unlock(cpt);
+ lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+ 0, msg->msg_len, msg->msg_len);
+ lnet_net_lock(cpt);
+ }
+ return 0;
+}
+
+void
+lnet_return_tx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *txpeer = msg->msg_txpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_txcredit) {
+ struct lnet_ni *ni = txpeer->lp_ni;
+ struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt];
+
+ /* give back NI txcredits */
+ msg->msg_txcredit = 0;
+
+ LASSERT((tq->tq_credits < 0) ==
+ !list_empty(&tq->tq_delayed));
+
+ tq->tq_credits++;
+ if (tq->tq_credits <= 0) {
+ msg2 = list_entry(tq->tq_delayed.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer->lp_ni == ni);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peertxcredit) {
+ /* give back peer txcredits */
+ msg->msg_peertxcredit = 0;
+
+ LASSERT((txpeer->lp_txcredits < 0) ==
+ !list_empty(&txpeer->lp_txq));
+
+ txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+ LASSERT (txpeer->lp_txqnob >= 0);
+
+ txpeer->lp_txcredits++;
+ if (txpeer->lp_txcredits <= 0) {
+ msg2 = list_entry(txpeer->lp_txq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ LASSERT(msg2->msg_txpeer == txpeer);
+ LASSERT(msg2->msg_tx_delayed);
+
+ (void) lnet_post_send_locked(msg2, 1);
+ }
+ }
+
+ if (txpeer != NULL) {
+ msg->msg_txpeer = NULL;
+ lnet_peer_decref_locked(txpeer);
+ }
+}
+
+void
+lnet_return_rx_credits_locked(lnet_msg_t *msg)
+{
+ lnet_peer_t *rxpeer = msg->msg_rxpeer;
+ lnet_msg_t *msg2;
+
+ if (msg->msg_rtrcredit) {
+ /* give back global router credits */
+ lnet_rtrbuf_t *rb;
+ lnet_rtrbufpool_t *rbp;
+
+ /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+ * there until it gets one allocated, or aborts the wait
+ * itself */
+ LASSERT (msg->msg_kiov != NULL);
+
+ rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+ rbp = rb->rb_pool;
+ LASSERT (rbp == lnet_msg2bufpool(msg));
+
+ msg->msg_kiov = NULL;
+ msg->msg_rtrcredit = 0;
+
+ LASSERT((rbp->rbp_credits < 0) ==
+ !list_empty(&rbp->rbp_msgs));
+ LASSERT((rbp->rbp_credits > 0) ==
+ !list_empty(&rbp->rbp_bufs));
+
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+ rbp->rbp_credits++;
+ if (rbp->rbp_credits <= 0) {
+ msg2 = list_entry(rbp->rbp_msgs.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+
+ if (msg->msg_peerrtrcredit) {
+ /* give back peer router credits */
+ msg->msg_peerrtrcredit = 0;
+
+ LASSERT((rxpeer->lp_rtrcredits < 0) ==
+ !list_empty(&rxpeer->lp_rtrq));
+
+ rxpeer->lp_rtrcredits++;
+ if (rxpeer->lp_rtrcredits <= 0) {
+ msg2 = list_entry(rxpeer->lp_rtrq.next,
+ lnet_msg_t, msg_list);
+ list_del(&msg2->msg_list);
+
+ (void) lnet_post_routed_recv_locked(msg2, 1);
+ }
+ }
+ if (rxpeer != NULL) {
+ msg->msg_rxpeer = NULL;
+ lnet_peer_decref_locked(rxpeer);
+ }
+}
+
+static int
+lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
+{
+ lnet_peer_t *p1 = r1->lr_gateway;
+ lnet_peer_t *p2 = r2->lr_gateway;
+
+ if (r1->lr_hops < r2->lr_hops)
+ return 1;
+
+ if (r1->lr_hops > r2->lr_hops)
+ return -1;
+
+ if (p1->lp_txqnob < p2->lp_txqnob)
+ return 1;
+
+ if (p1->lp_txqnob > p2->lp_txqnob)
+ return -1;
+
+ if (p1->lp_txcredits > p2->lp_txcredits)
+ return 1;
+
+ if (p1->lp_txcredits < p2->lp_txcredits)
+ return -1;
+
+ if (r1->lr_seq - r2->lr_seq <= 0)
+ return 1;
+
+ return -1;
+}
+
+static lnet_peer_t *
+lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *rtr;
+ lnet_route_t *rtr_best;
+ lnet_route_t *rtr_last;
+ struct lnet_peer *lp_best;
+ struct lnet_peer *lp;
+ int rc;
+
+ /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
+ * rtr_nid nid, otherwise find the best gateway I can use */
+
+ rnet = lnet_find_net_locked(LNET_NIDNET(target));
+ if (rnet == NULL)
+ return NULL;
+
+ lp_best = NULL;
+ rtr_best = rtr_last = NULL;
+ list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) {
+ lp = rtr->lr_gateway;
+
+ if (!lp->lp_alive || /* gateway is down */
+ ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 &&
+ rtr->lr_downis != 0)) /* NI to target is down */
+ continue;
+
+ if (ni != NULL && lp->lp_ni != ni)
+ continue;
+
+ if (lp->lp_nid == rtr_nid) /* it's pre-determined router */
+ return lp;
+
+ if (lp_best == NULL) {
+ rtr_best = rtr_last = rtr;
+ lp_best = lp;
+ continue;
+ }
+
+ /* no protection on below fields, but it's harmless */
+ if (rtr_last->lr_seq - rtr->lr_seq < 0)
+ rtr_last = rtr;
+
+ rc = lnet_compare_routes(rtr, rtr_best);
+ if (rc < 0)
+ continue;
+
+ rtr_best = rtr;
+ lp_best = lp;
+ }
+
+ /* set sequence number on the best router to the latest sequence + 1
+ * so we can round-robin all routers, it's race and inaccurate but
+ * harmless and functional */
+ if (rtr_best != NULL)
+ rtr_best->lr_seq = rtr_last->lr_seq + 1;
+ return lp_best;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+ lnet_nid_t dst_nid = msg->msg_target.nid;
+ struct lnet_ni *src_ni;
+ struct lnet_ni *local_ni;
+ struct lnet_peer *lp;
+ int cpt;
+ int cpt2;
+ int rc;
+
+ /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+ * but we might want to use pre-determined router for ACK/REPLY
+ * in the future */
+ /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+ LASSERT (msg->msg_txpeer == NULL);
+ LASSERT (!msg->msg_sending);
+ LASSERT (!msg->msg_target_is_router);
+ LASSERT (!msg->msg_receiving);
+
+ msg->msg_sending = 1;
+
+ LASSERT(!msg->msg_tx_committed);
+ cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid);
+ again:
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ if (src_nid == LNET_NID_ANY) {
+ src_ni = NULL;
+ } else {
+ src_ni = lnet_nid2ni_locked(src_nid, cpt);
+ if (src_ni == NULL) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Can't send to %s: src %s is not a "
+ "local nid\n", libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+ LASSERT (!msg->msg_routing);
+ }
+
+ /* Is this for someone on a local network? */
+ local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
+
+ if (local_ni != NULL) {
+ if (src_ni == NULL) {
+ src_ni = local_ni;
+ src_nid = src_ni->ni_nid;
+ } else if (src_ni == local_ni) {
+ lnet_ni_decref_locked(local_ni, cpt);
+ } else {
+ lnet_ni_decref_locked(local_ni, cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("No route to %s via from %s\n",
+ libcfs_nid2str(dst_nid),
+ libcfs_nid2str(src_nid));
+ return -EINVAL;
+ }
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing)
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+
+ if (src_ni == the_lnet.ln_loni) {
+ /* No send credit hassles with LOLND */
+ lnet_net_unlock(cpt);
+ lnet_ni_send(src_ni, msg);
+
+ lnet_net_lock(cpt);
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+
+ rc = lnet_nid2peer_locked(&lp, dst_nid, cpt);
+ /* lp has ref on src_ni; lose mine */
+ lnet_ni_decref_locked(src_ni, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ LCONSOLE_WARN("Error %d finding peer %s\n", rc,
+ libcfs_nid2str(dst_nid));
+ /* ENOMEM or shutting down */
+ return rc;
+ }
+ LASSERT (lp->lp_ni == src_ni);
+ } else {
+ /* sending to a remote network */
+ lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid);
+ if (lp == NULL) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ LCONSOLE_WARN("No route to %s via %s "
+ "(all routers down)\n",
+ libcfs_id2str(msg->msg_target),
+ libcfs_nid2str(src_nid));
+ return -EHOSTUNREACH;
+ }
+
+ /* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
+ * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
+ * pre-determined router, this can happen if router table
+ * was changed when we release the lock */
+ if (rtr_nid != lp->lp_nid) {
+ cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid);
+ if (cpt2 != cpt) {
+ if (src_ni != NULL)
+ lnet_ni_decref_locked(src_ni, cpt);
+ lnet_net_unlock(cpt);
+
+ rtr_nid = lp->lp_nid;
+ cpt = cpt2;
+ goto again;
+ }
+ }
+
+ CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+ libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid),
+ lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+
+ if (src_ni == NULL) {
+ src_ni = lp->lp_ni;
+ src_nid = src_ni->ni_nid;
+ } else {
+ LASSERT (src_ni == lp->lp_ni);
+ lnet_ni_decref_locked(src_ni, cpt);
+ }
+
+ lnet_peer_addref_locked(lp);
+
+ LASSERT(src_nid != LNET_NID_ANY);
+ lnet_msg_commit(msg, cpt);
+
+ if (!msg->msg_routing) {
+ /* I'm the source and now I know which NI to send on */
+ msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+ }
+
+ msg->msg_target_is_router = 1;
+ msg->msg_target.nid = lp->lp_nid;
+ msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+ }
+
+ /* 'lp' is our best choice of peer */
+
+ LASSERT (!msg->msg_peertxcredit);
+ LASSERT (!msg->msg_txcredit);
+ LASSERT (msg->msg_txpeer == NULL);
+
+ msg->msg_txpeer = lp; /* msg takes my ref on lp */
+
+ rc = lnet_post_send_locked(msg, 0);
+ lnet_net_unlock(cpt);
+
+ if (rc == EHOSTUNREACH)
+ return -EHOSTUNREACH;
+
+ if (rc == 0)
+ lnet_ni_send(src_ni, msg);
+
+ return 0;
+}
+
+static void
+lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob)
+{
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += nob;
+ lnet_net_unlock(cpt);
+
+ lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
+}
+
+static void
+lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+
+ if (msg->msg_wanted != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_build_msg_event(msg, LNET_EVENT_PUT);
+
+ /* Must I ACK? If so I'll grab the ack_wmd out of the header and put
+ * it back into the ACK during lnet_finalize() */
+ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+ (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed,
+ msg->msg_offset, msg->msg_wanted, hdr->payload_length);
+}
+
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ struct lnet_match_info info;
+ int rc;
+
+ /* Convert put fields to host byte order */
+ hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL;
+
+ again:
+ rc = lnet_ptl_match_md(&info, msg);
+ switch (rc) {
+ default:
+ LBUG();
+
+ case LNET_MATCHMD_OK:
+ lnet_recv_put(ni, msg);
+ return 0;
+
+ case LNET_MATCHMD_NONE:
+ if (msg->msg_rx_delayed) /* attached on delayed list */
+ return 0;
+
+ rc = lnet_ni_eager_recv(ni, msg);
+ if (rc == 0)
+ goto again;
+ /* fall through */
+
+ case LNET_MATCHMD_DROP:
+ CNETERR("Dropping PUT from %s portal %d match "LPU64
+ " offset %d length %d: %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength, rc);
+
+ return ENOENT; /* +ve: OK but no match */
+ }
+}
+
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
+{
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_handle_wire_t reply_wmd;
+ int rc;
+
+ /* Convert get fields to host byte order */
+ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
+ hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
+
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_GET;
+ info.mi_portal = hdr->msg.get.ptl_index;
+ info.mi_rlength = hdr->msg.get.sink_length;
+ info.mi_roffset = hdr->msg.get.src_offset;
+ info.mi_mbits = hdr->msg.get.match_bits;
+
+ rc = lnet_ptl_match_md(&info, msg);
+ if (rc == LNET_MATCHMD_DROP) {
+ CNETERR("Dropping GET from %s portal %d match "LPU64
+ " offset %d length %d\n",
+ libcfs_id2str(info.mi_id), info.mi_portal,
+ info.mi_mbits, info.mi_roffset, info.mi_rlength);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT(rc == LNET_MATCHMD_OK);
+
+ lnet_build_msg_event(msg, LNET_EVENT_GET);
+
+ reply_wmd = hdr->msg.get.return_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+ msg->msg_offset, msg->msg_wanted);
+
+ msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
+
+ if (rdma_get) {
+ /* The LND completes the REPLY from her recv procedure */
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ msg->msg_offset, msg->msg_len, msg->msg_len);
+ return 0;
+ }
+
+ lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+ msg->msg_receiving = 0;
+
+ rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ /* didn't get as far as lnet_ni_send() */
+ CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+ libcfs_nid2str(ni->ni_nid),
+ libcfs_id2str(info.mi_id), rc);
+
+ lnet_finalize(ni, msg, rc);
+ }
+
+ return 0;
+}
+
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ void *private = msg->msg_private;
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int rlength;
+ int mlength;
+ int cpt;
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CNETERR("%s: Dropping REPLY from %s for %s "
+ "MD "LPX64"."LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ LASSERT (md->md_offset == 0);
+
+ rlength = hdr->payload_length;
+ mlength = MIN(rlength, (int)md->md_length);
+
+ if (mlength < rlength &&
+ (md->md_options & LNET_MD_TRUNCATE) == 0) {
+ CNETERR("%s: Dropping REPLY from %s length %d "
+ "for MD "LPX64" would overflow (%d)\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+ mlength);
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve: OK but no match */
+ }
+
+ CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, mlength);
+
+ if (mlength != 0)
+ lnet_setpayloadbuffer(msg);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+ return 0;
+}
+
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_process_id_t src = {0};
+ lnet_libmd_t *md;
+ int cpt;
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ /* Convert ack fields to host byte order */
+ hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
+ hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
+
+ cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie);
+ lnet_res_lock(cpt);
+
+ /* NB handles only looked up by creator (no flips) */
+ md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ /* Don't moan; this is expected */
+ CDEBUG(D_NET,
+ "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ (md == NULL) ? "invalid" : "inactive",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+ return ENOENT; /* +ve! */
+ }
+
+ CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+ hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_ACK);
+
+ lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+ return 0;
+}
+
+static int
+lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+ int rc = 0;
+
+ if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+ lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+ if (ni->ni_lnd->lnd_eager_recv == NULL) {
+ msg->msg_rx_ready_delay = 1;
+ } else {
+ lnet_net_unlock(msg->msg_rx_cpt);
+ rc = lnet_ni_eager_recv(ni, msg);
+ lnet_net_lock(msg->msg_rx_cpt);
+ }
+ }
+
+ if (rc == 0)
+ rc = lnet_post_routed_recv_locked(msg, 0);
+ return rc;
+}
+
+char *
+lnet_msgtyp2str (int type)
+{
+ switch (type) {
+ case LNET_MSG_ACK:
+ return ("ACK");
+ case LNET_MSG_PUT:
+ return ("PUT");
+ case LNET_MSG_GET:
+ return ("GET");
+ case LNET_MSG_REPLY:
+ return ("REPLY");
+ case LNET_MSG_HELLO:
+ return ("HELLO");
+ default:
+ return ("<UNKNOWN>");
+ }
+}
+EXPORT_SYMBOL(lnet_msgtyp2str);
+
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
+{
+ lnet_process_id_t src = {0};
+ lnet_process_id_t dst = {0};
+ char *type_str = lnet_msgtyp2str (hdr->type);
+
+ src.nid = hdr->src_nid;
+ src.pid = hdr->src_pid;
+
+ dst.nid = hdr->dest_nid;
+ dst.pid = hdr->dest_pid;
+
+ CWARN("P3 Header at %p of type %s\n", hdr, type_str);
+ CWARN(" From %s\n", libcfs_id2str(src));
+ CWARN(" To %s\n", libcfs_id2str(dst));
+
+ switch (hdr->type) {
+ default:
+ break;
+
+ case LNET_MSG_PUT:
+ CWARN(" Ptl index %d, ack md "LPX64"."LPX64", "
+ "match bits "LPU64"\n",
+ hdr->msg.put.ptl_index,
+ hdr->msg.put.ack_wmd.wh_interface_cookie,
+ hdr->msg.put.ack_wmd.wh_object_cookie,
+ hdr->msg.put.match_bits);
+ CWARN(" Length %d, offset %d, hdr data "LPX64"\n",
+ hdr->payload_length, hdr->msg.put.offset,
+ hdr->msg.put.hdr_data);
+ break;
+
+ case LNET_MSG_GET:
+ CWARN(" Ptl index %d, return md "LPX64"."LPX64", "
+ "match bits "LPU64"\n", hdr->msg.get.ptl_index,
+ hdr->msg.get.return_wmd.wh_interface_cookie,
+ hdr->msg.get.return_wmd.wh_object_cookie,
+ hdr->msg.get.match_bits);
+ CWARN(" Length %d, src offset %d\n",
+ hdr->msg.get.sink_length,
+ hdr->msg.get.src_offset);
+ break;
+
+ case LNET_MSG_ACK:
+ CWARN(" dst md "LPX64"."LPX64", "
+ "manipulated length %d\n",
+ hdr->msg.ack.dst_wmd.wh_interface_cookie,
+ hdr->msg.ack.dst_wmd.wh_object_cookie,
+ hdr->msg.ack.mlength);
+ break;
+
+ case LNET_MSG_REPLY:
+ CWARN(" dst md "LPX64"."LPX64", "
+ "length %d\n",
+ hdr->msg.reply.dst_wmd.wh_interface_cookie,
+ hdr->msg.reply.dst_wmd.wh_object_cookie,
+ hdr->payload_length);
+ }
+
+}
+
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
+ void *private, int rdma_req)
+{
+ int rc = 0;
+ int cpt;
+ int for_me;
+ struct lnet_msg *msg;
+ lnet_pid_t dest_pid;
+ lnet_nid_t dest_nid;
+ lnet_nid_t src_nid;
+ __u32 payload_length;
+ __u32 type;
+
+ LASSERT (!in_interrupt ());
+
+ type = le32_to_cpu(hdr->type);
+ src_nid = le64_to_cpu(hdr->src_nid);
+ dest_nid = le64_to_cpu(hdr->dest_nid);
+ dest_pid = le32_to_cpu(hdr->dest_pid);
+ payload_length = le32_to_cpu(hdr->payload_length);
+
+ for_me = (ni->ni_nid == dest_nid);
+ cpt = lnet_cpt_of_nid(from_nid);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ case LNET_MSG_GET:
+ if (payload_length > 0) {
+ CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), payload_length);
+ return -EPROTO;
+ }
+ break;
+
+ case LNET_MSG_PUT:
+ case LNET_MSG_REPLY:
+ if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+ CERROR("%s, src %s: bad %s payload %d "
+ "(%d max expected)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type),
+ payload_length,
+ for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+ return -EPROTO;
+ }
+ break;
+
+ default:
+ CERROR("%s, src %s: Bad message type 0x%x\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid), type);
+ return -EPROTO;
+ }
+
+ if (the_lnet.ln_routing &&
+ ni->ni_last_alive != cfs_time_current_sec()) {
+ lnet_ni_lock(ni);
+
+ /* NB: so far here is the only place to set NI status to "up */
+ ni->ni_last_alive = cfs_time_current_sec();
+ if (ni->ni_status != NULL &&
+ ni->ni_status->ns_status == LNET_NI_STATUS_DOWN)
+ ni->ni_status->ns_status = LNET_NI_STATUS_UP;
+ lnet_ni_unlock(ni);
+ }
+
+ /* Regard a bad destination NID as a protocol error. Senders should
+ * know what they're doing; if they don't they're misconfigured, buggy
+ * or malicious so we chop them off at the knees :) */
+
+ if (!for_me) {
+ if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+ /* should have gone direct */
+ CERROR ("%s, src %s: Bad dest nid %s "
+ "(should have been sent direct)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (lnet_islocalnid(dest_nid)) {
+ /* dest is another local NI; sender should have used
+ * this node's NID on its own network */
+ CERROR ("%s, src %s: Bad dest nid %s "
+ "(it's my nid but on a different network)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (rdma_req && type == LNET_MSG_GET) {
+ CERROR ("%s, src %s: Bad optimized GET for %s "
+ "(final destination must be me)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ return -EPROTO;
+ }
+
+ if (!the_lnet.ln_routing) {
+ CERROR ("%s, src %s: Dropping message for %s "
+ "(routing not enabled)\n",
+ libcfs_nid2str(from_nid),
+ libcfs_nid2str(src_nid),
+ libcfs_nid2str(dest_nid));
+ goto drop;
+ }
+ }
+
+ /* Message looks OK; we're not going to return an error, so we MUST
+ * call back lnd_recv() come what may... */
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (src_nid, 0)) /* shall we now? */
+ {
+ CERROR("%s, src %s: Dropping %s to simulate failure\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("%s, src %s: Dropping %s (out of memory)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type));
+ goto drop;
+ }
+
+ /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+
+ msg->msg_type = type;
+ msg->msg_private = private;
+ msg->msg_receiving = 1;
+ msg->msg_len = msg->msg_wanted = payload_length;
+ msg->msg_offset = 0;
+ msg->msg_hdr = *hdr;
+ /* for building message event */
+ msg->msg_from = from_nid;
+ if (!for_me) {
+ msg->msg_target.pid = dest_pid;
+ msg->msg_target.nid = dest_nid;
+ msg->msg_routing = 1;
+
+ } else {
+ /* convert common msg->hdr fields to host byteorder */
+ msg->msg_hdr.type = type;
+ msg->msg_hdr.src_nid = src_nid;
+ msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
+ msg->msg_hdr.dest_nid = dest_nid;
+ msg->msg_hdr.dest_pid = dest_pid;
+ msg->msg_hdr.payload_length = payload_length;
+ }
+
+ lnet_net_lock(cpt);
+ rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CERROR("%s, src %s: Dropping %s "
+ "(error %d looking up sender)\n",
+ libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+ lnet_msgtyp2str(type), rc);
+ lnet_msg_free(msg);
+ goto drop;
+ }
+
+ lnet_msg_commit(msg, cpt);
+
+ if (!for_me) {
+ rc = lnet_parse_forward_locked(ni, msg);
+ lnet_net_unlock(cpt);
+
+ if (rc < 0)
+ goto free_drop;
+ if (rc == 0) {
+ lnet_ni_recv(ni, msg->msg_private, msg, 0,
+ 0, payload_length, payload_length);
+ }
+ return 0;
+ }
+
+ lnet_net_unlock(cpt);
+
+ switch (type) {
+ case LNET_MSG_ACK:
+ rc = lnet_parse_ack(ni, msg);
+ break;
+ case LNET_MSG_PUT:
+ rc = lnet_parse_put(ni, msg);
+ break;
+ case LNET_MSG_GET:
+ rc = lnet_parse_get(ni, msg, rdma_req);
+ break;
+ case LNET_MSG_REPLY:
+ rc = lnet_parse_reply(ni, msg);
+ break;
+ default:
+ LASSERT(0);
+ rc = -EPROTO;
+ goto free_drop; /* prevent an unused label if !kernel */
+ }
+
+ if (rc == 0)
+ return 0;
+
+ LASSERT (rc == ENOENT);
+
+ free_drop:
+ LASSERT(msg->msg_md == NULL);
+ lnet_finalize(ni, msg, rc);
+
+ drop:
+ lnet_drop_message(ni, cpt, private, payload_length);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_parse);
+
+void
+lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
+{
+ while (!list_empty(head)) {
+ lnet_process_id_t id = {0};
+ lnet_msg_t *msg;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_md == NULL);
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+ " offset %d length %d: %s\n",
+ libcfs_id2str(id),
+ msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length, reason);
+
+ /* NB I can't drop msg's ref on msg_rxpeer until after I've
+ * called lnet_drop_message(), so I just hang onto msg as well
+ * until that's done */
+
+ lnet_drop_message(msg->msg_rxpeer->lp_ni,
+ msg->msg_rxpeer->lp_cpt,
+ msg->msg_private, msg->msg_len);
+ /*
+ * NB: message will not generate event because w/o attached MD,
+ * but we still should give error code so lnet_msg_decommit()
+ * can skip counters operations and other checks.
+ */
+ lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
+ }
+}
+
+void
+lnet_recv_delayed_msg_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ lnet_msg_t *msg;
+ lnet_process_id_t id;
+
+ msg = list_entry(head->next, lnet_msg_t, msg_list);
+ list_del(&msg->msg_list);
+
+ /* md won't disappear under me, since each msg
+ * holds a ref on it */
+
+ id.nid = msg->msg_hdr.src_nid;
+ id.pid = msg->msg_hdr.src_pid;
+
+ LASSERT(msg->msg_rx_delayed);
+ LASSERT(msg->msg_md != NULL);
+ LASSERT(msg->msg_rxpeer != NULL);
+ LASSERT(msg->msg_hdr.type == LNET_MSG_PUT);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+ "match "LPU64" offset %d length %d.\n",
+ libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index,
+ msg->msg_hdr.msg.put.match_bits,
+ msg->msg_hdr.msg.put.offset,
+ msg->msg_hdr.payload_length);
+
+ lnet_recv_put(msg->msg_rxpeer->lp_ni, msg);
+ }
+}
+
+/**
+ * Initiate an asynchronous PUT operation.
+ *
+ * There are several events associated with a PUT: completion of the send on
+ * the initiator node (LNET_EVENT_SEND), and when the send completes
+ * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating
+ * that the operation was accepted by the target. The event LNET_EVENT_PUT is
+ * used at the target node to indicate the completion of incoming data
+ * delivery.
+ *
+ * The local events will be logged in the EQ associated with the MD pointed to
+ * by \a mdh handle. Using a MD without an associated EQ results in these
+ * events being discarded. In this case, the caller must have another
+ * mechanism (e.g., a higher level protocol) for determining when it is safe
+ * to modify the memory region associated with the MD.
+ *
+ * Note that LNet does not guarantee the order of LNET_EVENT_SEND and
+ * LNET_EVENT_ACK, though intuitively ACK should happen after SEND.
+ *
+ * \param self Indicates the NID of a local interface through which to send
+ * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself.
+ * \param mdh A handle for the MD that describes the memory to be sent. The MD
+ * must be "free floating" (See LNetMDBind()).
+ * \param ack Controls whether an acknowledgment is requested.
+ * Acknowledgments are only sent when they are requested by the initiating
+ * process and the target MD enables them.
+ * \param target A process identifier for the target process.
+ * \param portal The index in the \a target's portal table.
+ * \param match_bits The match bits to use for MD selection at the target
+ * process.
+ * \param offset The offset into the target MD (only used when the target
+ * MD has the LNET_MD_MANAGE_REMOTE option set).
+ * \param hdr_data 64 bits of user data that can be included in the message
+ * header. This data is written to an event queue entry at the target if an
+ * EQ is present on the matching MD.
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists).
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ *
+ * \see lnet_event_t::hdr_data and lnet_event_kind_t.
+ */
+int
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset,
+ __u64 hdr_data)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (target.nid, 1)) /* shall we now? */
+ {
+ CERROR("Dropping PUT to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+ msg->msg_vmflush = !!memory_pressure_get();
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("Source MD also attached to portal %d\n",
+ md->md_me->me_portal);
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+ msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.put.hdr_data = hdr_data;
+
+ /* NB handles only looked up by creator (no flips) */
+ if (ack == LNET_ACK_REQ) {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+ } else {
+ msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie =
+ LNET_WIRE_HANDLE_COOKIE_NONE;
+ }
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc != 0) {
+ CNETERR( "Error sending PUT to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize (NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetPut);
+
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
+{
+ /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This
+ * returns a msg for the LND to pass to lnet_finalize() when the sink
+ * data has been received.
+ *
+ * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
+ * lnet_finalize() is called on it, so the LND must call this first */
+
+ struct lnet_msg *msg = lnet_msg_alloc();
+ struct lnet_libmd *getmd = getmsg->msg_md;
+ lnet_process_id_t peer_id = getmsg->msg_target;
+ int cpt;
+
+ LASSERT(!getmsg->msg_target_is_router);
+ LASSERT(!getmsg->msg_routing);
+
+ cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie);
+ lnet_res_lock(cpt);
+
+ LASSERT (getmd->md_refcount > 0);
+
+ if (msg == NULL) {
+ CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
+ goto drop;
+ }
+
+ if (getmd->md_threshold == 0) {
+ CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id),
+ getmd);
+ lnet_res_unlock(cpt);
+ goto drop;
+ }
+
+ LASSERT(getmd->md_offset == 0);
+
+ CDEBUG(D_NET, "%s: Reply from %s md %p\n",
+ libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
+
+ /* setup information for lnet_build_msg_event */
+ msg->msg_from = peer_id.nid;
+ msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
+ msg->msg_hdr.src_nid = peer_id.nid;
+ msg->msg_hdr.payload_length = getmd->md_length;
+ msg->msg_receiving = 1; /* required by lnet_msg_attach_md */
+
+ lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length);
+ lnet_res_unlock(cpt);
+
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ lnet_msg_commit(msg, cpt);
+ lnet_net_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_REPLY);
+
+ return msg;
+
+ drop:
+ cpt = lnet_cpt_of_nid(peer_id.nid);
+
+ lnet_net_lock(cpt);
+ the_lnet.ln_counters[cpt]->drop_count++;
+ the_lnet.ln_counters[cpt]->drop_length += getmd->md_length;
+ lnet_net_unlock(cpt);
+
+ if (msg != NULL)
+ lnet_msg_free(msg);
+
+ return NULL;
+}
+EXPORT_SYMBOL(lnet_create_reply_msg);
+
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+ /* Set the REPLY length, now the RDMA that elides the REPLY message has
+ * completed and I know it. */
+ LASSERT (reply != NULL);
+ LASSERT (reply->msg_type == LNET_MSG_GET);
+ LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+ /* NB I trusted my peer to RDMA. If she tells me she's written beyond
+ * the end of my buffer, I might as well be dead. */
+ LASSERT (len <= reply->msg_ev.mlength);
+
+ reply->msg_ev.mlength = len;
+}
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+
+/**
+ * Initiate an asynchronous GET operation.
+ *
+ * On the initiator node, an LNET_EVENT_SEND is logged when the GET request
+ * is sent, and an LNET_EVENT_REPLY is logged when the data returned from
+ * the target node in the REPLY has been written to local MD.
+ *
+ * On the target node, an LNET_EVENT_GET is logged when the GET request
+ * arrives and is accepted into a MD.
+ *
+ * \param self,target,portal,match_bits,offset See the discussion in LNetPut().
+ * \param mdh A handle for the MD that describes the memory into which the
+ * requested data will be received. The MD must be "free floating" (See LNetMDBind()).
+ *
+ * \retval 0 Success, and only in this case events will be generated
+ * and logged to EQ (if it exists) of the MD.
+ * \retval -EIO Simulated failure.
+ * \retval -ENOMEM Memory allocation failure.
+ * \retval -ENOENT Invalid MD object.
+ */
+int
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh,
+ lnet_process_id_t target, unsigned int portal,
+ __u64 match_bits, unsigned int offset)
+{
+ struct lnet_msg *msg;
+ struct lnet_libmd *md;
+ int cpt;
+ int rc;
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+ fail_peer (target.nid, 1)) /* shall we now? */
+ {
+ CERROR("Dropping GET to %s: simulated failure\n",
+ libcfs_id2str(target));
+ return -EIO;
+ }
+
+ msg = lnet_msg_alloc();
+ if (msg == NULL) {
+ CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+ libcfs_id2str(target));
+ return -ENOMEM;
+ }
+
+ cpt = lnet_cpt_of_cookie(mdh.cookie);
+ lnet_res_lock(cpt);
+
+ md = lnet_handle2md(&mdh);
+ if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) {
+ CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n",
+ match_bits, portal, libcfs_id2str(target),
+ md == NULL ? -1 : md->md_threshold);
+ if (md != NULL && md->md_me != NULL)
+ CERROR("REPLY MD also attached to portal %d\n",
+ md->md_me->me_portal);
+
+ lnet_res_unlock(cpt);
+
+ lnet_msg_free(msg);
+
+ return -ENOENT;
+ }
+
+ CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
+
+ lnet_msg_attach_md(msg, md, 0, 0);
+
+ lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
+
+ msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+ msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+ msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+ msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
+
+ /* NB handles only looked up by creator (no flips) */
+ msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie =
+ the_lnet.ln_interface_cookie;
+ msg->msg_hdr.msg.get.return_wmd.wh_object_cookie =
+ md->md_lh.lh_cookie;
+
+ lnet_res_unlock(cpt);
+
+ lnet_build_msg_event(msg, LNET_EVENT_SEND);
+
+ rc = lnet_send(self, msg, LNET_NID_ANY);
+ if (rc < 0) {
+ CNETERR( "Error sending GET to %s: %d\n",
+ libcfs_id2str(target), rc);
+ lnet_finalize (NULL, msg, rc);
+ }
+
+ /* completion will be signalled by an event */
+ return 0;
+}
+EXPORT_SYMBOL(LNetGet);
+
+/**
+ * Calculate distance to node at \a dstnid.
+ *
+ * \param dstnid Target NID.
+ * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid
+ * is saved here.
+ * \param orderp If not NULL, order of the route to reach \a dstnid is saved
+ * here.
+ *
+ * \retval 0 If \a dstnid belongs to a local interface, and reserved option
+ * local_nid_dist_zero is set, which is the default.
+ * \retval positives Distance to target NID, i.e. number of hops plus one.
+ * \retval -EHOSTUNREACH If \a dstnid is not reachable.
+ */
+int
+LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp)
+{
+ struct list_head *e;
+ struct lnet_ni *ni;
+ lnet_remotenet_t *rnet;
+ __u32 dstnet = LNET_NIDNET(dstnid);
+ int hops;
+ int cpt;
+ __u32 order = 2;
+ struct list_head *rn_list;
+
+ /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+ * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+ * keep order 0 free for 0@lo and order 1 free for a local NID
+ * match */
+
+ LASSERT (the_lnet.ln_init);
+ LASSERT (the_lnet.ln_refcount > 0);
+
+ cpt = lnet_net_lock_current();
+
+ list_for_each (e, &the_lnet.ln_nis) {
+ ni = list_entry(e, lnet_ni_t, ni_list);
+
+ if (ni->ni_nid == dstnid) {
+ if (srcnidp != NULL)
+ *srcnidp = dstnid;
+ if (orderp != NULL) {
+ if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+ *orderp = 0;
+ else
+ *orderp = 1;
+ }
+ lnet_net_unlock(cpt);
+
+ return local_nid_dist_zero ? 0 : 1;
+ }
+
+ if (LNET_NIDNET(ni->ni_nid) == dstnet) {
+ if (srcnidp != NULL)
+ *srcnidp = ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return 1;
+ }
+
+ order++;
+ }
+
+ rn_list = lnet_net2rnethash(dstnet);
+ list_for_each(e, rn_list) {
+ rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == dstnet) {
+ lnet_route_t *route;
+ lnet_route_t *shortest = NULL;
+
+ LASSERT (!list_empty(&rnet->lrn_routes));
+
+ list_for_each_entry(route, &rnet->lrn_routes,
+ lr_list) {
+ if (shortest == NULL ||
+ route->lr_hops < shortest->lr_hops)
+ shortest = route;
+ }
+
+ LASSERT (shortest != NULL);
+ hops = shortest->lr_hops;
+ if (srcnidp != NULL)
+ *srcnidp = shortest->lr_gateway->lp_ni->ni_nid;
+ if (orderp != NULL)
+ *orderp = order;
+ lnet_net_unlock(cpt);
+ return hops + 1;
+ }
+ order++;
+ }
+
+ lnet_net_unlock(cpt);
+ return -EHOSTUNREACH;
+}
+EXPORT_SYMBOL(LNetDist);
+
+/**
+ * Set the number of asynchronous messages expected from a target process.
+ *
+ * This function is only meaningful for userspace callers. It's a no-op when
+ * called from kernel.
+ *
+ * Asynchronous messages are those that can come from a target when the
+ * userspace process is not waiting for IO to complete; e.g., AST callbacks
+ * from Lustre servers. Specifying the expected number of such messages
+ * allows them to be eagerly received when user process is not running in
+ * LNet; otherwise network errors may occur.
+ *
+ * \param id Process ID of the target process.
+ * \param nasync Number of asynchronous messages expected from the target.
+ *
+ * \return 0 on success, and an error code otherwise.
+ */
+int
+LNetSetAsync(lnet_process_id_t id, int nasync)
+{
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetAsync);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c
new file mode 100644
index 000000000000..8f3a50bd5f69
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c
@@ -0,0 +1,650 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-msg.c
+ *
+ * Message decoding, parsing and finalizing routines
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+void
+lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev)
+{
+ ENTRY;
+
+ memset(ev, 0, sizeof(*ev));
+
+ ev->status = 0;
+ ev->unlinked = 1;
+ ev->type = LNET_EVENT_UNLINK;
+ lnet_md_deconstruct(md, &ev->md);
+ lnet_md2handle(&ev->md_handle, md);
+ EXIT;
+}
+
+/*
+ * Don't need any lock, must be called after lnet_commit_md
+ */
+void
+lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type)
+{
+ lnet_hdr_t *hdr = &msg->msg_hdr;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_routing);
+
+ ev->type = ev_type;
+
+ if (ev_type == LNET_EVENT_SEND) {
+ /* event for active message */
+ ev->target.nid = le64_to_cpu(hdr->dest_nid);
+ ev->target.pid = le32_to_cpu(hdr->dest_pid);
+ ev->initiator.nid = LNET_NID_ANY;
+ ev->initiator.pid = the_lnet.ln_pid;
+ ev->sender = LNET_NID_ANY;
+
+ } else {
+ /* event for passive message */
+ ev->target.pid = hdr->dest_pid;
+ ev->target.nid = hdr->dest_nid;
+ ev->initiator.pid = hdr->src_pid;
+ ev->initiator.nid = hdr->src_nid;
+ ev->rlength = hdr->payload_length;
+ ev->sender = msg->msg_from;
+ ev->mlength = msg->msg_wanted;
+ ev->offset = msg->msg_offset;
+ }
+
+ switch (ev_type) {
+ default:
+ LBUG();
+
+ case LNET_EVENT_PUT: /* passive PUT */
+ ev->pt_index = hdr->msg.put.ptl_index;
+ ev->match_bits = hdr->msg.put.match_bits;
+ ev->hdr_data = hdr->msg.put.hdr_data;
+ return;
+
+ case LNET_EVENT_GET: /* passive GET */
+ ev->pt_index = hdr->msg.get.ptl_index;
+ ev->match_bits = hdr->msg.get.match_bits;
+ ev->hdr_data = 0;
+ return;
+
+ case LNET_EVENT_ACK: /* ACK */
+ ev->match_bits = hdr->msg.ack.match_bits;
+ ev->mlength = hdr->msg.ack.mlength;
+ return;
+
+ case LNET_EVENT_REPLY: /* REPLY */
+ return;
+
+ case LNET_EVENT_SEND: /* active message */
+ if (msg->msg_type == LNET_MSG_PUT) {
+ ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+ ev->offset = le32_to_cpu(hdr->msg.put.offset);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->payload_length);
+ ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data);
+
+ } else {
+ LASSERT(msg->msg_type == LNET_MSG_GET);
+ ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index);
+ ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
+ ev->mlength =
+ ev->rlength = le32_to_cpu(hdr->msg.get.sink_length);
+ ev->offset = le32_to_cpu(hdr->msg.get.src_offset);
+ ev->hdr_data = 0;
+ }
+ return;
+ }
+}
+
+void
+lnet_msg_commit(lnet_msg_t *msg, int cpt)
+{
+ struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
+ lnet_counters_t *counters = the_lnet.ln_counters[cpt];
+
+ /* routed message can be committed for both receiving and sending */
+ LASSERT(!msg->msg_tx_committed);
+
+ if (msg->msg_sending) {
+ LASSERT(!msg->msg_receiving);
+
+ msg->msg_tx_cpt = cpt;
+ msg->msg_tx_committed = 1;
+ if (msg->msg_rx_committed) { /* routed message REPLY */
+ LASSERT(msg->msg_onactivelist);
+ return;
+ }
+ } else {
+ LASSERT(!msg->msg_sending);
+ msg->msg_rx_cpt = cpt;
+ msg->msg_rx_committed = 1;
+ }
+
+ LASSERT(!msg->msg_onactivelist);
+ msg->msg_onactivelist = 1;
+ list_add(&msg->msg_activelist, &container->msc_active);
+
+ counters->msgs_alloc++;
+ if (counters->msgs_alloc > counters->msgs_max)
+ counters->msgs_max = counters->msgs_alloc;
+}
+
+static void
+lnet_msg_decommit_tx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(msg->msg_tx_committed);
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_tx_cpt];
+ switch (ev->type) {
+ default: /* routed message */
+ LASSERT(msg->msg_routing);
+ LASSERT(msg->msg_rx_committed);
+ LASSERT(ev->type == 0);
+
+ counters->route_length += msg->msg_len;
+ counters->route_count++;
+ goto out;
+
+ case LNET_EVENT_PUT:
+ /* should have been decommitted */
+ LASSERT(!msg->msg_rx_committed);
+ /* overwritten while sending ACK */
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ msg->msg_type = LNET_MSG_PUT; /* fix type */
+ break;
+
+ case LNET_EVENT_SEND:
+ LASSERT(!msg->msg_rx_committed);
+ if (msg->msg_type == LNET_MSG_PUT)
+ counters->send_length += msg->msg_len;
+ break;
+
+ case LNET_EVENT_GET:
+ LASSERT(msg->msg_rx_committed);
+ /* overwritten while sending reply, we should never be
+ * here for optimized GET */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY);
+ msg->msg_type = LNET_MSG_GET; /* fix type */
+ break;
+ }
+
+ counters->send_count++;
+ out:
+ lnet_return_tx_credits_locked(msg);
+ msg->msg_tx_committed = 0;
+}
+
+static void
+lnet_msg_decommit_rx(lnet_msg_t *msg, int status)
+{
+ lnet_counters_t *counters;
+ lnet_event_t *ev = &msg->msg_ev;
+
+ LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
+ LASSERT(msg->msg_rx_committed);
+
+ if (status != 0)
+ goto out;
+
+ counters = the_lnet.ln_counters[msg->msg_rx_cpt];
+ switch (ev->type) {
+ default:
+ LASSERT(ev->type == 0);
+ LASSERT(msg->msg_routing);
+ goto out;
+
+ case LNET_EVENT_ACK:
+ LASSERT(msg->msg_type == LNET_MSG_ACK);
+ break;
+
+ case LNET_EVENT_GET:
+ /* type is "REPLY" if it's an optimized GET on passive side,
+ * because optimized GET will never be committed for sending,
+ * so message type wouldn't be changed back to "GET" by
+ * lnet_msg_decommit_tx(), see details in lnet_parse_get() */
+ LASSERT(msg->msg_type == LNET_MSG_REPLY ||
+ msg->msg_type == LNET_MSG_GET);
+ counters->send_length += msg->msg_wanted;
+ break;
+
+ case LNET_EVENT_PUT:
+ LASSERT(msg->msg_type == LNET_MSG_PUT);
+ break;
+
+ case LNET_EVENT_REPLY:
+ /* type is "GET" if it's an optimized GET on active side,
+ * see details in lnet_create_reply_msg() */
+ LASSERT(msg->msg_type == LNET_MSG_GET ||
+ msg->msg_type == LNET_MSG_REPLY);
+ break;
+ }
+
+ counters->recv_count++;
+ if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
+ counters->recv_length += msg->msg_wanted;
+
+ out:
+ lnet_return_rx_credits_locked(msg);
+ msg->msg_rx_committed = 0;
+}
+
+void
+lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status)
+{
+ int cpt2 = cpt;
+
+ LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
+ LASSERT(msg->msg_onactivelist);
+
+ if (msg->msg_tx_committed) { /* always decommit for sending first */
+ LASSERT(cpt == msg->msg_tx_cpt);
+ lnet_msg_decommit_tx(msg, status);
+ }
+
+ if (msg->msg_rx_committed) {
+ /* forwarding msg committed for both receiving and sending */
+ if (cpt != msg->msg_rx_cpt) {
+ lnet_net_unlock(cpt);
+ cpt2 = msg->msg_rx_cpt;
+ lnet_net_lock(cpt2);
+ }
+ lnet_msg_decommit_rx(msg, status);
+ }
+
+ list_del(&msg->msg_activelist);
+ msg->msg_onactivelist = 0;
+
+ the_lnet.ln_counters[cpt2]->msgs_alloc--;
+
+ if (cpt2 != cpt) {
+ lnet_net_unlock(cpt2);
+ lnet_net_lock(cpt);
+ }
+}
+
+void
+lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md,
+ unsigned int offset, unsigned int mlen)
+{
+ /* NB: @offset and @len are only useful for receiving */
+ /* Here, we attach the MD on lnet_msg and mark it busy and
+ * decrementing its threshold. Come what may, the lnet_msg "owns"
+ * the MD until a call to lnet_msg_detach_md or lnet_finalize()
+ * signals completion. */
+ LASSERT(!msg->msg_routing);
+
+ msg->msg_md = md;
+ if (msg->msg_receiving) { /* commited for receiving */
+ msg->msg_offset = offset;
+ msg->msg_wanted = mlen;
+ }
+
+ md->md_refcount++;
+ if (md->md_threshold != LNET_MD_THRESH_INF) {
+ LASSERT(md->md_threshold > 0);
+ md->md_threshold--;
+ }
+
+ /* build umd in event */
+ lnet_md2handle(&msg->msg_ev.md_handle, md);
+ lnet_md_deconstruct(md, &msg->msg_ev.md);
+}
+
+void
+lnet_msg_detach_md(lnet_msg_t *msg, int status)
+{
+ lnet_libmd_t *md = msg->msg_md;
+ int unlink;
+
+ /* Now it's safe to drop my caller's ref */
+ md->md_refcount--;
+ LASSERT(md->md_refcount >= 0);
+
+ unlink = lnet_md_unlinkable(md);
+ if (md->md_eq != NULL) {
+ msg->msg_ev.status = status;
+ msg->msg_ev.unlinked = unlink;
+ lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
+ }
+
+ if (unlink)
+ lnet_md_unlink(md);
+
+ msg->msg_md = NULL;
+}
+
+static int
+lnet_complete_msg_locked(lnet_msg_t *msg, int cpt)
+{
+ lnet_handle_wire_t ack_wmd;
+ int rc;
+ int status = msg->msg_ev.status;
+
+ LASSERT (msg->msg_onactivelist);
+
+ if (status == 0 && msg->msg_ack) {
+ /* Only send an ACK if the PUT completed successfully */
+
+ lnet_msg_decommit(msg, cpt, 0);
+
+ msg->msg_ack = 0;
+ lnet_net_unlock(cpt);
+
+ LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+ LASSERT(!msg->msg_routing);
+
+ ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+
+ lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+ msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+ msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+ msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+
+ /* NB: we probably want to use NID of msg::msg_from as 3rd
+ * parameter (router NID) if it's routed message */
+ rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is commited for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either
+ * because CPT for sending can be different with CPT for
+ * receiving, so we should return back to lnet_finalize()
+ * to make sure we are locking the correct partition.
+ */
+ return rc;
+
+ } else if (status == 0 && /* OK so far */
+ (msg->msg_routing && !msg->msg_sending)) {
+ /* not forwarded */
+ LASSERT(!msg->msg_receiving); /* called back recv already */
+ lnet_net_unlock(cpt);
+
+ rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);
+
+ lnet_net_lock(cpt);
+ /*
+ * NB: message is committed for sending, we should return
+ * on success because LND will finalize this message later.
+ *
+ * Also, there is possibility that message is commited for
+ * sending and also failed before delivering to LND,
+ * i.e: ENOMEM, in that case we can't fall through either:
+ * - The rule is message must decommit for sending first if
+ * the it's committed for both sending and receiving
+ * - CPT for sending can be different with CPT for receiving,
+ * so we should return back to lnet_finalize() to make
+ * sure we are locking the correct partition.
+ */
+ return rc;
+ }
+
+ lnet_msg_decommit(msg, cpt, status);
+ lnet_msg_free_locked(msg);
+ return 0;
+}
+
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
+{
+ struct lnet_msg_container *container;
+ int my_slot;
+ int cpt;
+ int rc;
+ int i;
+
+ LASSERT (!in_interrupt ());
+
+ if (msg == NULL)
+ return;
+#if 0
+ CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+ lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+ msg->msg_target_is_router ? "t" : "",
+ msg->msg_routing ? "X" : "",
+ msg->msg_ack ? "A" : "",
+ msg->msg_sending ? "S" : "",
+ msg->msg_receiving ? "R" : "",
+ msg->msg_delayed ? "d" : "",
+ msg->msg_txcredit ? "C" : "",
+ msg->msg_peertxcredit ? "c" : "",
+ msg->msg_rtrcredit ? "F" : "",
+ msg->msg_peerrtrcredit ? "f" : "",
+ msg->msg_onactivelist ? "!" : "",
+ msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+ msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+ msg->msg_ev.status = status;
+
+ if (msg->msg_md != NULL) {
+ cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+
+ lnet_res_lock(cpt);
+ lnet_msg_detach_md(msg, status);
+ lnet_res_unlock(cpt);
+ }
+
+ again:
+ rc = 0;
+ if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
+ /* not commited to network yet */
+ LASSERT(!msg->msg_onactivelist);
+ lnet_msg_free(msg);
+ return;
+ }
+
+ /*
+ * NB: routed message can be commited for both receiving and sending,
+ * we should finalize in LIFO order and keep counters correct.
+ * (finalize sending first then finalize receiving)
+ */
+ cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
+ lnet_net_lock(cpt);
+
+ container = the_lnet.ln_msg_containers[cpt];
+ list_add_tail(&msg->msg_list, &container->msc_finalizing);
+
+ /* Recursion breaker. Don't complete the message here if I am (or
+ * enough other threads are) already completing messages */
+
+ my_slot = -1;
+ for (i = 0; i < container->msc_nfinalizers; i++) {
+ if (container->msc_finalizers[i] == current)
+ break;
+
+ if (my_slot < 0 && container->msc_finalizers[i] == NULL)
+ my_slot = i;
+ }
+
+ if (i < container->msc_nfinalizers || my_slot < 0) {
+ lnet_net_unlock(cpt);
+ return;
+ }
+
+ container->msc_finalizers[my_slot] = current;
+
+ while (!list_empty(&container->msc_finalizing)) {
+ msg = list_entry(container->msc_finalizing.next,
+ lnet_msg_t, msg_list);
+
+ list_del(&msg->msg_list);
+
+ /* NB drops and regains the lnet lock if it actually does
+ * anything, so my finalizing friends can chomp along too */
+ rc = lnet_complete_msg_locked(msg, cpt);
+ if (rc != 0)
+ break;
+ }
+
+ container->msc_finalizers[my_slot] = NULL;
+ lnet_net_unlock(cpt);
+
+ if (rc != 0)
+ goto again;
+}
+EXPORT_SYMBOL(lnet_finalize);
+
+void
+lnet_msg_container_cleanup(struct lnet_msg_container *container)
+{
+ int count = 0;
+
+ if (container->msc_init == 0)
+ return;
+
+ while (!list_empty(&container->msc_active)) {
+ lnet_msg_t *msg = list_entry(container->msc_active.next,
+ lnet_msg_t, msg_activelist);
+
+ LASSERT(msg->msg_onactivelist);
+ msg->msg_onactivelist = 0;
+ list_del(&msg->msg_activelist);
+ lnet_msg_free(msg);
+ count++;
+ }
+
+ if (count > 0)
+ CERROR("%d active msg on exit\n", count);
+
+ if (container->msc_finalizers != NULL) {
+ LIBCFS_FREE(container->msc_finalizers,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+ container->msc_finalizers = NULL;
+ }
+#ifdef LNET_USE_LIB_FREELIST
+ lnet_freelist_fini(&container->msc_freelist);
+#endif
+ container->msc_init = 0;
+}
+
+int
+lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
+{
+ int rc;
+
+ container->msc_init = 1;
+
+ INIT_LIST_HEAD(&container->msc_active);
+ INIT_LIST_HEAD(&container->msc_finalizing);
+
+#ifdef LNET_USE_LIB_FREELIST
+ memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t));
+
+ rc = lnet_freelist_init(&container->msc_freelist,
+ LNET_FL_MAX_MSGS, sizeof(lnet_msg_t));
+ if (rc != 0) {
+ CERROR("Failed to init freelist for message container\n");
+ lnet_msg_container_cleanup(container);
+ return rc;
+ }
+#else
+ rc = 0;
+#endif
+ /* number of CPUs */
+ container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
+
+ LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
+ container->msc_nfinalizers *
+ sizeof(*container->msc_finalizers));
+
+ if (container->msc_finalizers == NULL) {
+ CERROR("Failed to allocate message finalizers\n");
+ lnet_msg_container_cleanup(container);
+ return -ENOMEM;
+ }
+
+ return rc;
+}
+
+void
+lnet_msg_containers_destroy(void)
+{
+ struct lnet_msg_container *container;
+ int i;
+
+ if (the_lnet.ln_msg_containers == NULL)
+ return;
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
+ lnet_msg_container_cleanup(container);
+
+ cfs_percpt_free(the_lnet.ln_msg_containers);
+ the_lnet.ln_msg_containers = NULL;
+}
+
+int
+lnet_msg_containers_create(void)
+{
+ struct lnet_msg_container *container;
+ int rc;
+ int i;
+
+ the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*container));
+
+ if (the_lnet.ln_msg_containers == NULL) {
+ CERROR("Failed to allocate cpu-partition data for network\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
+ rc = lnet_msg_container_setup(container, i);
+ if (rc != 0) {
+ lnet_msg_containers_destroy();
+ return rc;
+ }
+ }
+
+ return 0;
+}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
new file mode 100644
index 000000000000..9b9e7d3139b0
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c
@@ -0,0 +1,938 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/lib-ptl.c
+ *
+ * portal & match routines
+ *
+ * Author: liang@whamcloud.com
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+/* NB: add /proc interfaces in upcoming patches */
+int portal_rotor = LNET_PTL_ROTOR_HASH_RT;
+CFS_MODULE_PARM(portal_rotor, "i", int, 0644,
+ "redirect PUTs to different cpu-partitions");
+
+static int
+lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id,
+ __u64 mbits, __u64 ignore_bits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[index];
+ int unique;
+
+ unique = ignore_bits == 0 &&
+ match_id.nid != LNET_NID_ANY &&
+ match_id.pid != LNET_PID_ANY;
+
+ LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl));
+
+ /* prefer to check w/o any lock */
+ if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl)))
+ goto match;
+
+ /* unset, new portal */
+ lnet_ptl_lock(ptl);
+ /* check again with lock */
+ if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) {
+ lnet_ptl_unlock(ptl);
+ goto match;
+ }
+
+ /* still not set */
+ if (unique)
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE);
+ else
+ lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD);
+
+ lnet_ptl_unlock(ptl);
+
+ return 1;
+
+ match:
+ if ((lnet_ptl_is_unique(ptl) && !unique) ||
+ (lnet_ptl_is_wildcard(ptl) && unique))
+ return 0;
+ return 1;
+}
+
+static void
+lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ mtable->mt_enabled = 1;
+
+ ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt;
+ for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) {
+ LASSERT(ptl->ptl_mt_maps[i] != cpt);
+ if (ptl->ptl_mt_maps[i] < cpt)
+ break;
+
+ /* swap to order */
+ ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i];
+ ptl->ptl_mt_maps[i] = cpt;
+ }
+
+ ptl->ptl_mt_nmaps++;
+}
+
+static void
+lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt)
+{
+ struct lnet_match_table *mtable = ptl->ptl_mtables[cpt];
+ int i;
+
+ /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ if (LNET_CPT_NUMBER == 1)
+ return; /* never disable the only match-table */
+
+ mtable->mt_enabled = 0;
+
+ LASSERT(ptl->ptl_mt_nmaps > 0 &&
+ ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER);
+
+ /* remove it from mt_maps */
+ ptl->ptl_mt_nmaps--;
+ for (i = 0; i < ptl->ptl_mt_nmaps; i++) {
+ if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */
+ ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1];
+ }
+}
+
+static int
+lnet_try_match_md(lnet_libmd_t *md,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock;
+ * lnet_match_blocked_msg() relies on this to avoid races */
+ unsigned int offset;
+ unsigned int mlength;
+ lnet_me_t *me = md->md_me;
+
+ /* MD exhausted */
+ if (lnet_md_exhausted(md))
+ return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED;
+
+ /* mismatched MD op */
+ if ((md->md_options & info->mi_opc) == 0)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME nid/pid? */
+ if (me->me_match_id.nid != LNET_NID_ANY &&
+ me->me_match_id.nid != info->mi_id.nid)
+ return LNET_MATCHMD_NONE;
+
+ if (me->me_match_id.pid != LNET_PID_ANY &&
+ me->me_match_id.pid != info->mi_id.pid)
+ return LNET_MATCHMD_NONE;
+
+ /* mismatched ME matchbits? */
+ if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0)
+ return LNET_MATCHMD_NONE;
+
+ /* Hurrah! This _is_ a match; check it out... */
+
+ if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+ offset = md->md_offset;
+ else
+ offset = info->mi_roffset;
+
+ if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+ mlength = md->md_max_size;
+ LASSERT(md->md_offset + mlength <= md->md_length);
+ } else {
+ mlength = md->md_length - offset;
+ }
+
+ if (info->mi_rlength <= mlength) { /* fits in allowed space */
+ mlength = info->mi_rlength;
+ } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+ /* this packet _really_ is too big */
+ CERROR("Matching packet from %s, match "LPU64
+ " length %d too big: %d left, %d allowed\n",
+ libcfs_id2str(info->mi_id), info->mi_mbits,
+ info->mi_rlength, md->md_length - offset, mlength);
+
+ return LNET_MATCHMD_DROP;
+ }
+
+ /* Commit to this ME/MD */
+ CDEBUG(D_NET, "Incoming %s index %x from %s of "
+ "length %d/%d into md "LPX64" [%d] + %d\n",
+ (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get",
+ info->mi_portal, libcfs_id2str(info->mi_id), mlength,
+ info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset);
+
+ lnet_msg_attach_md(msg, md, offset, mlength);
+ md->md_offset = offset + mlength;
+
+ if (!lnet_md_exhausted(md))
+ return LNET_MATCHMD_OK;
+
+ /* Auto-unlink NOW, so the ME gets unlinked if required.
+ * We bumped md->md_refcount above so the MD just gets flagged
+ * for unlink when it is finalized. */
+ if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0)
+ lnet_md_unlink(md);
+
+ return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED;
+}
+
+static struct lnet_match_table *
+lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits)
+{
+ if (LNET_CPT_NUMBER == 1)
+ return ptl->ptl_mtables[0]; /* the only one */
+
+ /* if it's a unique portal, return match-table hashed by NID */
+ return lnet_ptl_is_unique(ptl) ?
+ ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL;
+}
+
+struct lnet_match_table *
+lnet_mt_of_attach(unsigned int index, lnet_process_id_t id,
+ __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos)
+{
+ struct lnet_portal *ptl;
+ struct lnet_match_table *mtable;
+
+ /* NB: called w/o lock */
+ LASSERT(index < the_lnet.ln_nportals);
+
+ if (!lnet_ptl_match_type(index, id, mbits, ignore_bits))
+ return NULL;
+
+ ptl = the_lnet.ln_portals[index];
+
+ mtable = lnet_match2mt(ptl, id, mbits);
+ if (mtable != NULL) /* unique portal or only one match-table */
+ return mtable;
+
+ /* it's a wildcard portal */
+ switch (pos) {
+ default:
+ return NULL;
+ case LNET_INS_BEFORE:
+ case LNET_INS_AFTER:
+ /* posted by no affinity thread, always hash to specific
+ * match-table to avoid buffer stealing which is heavy */
+ return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER];
+ case LNET_INS_LOCAL:
+ /* posted by cpu-affinity thread */
+ return ptl->ptl_mtables[lnet_cpt_current()];
+ }
+}
+
+static struct lnet_match_table *
+lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ int nmaps;
+ int rotor;
+ int routed;
+ int cpt;
+
+ /* NB: called w/o lock */
+ LASSERT(info->mi_portal < the_lnet.ln_nportals);
+ ptl = the_lnet.ln_portals[info->mi_portal];
+
+ LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl));
+
+ mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits);
+ if (mtable != NULL)
+ return mtable;
+
+ /* it's a wildcard portal */
+ routed = LNET_NIDNET(msg->msg_hdr.src_nid) !=
+ LNET_NIDNET(msg->msg_hdr.dest_nid);
+
+ if (portal_rotor == LNET_PTL_ROTOR_OFF ||
+ (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) {
+ cpt = lnet_cpt_current();
+ if (ptl->ptl_mtables[cpt]->mt_enabled)
+ return ptl->ptl_mtables[cpt];
+ }
+
+ rotor = ptl->ptl_rotor++; /* get round-robin factor */
+ if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed)
+ cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid);
+ else
+ cpt = rotor % LNET_CPT_NUMBER;
+
+ if (!ptl->ptl_mtables[cpt]->mt_enabled) {
+ /* is there any active entry for this portal? */
+ nmaps = ptl->ptl_mt_nmaps;
+ /* map to an active mtable to avoid heavy "stealing" */
+ if (nmaps != 0) {
+ /* NB: there is possibility that ptl_mt_maps is being
+ * changed because we are not under protection of
+ * lnet_ptl_lock, but it shouldn't hurt anything */
+ cpt = ptl->ptl_mt_maps[rotor % nmaps];
+ }
+ }
+
+ return ptl->ptl_mtables[cpt];
+}
+
+static int
+lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos)
+{
+ __u64 *bmap;
+ int i;
+
+ if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ return 0;
+
+ if (pos < 0) { /* check all bits */
+ for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) {
+ if (mtable->mt_exhausted[i] != (__u64)(-1))
+ return 0;
+ }
+ return 1;
+ }
+
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+ /* mtable::mt_mhash[pos] is marked as exhausted or not */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ return ((*bmap) & (1ULL << pos)) != 0;
+}
+
+static void
+lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted)
+{
+ __u64 *bmap;
+
+ LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]));
+ LASSERT(pos <= LNET_MT_HASH_IGNORE);
+
+ /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */
+ bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64];
+ pos &= (1 << LNET_MT_BITS_U64) - 1;
+
+ if (!exhausted)
+ *bmap &= ~(1ULL << pos);
+ else
+ *bmap |= 1ULL << pos;
+}
+
+struct list_head *
+lnet_mt_match_head(struct lnet_match_table *mtable,
+ lnet_process_id_t id, __u64 mbits)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal];
+
+ if (lnet_ptl_is_wildcard(ptl)) {
+ return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK];
+ } else {
+ unsigned long hash = mbits + id.nid + id.pid;
+
+ LASSERT(lnet_ptl_is_unique(ptl));
+ hash = cfs_hash_long(hash, LNET_MT_HASH_BITS);
+ return &mtable->mt_mhash[hash];
+ }
+}
+
+int
+lnet_mt_match_md(struct lnet_match_table *mtable,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct list_head *head;
+ lnet_me_t *me;
+ lnet_me_t *tmp;
+ int exhausted = 0;
+ int rc;
+
+ /* any ME with ignore bits? */
+ if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE]))
+ head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE];
+ else
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ again:
+ /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */
+ if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal]))
+ exhausted = LNET_MATCHMD_EXHAUSTED;
+
+ list_for_each_entry_safe(me, tmp, head, me_list) {
+ /* ME attached but MD not attached yet */
+ if (me->me_md == NULL)
+ continue;
+
+ LASSERT(me == me->me_md->md_me);
+
+ rc = lnet_try_match_md(me->me_md, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) == 0)
+ exhausted = 0; /* mlist is not empty */
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) {
+ /* don't return EXHAUSTED bit because we don't know
+ * whether the mlist is empty or not */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+ }
+ }
+
+ if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */
+ lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1);
+ if (!lnet_mt_test_exhausted(mtable, -1))
+ exhausted = 0;
+ }
+
+ if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) {
+ head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits);
+ goto again; /* re-check MEs w/o ignore-bits */
+ }
+
+ if (info->mi_opc == LNET_MD_OP_GET ||
+ !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal]))
+ return LNET_MATCHMD_DROP | exhausted;
+
+ return LNET_MATCHMD_NONE | exhausted;
+}
+
+static int
+lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg)
+{
+ int rc;
+
+ /* message arrived before any buffer posting on this portal,
+ * simply delay or drop this message */
+ if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)))
+ return 0;
+
+ lnet_ptl_lock(ptl);
+ /* check it again with hold of lock */
+ if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) {
+ lnet_ptl_unlock(ptl);
+ return 0;
+ }
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ if (msg->msg_rx_ready_delay) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ }
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+
+ lnet_ptl_unlock(ptl);
+ return rc;
+}
+
+static int
+lnet_ptl_match_delay(struct lnet_portal *ptl,
+ struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ int first = ptl->ptl_mt_maps[0]; /* read w/o lock */
+ int rc = 0;
+ int i;
+
+ /* steal buffer from other CPTs, and delay it if nothing to steal,
+ * this function is more expensive than a regular match, but we
+ * don't expect it can happen a lot */
+ LASSERT(lnet_ptl_is_wildcard(ptl));
+
+ for (i = 0; i < LNET_CPT_NUMBER; i++) {
+ struct lnet_match_table *mtable;
+ int cpt;
+
+ cpt = (first + i) % LNET_CPT_NUMBER;
+ mtable = ptl->ptl_mtables[cpt];
+ if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled)
+ continue;
+
+ lnet_res_lock(cpt);
+ lnet_ptl_lock(ptl);
+
+ if (i == 0) { /* the first try, attach on stealing list */
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_stealing);
+ }
+
+ if (!list_empty(&msg->msg_list)) { /* on stealing list */
+ rc = lnet_mt_match_md(mtable, info, msg);
+
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 &&
+ mtable->mt_enabled)
+ lnet_ptl_disable_mt(ptl, cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0)
+ list_del_init(&msg->msg_list);
+
+ } else {
+ /* could be matched by lnet_ptl_attach_md()
+ * which is called by another thread */
+ rc = msg->msg_md == NULL ?
+ LNET_MATCHMD_DROP : LNET_MATCHMD_OK;
+ }
+
+ if (!list_empty(&msg->msg_list) && /* not matched yet */
+ (i == LNET_CPT_NUMBER - 1 || /* the last CPT */
+ ptl->ptl_mt_nmaps == 0 || /* no active CPT */
+ (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */
+ ptl->ptl_mt_maps[0] == cpt))) {
+ /* nothing to steal, delay or drop */
+ list_del_init(&msg->msg_list);
+
+ if (lnet_ptl_is_lazy(ptl)) {
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list,
+ &ptl->ptl_msg_delayed);
+ rc = LNET_MATCHMD_NONE;
+ } else {
+ rc = LNET_MATCHMD_DROP;
+ }
+ }
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(cpt);
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed)
+ break;
+ }
+
+ return rc;
+}
+
+int
+lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
+{
+ struct lnet_match_table *mtable;
+ struct lnet_portal *ptl;
+ int rc;
+
+ CDEBUG(D_NET, "Request from %s of length %d into portal %d "
+ "MB="LPX64"\n", libcfs_id2str(info->mi_id),
+ info->mi_rlength, info->mi_portal, info->mi_mbits);
+
+ if (info->mi_portal >= the_lnet.ln_nportals) {
+ CERROR("Invalid portal %d not in [0-%d]\n",
+ info->mi_portal, the_lnet.ln_nportals);
+ return LNET_MATCHMD_DROP;
+ }
+
+ ptl = the_lnet.ln_portals[info->mi_portal];
+ rc = lnet_ptl_match_early(ptl, msg);
+ if (rc != 0) /* matched or delayed early message */
+ return rc;
+
+ mtable = lnet_mt_of_match(info, msg);
+ lnet_res_lock(mtable->mt_cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = LNET_MATCHMD_DROP;
+ goto out1;
+ }
+
+ rc = lnet_mt_match_md(mtable, info, msg);
+ if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) {
+ lnet_ptl_lock(ptl);
+ lnet_ptl_disable_mt(ptl, mtable->mt_cpt);
+ lnet_ptl_unlock(ptl);
+ }
+
+ if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */
+ goto out1;
+
+ if (!msg->msg_rx_ready_delay)
+ goto out1;
+
+ LASSERT(lnet_ptl_is_lazy(ptl));
+ LASSERT(!msg->msg_rx_delayed);
+
+ /* NB: we don't expect "delay" can happen a lot */
+ if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) {
+ lnet_ptl_lock(ptl);
+
+ msg->msg_rx_delayed = 1;
+ list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(mtable->mt_cpt);
+
+ } else {
+ lnet_res_unlock(mtable->mt_cpt);
+ rc = lnet_ptl_match_delay(ptl, info, msg);
+ }
+
+ if (msg->msg_rx_delayed) {
+ CDEBUG(D_NET,
+ "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n",
+ info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET",
+ libcfs_id2str(info->mi_id), info->mi_portal,
+ info->mi_mbits, info->mi_roffset, info->mi_rlength);
+ }
+ goto out0;
+ out1:
+ lnet_res_unlock(mtable->mt_cpt);
+ out0:
+ /* EXHAUSTED bit is only meaningful for internal functions */
+ return rc & ~LNET_MATCHMD_EXHAUSTED;
+}
+
+void
+lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md)
+{
+ LASSERT(me->me_md == md && md->md_me == me);
+
+ me->me_md = NULL;
+ md->md_me = NULL;
+}
+
+/* called with lnet_res_lock held */
+void
+lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md,
+ struct list_head *matches, struct list_head *drops)
+{
+ struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal];
+ struct lnet_match_table *mtable;
+ struct list_head *head;
+ lnet_msg_t *tmp;
+ lnet_msg_t *msg;
+ int exhausted = 0;
+ int cpt;
+
+ LASSERT(md->md_refcount == 0); /* a brand new MD */
+
+ me->me_md = md;
+ md->md_me = me;
+
+ cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie);
+ mtable = ptl->ptl_mtables[cpt];
+
+ if (list_empty(&ptl->ptl_msg_stealing) &&
+ list_empty(&ptl->ptl_msg_delayed) &&
+ !lnet_mt_test_exhausted(mtable, me->me_pos))
+ return;
+
+ lnet_ptl_lock(ptl);
+ head = &ptl->ptl_msg_stealing;
+ again:
+ list_for_each_entry_safe(msg, tmp, head, msg_list) {
+ struct lnet_match_info info;
+ lnet_hdr_t *hdr;
+ int rc;
+
+ LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing);
+
+ hdr = &msg->msg_hdr;
+ info.mi_id.nid = hdr->src_nid;
+ info.mi_id.pid = hdr->src_pid;
+ info.mi_opc = LNET_MD_OP_PUT;
+ info.mi_portal = hdr->msg.put.ptl_index;
+ info.mi_rlength = hdr->payload_length;
+ info.mi_roffset = hdr->msg.put.offset;
+ info.mi_mbits = hdr->msg.put.match_bits;
+
+ rc = lnet_try_match_md(md, &info, msg);
+
+ exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0;
+ if ((rc & LNET_MATCHMD_NONE) != 0) {
+ if (exhausted)
+ break;
+ continue;
+ }
+
+ /* Hurrah! This _is_ a match */
+ LASSERT((rc & LNET_MATCHMD_FINISH) != 0);
+ list_del_init(&msg->msg_list);
+
+ if (head == &ptl->ptl_msg_stealing) {
+ if (exhausted)
+ break;
+ /* stealing thread will handle the message */
+ continue;
+ }
+
+ if ((rc & LNET_MATCHMD_OK) != 0) {
+ list_add_tail(&msg->msg_list, matches);
+
+ CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+ "match "LPU64" offset %d length %d.\n",
+ libcfs_id2str(info.mi_id),
+ info.mi_portal, info.mi_mbits,
+ info.mi_roffset, info.mi_rlength);
+ } else {
+ list_add_tail(&msg->msg_list, drops);
+ }
+
+ if (exhausted)
+ break;
+ }
+
+ if (!exhausted && head == &ptl->ptl_msg_stealing) {
+ head = &ptl->ptl_msg_delayed;
+ goto again;
+ }
+
+ if (lnet_ptl_is_wildcard(ptl) && !exhausted) {
+ lnet_mt_set_exhausted(mtable, me->me_pos, 0);
+ if (!mtable->mt_enabled)
+ lnet_ptl_enable_mt(ptl, cpt);
+ }
+
+ lnet_ptl_unlock(ptl);
+}
+
+void
+lnet_ptl_cleanup(struct lnet_portal *ptl)
+{
+ struct lnet_match_table *mtable;
+ int i;
+
+ if (ptl->ptl_mtables == NULL) /* uninitialized portal */
+ return;
+
+ LASSERT(list_empty(&ptl->ptl_msg_delayed));
+ LASSERT(list_empty(&ptl->ptl_msg_stealing));
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ struct list_head *mhash;
+ lnet_me_t *me;
+ int j;
+
+ if (mtable->mt_mhash == NULL) /* uninitialized match-table */
+ continue;
+
+ mhash = mtable->mt_mhash;
+ /* cleanup ME */
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) {
+ while (!list_empty(&mhash[j])) {
+ me = list_entry(mhash[j].next,
+ lnet_me_t, me_list);
+ CERROR("Active ME %p on exit\n", me);
+ list_del(&me->me_list);
+ lnet_me_free(me);
+ }
+ }
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ }
+
+ cfs_percpt_free(ptl->ptl_mtables);
+ ptl->ptl_mtables = NULL;
+}
+
+int
+lnet_ptl_setup(struct lnet_portal *ptl, int index)
+{
+ struct lnet_match_table *mtable;
+ struct list_head *mhash;
+ int i;
+ int j;
+
+ ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(struct lnet_match_table));
+ if (ptl->ptl_mtables == NULL) {
+ CERROR("Failed to create match table for portal %d\n", index);
+ return -ENOMEM;
+ }
+
+ ptl->ptl_index = index;
+ INIT_LIST_HEAD(&ptl->ptl_msg_delayed);
+ INIT_LIST_HEAD(&ptl->ptl_msg_stealing);
+ spin_lock_init(&ptl->ptl_lock);
+ cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) {
+ /* the extra entry is for MEs with ignore bits */
+ LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i,
+ sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1));
+ if (mhash == NULL) {
+ CERROR("Failed to create match hash for portal %d\n",
+ index);
+ goto failed;
+ }
+
+ memset(&mtable->mt_exhausted[0], -1,
+ sizeof(mtable->mt_exhausted[0]) *
+ LNET_MT_EXHAUSTED_BMAP);
+ mtable->mt_mhash = mhash;
+ for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++)
+ INIT_LIST_HEAD(&mhash[j]);
+
+ mtable->mt_portal = index;
+ mtable->mt_cpt = i;
+ }
+
+ return 0;
+ failed:
+ lnet_ptl_cleanup(ptl);
+ return -ENOMEM;
+}
+
+void
+lnet_portals_destroy(void)
+{
+ int i;
+
+ if (the_lnet.ln_portals == NULL)
+ return;
+
+ for (i = 0; i < the_lnet.ln_nportals; i++)
+ lnet_ptl_cleanup(the_lnet.ln_portals[i]);
+
+ cfs_array_free(the_lnet.ln_portals);
+ the_lnet.ln_portals = NULL;
+}
+
+int
+lnet_portals_create(void)
+{
+ int size;
+ int i;
+
+ size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]);
+
+ the_lnet.ln_nportals = MAX_PORTALS;
+ the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size);
+ if (the_lnet.ln_portals == NULL) {
+ CERROR("Failed to allocate portals table\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < the_lnet.ln_nportals; i++) {
+ if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) {
+ lnet_portals_destroy();
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Turn on the lazy portal attribute. Use with caution!
+ *
+ * This portal attribute only affects incoming PUT requests to the portal,
+ * and is off by default. By default, if there's no matching MD for an
+ * incoming PUT request, it is simply dropped. With the lazy attribute on,
+ * such requests are queued indefinitely until either a matching MD is
+ * posted to the portal or the lazy attribute is turned off.
+ *
+ * It would prevent dropped requests, however it should be regarded as the
+ * last line of defense - i.e. users must keep a close watch on active
+ * buffers on a lazy portal and once it becomes too low post more buffers as
+ * soon as possible. This is because delayed requests usually have detrimental
+ * effects on underlying network connections. A few delayed requests often
+ * suffice to bring an underlying connection to a complete halt, due to flow
+ * control mechanisms.
+ *
+ * There's also a DOS attack risk. If users don't post match-all MDs on a
+ * lazy portal, a malicious peer can easily stop a service by sending some
+ * PUT requests with match bits that won't match any MD. A routed server is
+ * especially vulnerable since the connections to its neighbor routers are
+ * shared among all clients.
+ *
+ * \param portal Index of the portal to enable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetSetLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetSetLazyPortal);
+
+/**
+ * Turn off the lazy portal attribute. Delayed requests on the portal,
+ * if any, will be all dropped when this function returns.
+ *
+ * \param portal Index of the portal to disable the lazy attribute on.
+ *
+ * \retval 0 On success.
+ * \retval -EINVAL If \a portal is not a valid index.
+ */
+int
+LNetClearLazyPortal(int portal)
+{
+ struct lnet_portal *ptl;
+ LIST_HEAD (zombies);
+
+ if (portal < 0 || portal >= the_lnet.ln_nportals)
+ return -EINVAL;
+
+ ptl = the_lnet.ln_portals[portal];
+
+ lnet_res_lock(LNET_LOCK_EX);
+ lnet_ptl_lock(ptl);
+
+ if (!lnet_ptl_is_lazy(ptl)) {
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+ return 0;
+ }
+
+ if (the_lnet.ln_shutdown)
+ CWARN("Active lazy portal %d on exit\n", portal);
+ else
+ CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
+
+ /* grab all the blocked messages atomically */
+ list_splice_init(&ptl->ptl_msg_delayed, &zombies);
+
+ lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY);
+
+ lnet_ptl_unlock(ptl);
+ lnet_res_unlock(LNET_LOCK_EX);
+
+ lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr");
+
+ return 0;
+}
+EXPORT_SYMBOL(LNetClearLazyPortal);
diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c
new file mode 100644
index 000000000000..670dae34107c
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/lo.c
@@ -0,0 +1,120 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+ LASSERT (!lntmsg->msg_routing);
+ LASSERT (!lntmsg->msg_target_is_router);
+
+ return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ int delayed, unsigned int niov,
+ struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ lnet_msg_t *sendmsg = private;
+
+ if (lntmsg != NULL) { /* not discarding */
+ if (sendmsg->msg_iov != NULL) {
+ if (iov != NULL)
+ lnet_copy_iov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_iov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_iov,
+ sendmsg->msg_offset, mlen);
+ } else {
+ if (iov != NULL)
+ lnet_copy_kiov2iov(niov, iov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ else
+ lnet_copy_kiov2kiov(niov, kiov, offset,
+ sendmsg->msg_niov,
+ sendmsg->msg_kiov,
+ sendmsg->msg_offset, mlen);
+ }
+
+ lnet_finalize(ni, lntmsg, 0);
+ }
+
+ lnet_finalize(ni, sendmsg, 0);
+ return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+ CDEBUG (D_NET, "shutdown\n");
+ LASSERT (lolnd_instanced);
+
+ lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+ LASSERT (ni->ni_lnd == &the_lolnd);
+ LASSERT (!lolnd_instanced);
+ lolnd_instanced = 1;
+
+ return (0);
+}
+
+lnd_t the_lolnd = {
+ /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+ /* .lnd_refcount = */ 0,
+ /* .lnd_type = */ LOLND,
+ /* .lnd_startup = */ lolnd_startup,
+ /* .lnd_shutdown = */ lolnd_shutdown,
+ /* .lnt_ctl = */ NULL,
+ /* .lnd_send = */ lolnd_send,
+ /* .lnd_recv = */ lolnd_recv,
+ /* .lnd_eager_recv = */ NULL,
+ /* .lnd_notify = */ NULL,
+ /* .lnd_accept = */ NULL
+};
diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c
new file mode 100644
index 000000000000..c8323854580a
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/module.c
@@ -0,0 +1,154 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+ "configure network at module load");
+
+static struct mutex lnet_config_mutex;
+
+int
+lnet_configure (void *arg)
+{
+ /* 'arg' only there so I can be passed to cfs_create_thread() */
+ int rc = 0;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (!the_lnet.ln_niinit_self) {
+ rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+ if (rc >= 0) {
+ the_lnet.ln_niinit_self = 1;
+ rc = 0;
+ }
+ }
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return rc;
+}
+
+int
+lnet_unconfigure (void)
+{
+ int refcount;
+
+ LNET_MUTEX_LOCK(&lnet_config_mutex);
+
+ if (the_lnet.ln_niinit_self) {
+ the_lnet.ln_niinit_self = 0;
+ LNetNIFini();
+ }
+
+ LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex);
+ refcount = the_lnet.ln_refcount;
+ LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex);
+
+ LNET_MUTEX_UNLOCK(&lnet_config_mutex);
+ return (refcount == 0) ? 0 : -EBUSY;
+}
+
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+ int rc;
+
+ switch (cmd) {
+ case IOC_LIBCFS_CONFIGURE:
+ return lnet_configure(NULL);
+
+ case IOC_LIBCFS_UNCONFIGURE:
+ return lnet_unconfigure();
+
+ default:
+ /* Passing LNET_PID_ANY only gives me a ref if the net is up
+ * already; I'll need it to ensure the net can't go down while
+ * I'm called into it */
+ rc = LNetNIInit(LNET_PID_ANY);
+ if (rc >= 0) {
+ rc = LNetCtl(cmd, data);
+ LNetNIFini();
+ }
+ return rc;
+ }
+}
+
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
+
+int
+init_lnet(void)
+{
+ int rc;
+ ENTRY;
+
+ mutex_init(&lnet_config_mutex);
+
+ rc = LNetInit();
+ if (rc != 0) {
+ CERROR("LNetInit: error %d\n", rc);
+ RETURN(rc);
+ }
+
+ rc = libcfs_register_ioctl(&lnet_ioctl_handler);
+ LASSERT (rc == 0);
+
+ if (config_on_load) {
+ /* Have to schedule a separate thread to avoid deadlocking
+ * in modload */
+ (void) kthread_run(lnet_configure, NULL, "lnet_initd");
+ }
+
+ RETURN(0);
+}
+
+void
+fini_lnet(void)
+{
+ int rc;
+
+ rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
+ LASSERT (rc == 0);
+
+ LNetFini();
+}
+
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c
new file mode 100644
index 000000000000..286977691393
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/peer.c
@@ -0,0 +1,337 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/lnet/peer.c
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <linux/lnet/lib-lnet.h>
+
+int
+lnet_peer_tables_create(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(),
+ sizeof(*ptable));
+ if (the_lnet.ln_peer_tables == NULL) {
+ CERROR("Failed to allocate cpu-partition peer tables\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ INIT_LIST_HEAD(&ptable->pt_deathrow);
+
+ LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i,
+ LNET_PEER_HASH_SIZE * sizeof(*hash));
+ if (hash == NULL) {
+ CERROR("Failed to create peer hash table\n");
+ lnet_peer_tables_destroy();
+ return -ENOMEM;
+ }
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ INIT_LIST_HEAD(&hash[j]);
+ ptable->pt_hash = hash; /* sign of initialization */
+ }
+
+ return 0;
+}
+
+void
+lnet_peer_tables_destroy(void)
+{
+ struct lnet_peer_table *ptable;
+ struct list_head *hash;
+ int i;
+ int j;
+
+ if (the_lnet.ln_peer_tables == NULL)
+ return;
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ hash = ptable->pt_hash;
+ if (hash == NULL) /* not intialized */
+ break;
+
+ LASSERT(list_empty(&ptable->pt_deathrow));
+
+ ptable->pt_hash = NULL;
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++)
+ LASSERT(list_empty(&hash[j]));
+
+ LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash));
+ }
+
+ cfs_percpt_free(the_lnet.ln_peer_tables);
+ the_lnet.ln_peer_tables = NULL;
+}
+
+void
+lnet_peer_tables_cleanup(void)
+{
+ struct lnet_peer_table *ptable;
+ int i;
+ int j;
+
+ LASSERT(the_lnet.ln_shutdown); /* i.e. no new peers */
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ lnet_net_lock(i);
+
+ for (j = 0; j < LNET_PEER_HASH_SIZE; j++) {
+ struct list_head *peers = &ptable->pt_hash[j];
+
+ while (!list_empty(peers)) {
+ lnet_peer_t *lp = list_entry(peers->next,
+ lnet_peer_t,
+ lp_hashlist);
+ list_del_init(&lp->lp_hashlist);
+ /* lose hash table's ref */
+ lnet_peer_decref_locked(lp);
+ }
+ }
+
+ lnet_net_unlock(i);
+ }
+
+ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
+ LIST_HEAD (deathrow);
+ lnet_peer_t *lp;
+
+ lnet_net_lock(i);
+
+ for (j = 3; ptable->pt_number != 0; j++) {
+ lnet_net_unlock(i);
+
+ if ((j & (j - 1)) == 0) {
+ CDEBUG(D_WARNING,
+ "Waiting for %d peers on peer table\n",
+ ptable->pt_number);
+ }
+ cfs_pause(cfs_time_seconds(1) / 2);
+ lnet_net_lock(i);
+ }
+ list_splice_init(&ptable->pt_deathrow, &deathrow);
+
+ lnet_net_unlock(i);
+
+ while (!list_empty(&deathrow)) {
+ lp = list_entry(deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ LIBCFS_FREE(lp, sizeof(*lp));
+ }
+ }
+}
+
+void
+lnet_destroy_peer_locked(lnet_peer_t *lp)
+{
+ struct lnet_peer_table *ptable;
+
+ LASSERT(lp->lp_refcount == 0);
+ LASSERT(lp->lp_rtr_refcount == 0);
+ LASSERT(list_empty(&lp->lp_txq));
+ LASSERT(list_empty(&lp->lp_hashlist));
+ LASSERT(lp->lp_txqnob == 0);
+
+ ptable = the_lnet.ln_peer_tables[lp->lp_cpt];
+ LASSERT(ptable->pt_number > 0);
+ ptable->pt_number--;
+
+ lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt);
+ lp->lp_ni = NULL;
+
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+}
+
+lnet_peer_t *
+lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
+{
+ struct list_head *peers;
+ lnet_peer_t *lp;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
+ list_for_each_entry(lp, peers, lp_hashlist) {
+ if (lp->lp_nid == nid) {
+ lnet_peer_addref_locked(lp);
+ return lp;
+ }
+ }
+
+ return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
+{
+ struct lnet_peer_table *ptable;
+ lnet_peer_t *lp = NULL;
+ lnet_peer_t *lp2;
+ int cpt2;
+ int rc = 0;
+
+ *lpp = NULL;
+ if (the_lnet.ln_shutdown) /* it's shutting down */
+ return -ESHUTDOWN;
+
+ /* cpt can be LNET_LOCK_EX if it's called from router functions */
+ cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
+
+ ptable = the_lnet.ln_peer_tables[cpt2];
+ lp = lnet_find_peer_locked(ptable, nid);
+ if (lp != NULL) {
+ *lpp = lp;
+ return 0;
+ }
+
+ if (!list_empty(&ptable->pt_deathrow)) {
+ lp = list_entry(ptable->pt_deathrow.next,
+ lnet_peer_t, lp_hashlist);
+ list_del(&lp->lp_hashlist);
+ }
+
+ /*
+ * take extra refcount in case another thread has shutdown LNet
+ * and destroyed locks and peer-table before I finish the allocation
+ */
+ ptable->pt_number++;
+ lnet_net_unlock(cpt);
+
+ if (lp != NULL)
+ memset(lp, 0, sizeof(*lp));
+ else
+ LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
+
+ if (lp == NULL) {
+ rc = -ENOMEM;
+ lnet_net_lock(cpt);
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&lp->lp_txq);
+ INIT_LIST_HEAD(&lp->lp_rtrq);
+ INIT_LIST_HEAD(&lp->lp_routes);
+
+ lp->lp_notify = 0;
+ lp->lp_notifylnd = 0;
+ lp->lp_notifying = 0;
+ lp->lp_alive_count = 0;
+ lp->lp_timestamp = 0;
+ lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+ lp->lp_last_alive = cfs_time_current(); /* assumes alive */
+ lp->lp_last_query = 0; /* haven't asked NI yet */
+ lp->lp_ping_timestamp = 0;
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ lp->lp_nid = nid;
+ lp->lp_cpt = cpt2;
+ lp->lp_refcount = 2; /* 1 for caller; 1 for hash */
+ lp->lp_rtr_refcount = 0;
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ rc = -ESHUTDOWN;
+ goto out;
+ }
+
+ lp2 = lnet_find_peer_locked(ptable, nid);
+ if (lp2 != NULL) {
+ *lpp = lp2;
+ goto out;
+ }
+
+ lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
+ if (lp->lp_ni == NULL) {
+ rc = -EHOSTUNREACH;
+ goto out;
+ }
+
+ lp->lp_txcredits =
+ lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+ lp->lp_rtrcredits =
+ lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+
+ list_add_tail(&lp->lp_hashlist,
+ &ptable->pt_hash[lnet_nid2peerhash(nid)]);
+ ptable->pt_version++;
+ *lpp = lp;
+
+ return 0;
+out:
+ if (lp != NULL)
+ list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+ ptable->pt_number--;
+ return rc;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+ char *aliveness = "NA";
+ lnet_peer_t *lp;
+ int rc;
+ int cpt;
+
+ cpt = lnet_cpt_of_nid(nid);
+ lnet_net_lock(cpt);
+
+ rc = lnet_nid2peer_locked(&lp, nid, cpt);
+ if (rc != 0) {
+ lnet_net_unlock(cpt);
+ CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+ return;
+ }
+
+ if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
+ aliveness = lp->lp_alive ? "up" : "down";
+
+ CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+ libcfs_nid2str(lp->lp_nid), lp->lp_refcount,
+ aliveness, lp->lp_ni->ni_peertxcredits,
+ lp->lp_rtrcredits, lp->lp_minrtrcredits,
+ lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
new file mode 100644
index 000000000000..a326ce06bc76
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -0,0 +1,1694 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/lnet/lib-lnet.h>
+
+#if defined(LNET_ROUTER)
+
+#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */
+#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4)
+#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */
+#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4)
+#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */
+#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+ "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+ "# of 0 payload messages to buffer in the router");
+static int small_router_buffers;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+ "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+ "# of large messages to buffer in the router");
+static int peer_buffer_credits = 0;
+CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
+ "# router buffer credits per peer");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+ "Automatically mark peers down on comms error");
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ /* NI option overrides LNet default */
+ if (ni->ni_peerrtrcredits > 0)
+ return ni->ni_peerrtrcredits;
+ if (peer_buffer_credits > 0)
+ return peer_buffer_credits;
+
+ /* As an approximation, allow this peer the same number of router
+ * buffers as it is allowed outstanding sends */
+ return ni->ni_peertxcredits;
+}
+
+/* forward ref's */
+static int lnet_router_checker(void *);
+#else
+
+int
+lnet_peer_buffer_credits(lnet_ni_t *ni)
+{
+ return 0;
+}
+
+#endif
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+ "Assume routers are down and ping them before use");
+
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+ "Avoid asymmetrical router failures (0 to disable)");
+
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+ "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+ "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+ "Seconds to wait for the reply to a router health query");
+
+int
+lnet_peers_start_down(void)
+{
+ return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
+{
+ if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
+ CDEBUG(D_NET, "Out of date\n");
+ return;
+ }
+
+ lp->lp_timestamp = when; /* update timestamp */
+ lp->lp_ping_deadline = 0; /* disable ping timeout */
+
+ if (lp->lp_alive_count != 0 && /* got old news */
+ (!lp->lp_alive) == (!alive)) { /* new date for old news */
+ CDEBUG(D_NET, "Old news\n");
+ return;
+ }
+
+ /* Flag that notification is outstanding */
+
+ lp->lp_alive_count++;
+ lp->lp_alive = !(!alive); /* 1 bit! */
+ lp->lp_notify = 1;
+ lp->lp_notifylnd |= notifylnd;
+ if (lp->lp_alive)
+ lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
+
+ CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
+{
+ int alive;
+ int notifylnd;
+
+ /* Notify only in 1 thread at any time to ensure ordered notification.
+ * NB individual events can be missed; the only guarantee is that you
+ * always get the most recent news */
+
+ if (lp->lp_notifying)
+ return;
+
+ lp->lp_notifying = 1;
+
+ while (lp->lp_notify) {
+ alive = lp->lp_alive;
+ notifylnd = lp->lp_notifylnd;
+
+ lp->lp_notifylnd = 0;
+ lp->lp_notify = 0;
+
+ if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
+ lnet_net_unlock(lp->lp_cpt);
+
+ /* A new notification could happen now; I'll handle it
+ * when control returns to me */
+
+ (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+
+ lnet_net_lock(lp->lp_cpt);
+ }
+ }
+
+ lp->lp_notifying = 0;
+}
+
+
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount >= 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount++;
+ if (lp->lp_rtr_refcount == 1) {
+ struct list_head *pos;
+
+ /* a simple insertion sort */
+ list_for_each_prev(pos, &the_lnet.ln_routers) {
+ lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
+ lp_rtr_list);
+
+ if (rtr->lp_nid < lp->lp_nid)
+ break;
+ }
+
+ list_add(&lp->lp_rtr_list, pos);
+ /* addref for the_lnet.ln_routers */
+ lnet_peer_addref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+ LASSERT(lp->lp_refcount > 0);
+ LASSERT(lp->lp_rtr_refcount > 0);
+
+ /* lnet_net_lock must be exclusively locked */
+ lp->lp_rtr_refcount--;
+ if (lp->lp_rtr_refcount == 0) {
+ LASSERT(list_empty(&lp->lp_routes));
+
+ if (lp->lp_rcd != NULL) {
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+
+ list_del(&lp->lp_rtr_list);
+ /* decref for the_lnet.ln_routers */
+ lnet_peer_decref_locked(lp);
+ the_lnet.ln_routers_version++;
+ }
+}
+
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+ lnet_remotenet_t *rnet;
+ struct list_head *tmp;
+ struct list_head *rn_list;
+
+ LASSERT(!the_lnet.ln_shutdown);
+
+ rn_list = lnet_net2rnethash(net);
+ list_for_each(tmp, rn_list) {
+ rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
+
+ if (rnet->lrn_net == net)
+ return rnet;
+ }
+ return NULL;
+}
+
+static void lnet_shuffle_seed(void)
+{
+ static int seeded = 0;
+ int lnd_type, seed[2];
+ struct timeval tv;
+ lnet_ni_t *ni;
+ struct list_head *tmp;
+
+ if (seeded)
+ return;
+
+ cfs_get_random_bytes(seed, sizeof(seed));
+
+ /* Nodes with small feet have little entropy
+ * the NID for this node gives the most entropy in the low bits */
+ list_for_each(tmp, &the_lnet.ln_nis) {
+ ni = list_entry(tmp, lnet_ni_t, ni_list);
+ lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+ if (lnd_type != LOLND)
+ seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
+ }
+
+ do_gettimeofday(&tv);
+ cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+ seeded = 1;
+ return;
+}
+
+/* NB expects LNET_LOCK held */
+void
+lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
+{
+ unsigned int len = 0;
+ unsigned int offset = 0;
+ struct list_head *e;
+
+ lnet_shuffle_seed();
+
+ list_for_each (e, &rnet->lrn_routes) {
+ len++;
+ }
+
+ /* len+1 positions to add a new entry, also prevents division by 0 */
+ offset = cfs_rand() % (len + 1);
+ list_for_each (e, &rnet->lrn_routes) {
+ if (offset == 0)
+ break;
+ offset--;
+ }
+ list_add(&route->lr_list, e);
+ list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
+
+ the_lnet.ln_remote_nets_version++;
+ lnet_rtr_addref_locked(route->lr_gateway);
+}
+
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+ struct list_head *e;
+ lnet_remotenet_t *rnet;
+ lnet_remotenet_t *rnet2;
+ lnet_route_t *route;
+ lnet_ni_t *ni;
+ int add_route;
+ int rc;
+
+ CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+ if (gateway == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+ net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND ||
+ LNET_NIDNET(gateway) == net ||
+ hops < 1 || hops > 255)
+ return (-EINVAL);
+
+ if (lnet_islocalnet(net)) /* it's a local network */
+ return 0; /* ignore the route entry */
+
+ /* Assume net, route, all new */
+ LIBCFS_ALLOC(route, sizeof(*route));
+ LIBCFS_ALLOC(rnet, sizeof(*rnet));
+ if (route == NULL || rnet == NULL) {
+ CERROR("Out of memory creating route %s %d %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+ if (route != NULL)
+ LIBCFS_FREE(route, sizeof(*route));
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&rnet->lrn_routes);
+ rnet->lrn_net = net;
+ route->lr_hops = hops;
+ route->lr_net = net;
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
+ if (rc != 0) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
+ return 0; /* ignore the route entry */
+ } else {
+ CERROR("Error %d creating route %s %d %s\n", rc,
+ libcfs_net2str(net), hops,
+ libcfs_nid2str(gateway));
+ }
+ return rc;
+ }
+
+ LASSERT (!the_lnet.ln_shutdown);
+
+ rnet2 = lnet_find_net_locked(net);
+ if (rnet2 == NULL) {
+ /* new network */
+ list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
+ rnet2 = rnet;
+ }
+
+ /* Search for a duplicate route (it's a NOOP if it is) */
+ add_route = 1;
+ list_for_each (e, &rnet2->lrn_routes) {
+ lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
+
+ if (route2->lr_gateway == route->lr_gateway) {
+ add_route = 0;
+ break;
+ }
+
+ /* our lookups must be true */
+ LASSERT (route2->lr_gateway->lp_nid != gateway);
+ }
+
+ if (add_route) {
+ lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
+ lnet_add_route_to_rnet(rnet2, route);
+
+ ni = route->lr_gateway->lp_ni;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ /* XXX Assume alive */
+ if (ni->ni_lnd->lnd_notify != NULL)
+ (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ /* -1 for notify or !add_route */
+ lnet_peer_decref_locked(route->lr_gateway);
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ if (!add_route)
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != rnet2)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ return 0;
+}
+
+int
+lnet_check_routes(void)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ lnet_route_t *route2;
+ struct list_head *e1;
+ struct list_head *e2;
+ int cpt;
+ struct list_head *rn_list;
+ int i;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ route2 = NULL;
+ list_for_each(e2, &rnet->lrn_routes) {
+ lnet_nid_t nid1;
+ lnet_nid_t nid2;
+ int net;
+
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (route2 == NULL) {
+ route2 = route;
+ continue;
+ }
+
+ if (route->lr_gateway->lp_ni ==
+ route2->lr_gateway->lp_ni)
+ continue;
+
+ nid1 = route->lr_gateway->lp_nid;
+ nid2 = route2->lr_gateway->lp_nid;
+ net = rnet->lrn_net;
+
+ lnet_net_unlock(cpt);
+
+ CERROR("Routes to %s via %s and %s not "
+ "supported\n",
+ libcfs_net2str(net),
+ libcfs_nid2str(nid1),
+ libcfs_nid2str(nid2));
+ return -EINVAL;
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+
+int
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
+{
+ struct lnet_peer *gateway;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ struct list_head *e1;
+ struct list_head *e2;
+ int rc = -ENOENT;
+ struct list_head *rn_list;
+ int idx = 0;
+
+ CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+ libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+ /* NB Caller may specify either all routes via the given gateway
+ * or a specific route entry actual NIDs) */
+
+ lnet_net_lock(LNET_LOCK_EX);
+ if (net == LNET_NIDNET(LNET_NID_ANY))
+ rn_list = &the_lnet.ln_remote_nets_hash[0];
+ else
+ rn_list = lnet_net2rnethash(net);
+
+ again:
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+ net == rnet->lrn_net))
+ continue;
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t, lr_list);
+
+ gateway = route->lr_gateway;
+ if (!(gw_nid == LNET_NID_ANY ||
+ gw_nid == gateway->lp_nid))
+ continue;
+
+ list_del(&route->lr_list);
+ list_del(&route->lr_gwlist);
+ the_lnet.ln_remote_nets_version++;
+
+ if (list_empty(&rnet->lrn_routes))
+ list_del(&rnet->lrn_list);
+ else
+ rnet = NULL;
+
+ lnet_rtr_decref_locked(gateway);
+ lnet_peer_decref_locked(gateway);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ LIBCFS_FREE(route, sizeof(*route));
+
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ rc = 0;
+ lnet_net_lock(LNET_LOCK_EX);
+ goto again;
+ }
+ }
+
+ if (net == LNET_NIDNET(LNET_NID_ANY) &&
+ ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+ rn_list = &the_lnet.ln_remote_nets_hash[idx];
+ goto again;
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return rc;
+}
+
+void
+lnet_destroy_routes (void)
+{
+ lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+int
+lnet_get_route(int idx, __u32 *net, __u32 *hops,
+ lnet_nid_t *gateway, __u32 *alive)
+{
+ struct list_head *e1;
+ struct list_head *e2;
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ int cpt;
+ int i;
+ struct list_head *rn_list;
+
+ cpt = lnet_net_lock_current();
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+ list_for_each(e1, rn_list) {
+ rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+ list_for_each(e2, &rnet->lrn_routes) {
+ route = list_entry(e2, lnet_route_t,
+ lr_list);
+
+ if (idx-- == 0) {
+ *net = rnet->lrn_net;
+ *hops = route->lr_hops;
+ *gateway = route->lr_gateway->lp_nid;
+ *alive = route->lr_gateway->lp_alive;
+ lnet_net_unlock(cpt);
+ return 0;
+ }
+ }
+ }
+ }
+
+ lnet_net_unlock(cpt);
+ return -ENOENT;
+}
+
+void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+ int i;
+ lnet_ni_status_t *stat;
+
+ __swab32s(&info->pi_magic);
+ __swab32s(&info->pi_features);
+ __swab32s(&info->pi_pid);
+ __swab32s(&info->pi_nnis);
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ stat = &info->pi_ni[i];
+ __swab64s(&stat->ns_nid);
+ __swab32s(&stat->ns_status);
+ }
+ return;
+}
+
+/**
+ * parse router-checker pinginfo, record number of down NIs for remote
+ * networks on that router.
+ */
+static void
+lnet_parse_rc_info(lnet_rc_data_t *rcd)
+{
+ lnet_ping_info_t *info = rcd->rcd_pinginfo;
+ struct lnet_peer *gw = rcd->rcd_gateway;
+ lnet_route_t *rtr;
+
+ if (!gw->lp_alive)
+ return;
+
+ if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
+ lnet_swap_pinginfo(info);
+
+ /* NB always racing with network! */
+ if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+ CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
+ libcfs_nid2str(gw->lp_nid), info->pi_magic);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ gw->lp_ping_feats = info->pi_features;
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+ CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+ return; /* nothing I can understand */
+ }
+
+ if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+ return; /* can't carry NI status info */
+
+ list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
+ int ptl_status = LNET_NI_STATUS_INVALID;
+ int down = 0;
+ int up = 0;
+ int i;
+
+ for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+ lnet_ni_status_t *stat = &info->pi_ni[i];
+ lnet_nid_t nid = stat->ns_nid;
+
+ if (nid == LNET_NID_ANY) {
+ CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
+ libcfs_nid2str(gw->lp_nid));
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+ continue;
+
+ if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+ if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
+ down++;
+ else if (ptl_status != LNET_NI_STATUS_UP)
+ ptl_status = LNET_NI_STATUS_DOWN;
+ continue;
+ }
+
+ if (stat->ns_status == LNET_NI_STATUS_UP) {
+ if (LNET_NIDNET(nid) == rtr->lr_net) {
+ up = 1;
+ break;
+ }
+ /* ptl NIs are considered down only when
+ * they're all down */
+ if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+ ptl_status = LNET_NI_STATUS_UP;
+ continue;
+ }
+
+ CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
+ libcfs_nid2str(gw->lp_nid), stat->ns_status);
+ gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
+ return;
+ }
+
+ if (up) { /* ignore downed NIs if NI for dest network is up */
+ rtr->lr_downis = 0;
+ continue;
+ }
+ rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
+ }
+}
+
+static void
+lnet_router_checker_event(lnet_event_t *event)
+{
+ lnet_rc_data_t *rcd = event->md.user_ptr;
+ struct lnet_peer *lp;
+
+ LASSERT(rcd != NULL);
+
+ if (event->unlinked) {
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ return;
+ }
+
+ LASSERT(event->type == LNET_EVENT_SEND ||
+ event->type == LNET_EVENT_REPLY);
+
+ lp = rcd->rcd_gateway;
+ LASSERT(lp != NULL);
+
+ /* NB: it's called with holding lnet_res_lock, we have a few
+ * places need to hold both locks at the same time, please take
+ * care of lock ordering */
+ lnet_net_lock(lp->lp_cpt);
+ if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
+ /* ignore if no longer a router or rcd is replaced */
+ goto out;
+ }
+
+ if (event->type == LNET_EVENT_SEND) {
+ lp->lp_ping_notsent = 0;
+ if (event->status == 0)
+ goto out;
+ }
+
+ /* LNET_EVENT_REPLY */
+ /* A successful REPLY means the router is up. If _any_ comms
+ * to the router fail I assume it's down (this will happen if
+ * we ping alive routers to try to detect router death before
+ * apps get burned). */
+
+ lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
+ /* The router checker will wake up very shortly and do the
+ * actual notification.
+ * XXX If 'lp' stops being a router before then, it will still
+ * have the notification pending!!! */
+
+ if (avoid_asym_router_failure && event->status == 0)
+ lnet_parse_rc_info(rcd);
+
+ out:
+ lnet_net_unlock(lp->lp_cpt);
+}
+
+void
+lnet_wait_known_routerstate(void)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+ int all_known;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ for (;;) {
+ int cpt = lnet_net_lock_current();
+
+ all_known = 1;
+ list_for_each (entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ if (rtr->lp_alive_count == 0) {
+ all_known = 0;
+ break;
+ }
+ }
+
+ lnet_net_unlock(cpt);
+
+ if (all_known)
+ return;
+
+ cfs_pause(cfs_time_seconds(1));
+ }
+}
+
+void
+lnet_update_ni_status_locked(void)
+{
+ lnet_ni_t *ni;
+ long now;
+ int timeout;
+
+ LASSERT(the_lnet.ln_routing);
+
+ timeout = router_ping_timeout +
+ MAX(live_router_check_interval, dead_router_check_interval);
+
+ now = cfs_time_current_sec();
+ list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
+ if (ni->ni_lnd->lnd_type == LOLND)
+ continue;
+
+ if (now < ni->ni_last_alive + timeout)
+ continue;
+
+ lnet_ni_lock(ni);
+ /* re-check with lock */
+ if (now < ni->ni_last_alive + timeout) {
+ lnet_ni_unlock(ni);
+ continue;
+ }
+
+ LASSERT(ni->ni_status != NULL);
+
+ if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
+ CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
+ libcfs_nid2str(ni->ni_nid), timeout);
+ /* NB: so far, this is the only place to set
+ * NI status to "down" */
+ ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
+ }
+ lnet_ni_unlock(ni);
+ }
+}
+
+void
+lnet_destroy_rc_data(lnet_rc_data_t *rcd)
+{
+ LASSERT(list_empty(&rcd->rcd_list));
+ /* detached from network */
+ LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
+
+ if (rcd->rcd_gateway != NULL) {
+ int cpt = rcd->rcd_gateway->lp_cpt;
+
+ lnet_net_lock(cpt);
+ lnet_peer_decref_locked(rcd->rcd_gateway);
+ lnet_net_unlock(cpt);
+ }
+
+ if (rcd->rcd_pinginfo != NULL)
+ LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+
+ LIBCFS_FREE(rcd, sizeof(*rcd));
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data_locked(lnet_peer_t *gateway)
+{
+ lnet_rc_data_t *rcd = NULL;
+ lnet_ping_info_t *pi;
+ int rc;
+ int i;
+
+ lnet_net_unlock(gateway->lp_cpt);
+
+ LIBCFS_ALLOC(rcd, sizeof(*rcd));
+ if (rcd == NULL)
+ goto out;
+
+ LNetInvalidateHandle(&rcd->rcd_mdh);
+ INIT_LIST_HEAD(&rcd->rcd_list);
+
+ LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+ if (pi == NULL)
+ goto out;
+
+ memset(pi, 0, LNET_PINGINFO_SIZE);
+ for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+ pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+ pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+ }
+ rcd->rcd_pinginfo = pi;
+
+ LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+ rc = LNetMDBind((lnet_md_t){.start = pi,
+ .user_ptr = rcd,
+ .length = LNET_PINGINFO_SIZE,
+ .threshold = LNET_MD_THRESH_INF,
+ .options = LNET_MD_TRUNCATE,
+ .eq_handle = the_lnet.ln_rc_eqh},
+ LNET_UNLINK,
+ &rcd->rcd_mdh);
+ if (rc < 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ goto out;
+ }
+ LASSERT(rc == 0);
+
+ lnet_net_lock(gateway->lp_cpt);
+ /* router table changed or someone has created rcd for this gateway */
+ if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
+ lnet_net_unlock(gateway->lp_cpt);
+ goto out;
+ }
+
+ lnet_peer_addref_locked(gateway);
+ rcd->rcd_gateway = gateway;
+ gateway->lp_rcd = rcd;
+ gateway->lp_ping_notsent = 0;
+
+ return rcd;
+
+ out:
+ if (rcd != NULL) {
+ if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
+ rc = LNetMDUnlink(rcd->rcd_mdh);
+ LASSERT(rc == 0);
+ }
+ lnet_destroy_rc_data(rcd);
+ }
+
+ lnet_net_lock(gateway->lp_cpt);
+ return gateway->lp_rcd;
+}
+
+static int
+lnet_router_check_interval (lnet_peer_t *rtr)
+{
+ int secs;
+
+ secs = rtr->lp_alive ? live_router_check_interval :
+ dead_router_check_interval;
+ if (secs < 0)
+ secs = 0;
+
+ return secs;
+}
+
+static void
+lnet_ping_router_locked (lnet_peer_t *rtr)
+{
+ lnet_rc_data_t *rcd = NULL;
+ cfs_time_t now = cfs_time_current();
+ int secs;
+
+ lnet_peer_addref_locked(rtr);
+
+ if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+ cfs_time_after(now, rtr->lp_ping_deadline))
+ lnet_notify_locked(rtr, 1, 0, now);
+
+ /* Run any outstanding notifications */
+ lnet_ni_notify_locked(rtr->lp_ni, rtr);
+
+ if (!lnet_isrouter(rtr) ||
+ the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router table changed or router checker is shutting down */
+ lnet_peer_decref_locked(rtr);
+ return;
+ }
+
+ rcd = rtr->lp_rcd != NULL ?
+ rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
+
+ if (rcd == NULL)
+ return;
+
+ secs = lnet_router_check_interval(rtr);
+
+ CDEBUG(D_NET,
+ "rtr %s %d: deadline %lu ping_notsent %d alive %d "
+ "alive_count %d lp_ping_timestamp %lu\n",
+ libcfs_nid2str(rtr->lp_nid), secs,
+ rtr->lp_ping_deadline, rtr->lp_ping_notsent,
+ rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
+
+ if (secs != 0 && !rtr->lp_ping_notsent &&
+ cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+ cfs_time_seconds(secs)))) {
+ int rc;
+ lnet_process_id_t id;
+ lnet_handle_md_t mdh;
+
+ id.nid = rtr->lp_nid;
+ id.pid = LUSTRE_SRV_LNET_PID;
+ CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
+
+ rtr->lp_ping_notsent = 1;
+ rtr->lp_ping_timestamp = now;
+
+ mdh = rcd->rcd_mdh;
+
+ if (rtr->lp_ping_deadline == 0) {
+ rtr->lp_ping_deadline =
+ cfs_time_shift(router_ping_timeout);
+ }
+
+ lnet_net_unlock(rtr->lp_cpt);
+
+ rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+
+ lnet_net_lock(rtr->lp_cpt);
+ if (rc != 0)
+ rtr->lp_ping_notsent = 0; /* no event pending */
+ }
+
+ lnet_peer_decref_locked(rtr);
+ return;
+}
+
+int
+lnet_router_checker_start(void)
+{
+ int rc;
+ int eqsz;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ if (check_routers_before_use &&
+ dead_router_check_interval <= 0) {
+ LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be"
+ " set if 'check_routers_before_use' is set"
+ "\n");
+ return -EINVAL;
+ }
+
+ if (!the_lnet.ln_routing &&
+ live_router_check_interval <= 0 &&
+ dead_router_check_interval <= 0)
+ return 0;
+
+ sema_init(&the_lnet.ln_rc_signal, 0);
+ /* EQ size doesn't matter; the callback is guaranteed to get every
+ * event */
+ eqsz = 0;
+ rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
+ &the_lnet.ln_rc_eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
+ return -ENOMEM;
+ }
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+ rc = PTR_ERR(kthread_run(lnet_router_checker,
+ NULL, "router_checker"));
+ if (IS_ERR_VALUE(rc)) {
+ CERROR("Can't start router checker thread: %d\n", rc);
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT(rc == 0);
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ return -ENOMEM;
+ }
+
+ if (check_routers_before_use) {
+ /* Note that a helpful side-effect of pinging all known routers
+ * at startup is that it makes them drop stale connections they
+ * may have to a previous instance of me. */
+ lnet_wait_known_routerstate();
+ }
+
+ return 0;
+}
+
+void
+lnet_router_checker_stop (void)
+{
+ int rc;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+ return;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+ the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
+
+ /* block until event callback signals exit */
+ down(&the_lnet.ln_rc_signal);
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ rc = LNetEQFree(the_lnet.ln_rc_eqh);
+ LASSERT (rc == 0);
+ return;
+}
+
+static void
+lnet_prune_rc_data(int wait_unlink)
+{
+ lnet_rc_data_t *rcd;
+ lnet_rc_data_t *tmp;
+ lnet_peer_t *lp;
+ struct list_head head;
+ int i = 2;
+
+ if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
+ list_empty(&the_lnet.ln_rcd_deathrow) &&
+ list_empty(&the_lnet.ln_rcd_zombie)))
+ return;
+
+ INIT_LIST_HEAD(&head);
+
+ lnet_net_lock(LNET_LOCK_EX);
+
+ if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
+ /* router checker is stopping, prune all */
+ list_for_each_entry(lp, &the_lnet.ln_routers,
+ lp_rtr_list) {
+ if (lp->lp_rcd == NULL)
+ continue;
+
+ LASSERT(list_empty(&lp->lp_rcd->rcd_list));
+ list_add(&lp->lp_rcd->rcd_list,
+ &the_lnet.ln_rcd_deathrow);
+ lp->lp_rcd = NULL;
+ }
+ }
+
+ /* unlink all RCDs on deathrow list */
+ list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
+
+ if (!list_empty(&head)) {
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ list_for_each_entry(rcd, &head, rcd_list)
+ LNetMDUnlink(rcd->rcd_mdh);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ list_splice_init(&head, &the_lnet.ln_rcd_zombie);
+
+ /* release all zombie RCDs */
+ while (!list_empty(&the_lnet.ln_rcd_zombie)) {
+ list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
+ rcd_list) {
+ if (LNetHandleIsInvalid(rcd->rcd_mdh))
+ list_move(&rcd->rcd_list, &head);
+ }
+
+ wait_unlink = wait_unlink &&
+ !list_empty(&the_lnet.ln_rcd_zombie);
+
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ while (!list_empty(&head)) {
+ rcd = list_entry(head.next,
+ lnet_rc_data_t, rcd_list);
+ list_del_init(&rcd->rcd_list);
+ lnet_destroy_rc_data(rcd);
+ }
+
+ if (!wait_unlink)
+ return;
+
+ i++;
+ CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+ "Waiting for rc buffers to unlink\n");
+ cfs_pause(cfs_time_seconds(1) / 4);
+
+ lnet_net_lock(LNET_LOCK_EX);
+ }
+
+ lnet_net_unlock(LNET_LOCK_EX);
+}
+
+
+#if defined(LNET_ROUTER)
+
+static int
+lnet_router_checker(void *arg)
+{
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+
+ cfs_block_allsigs();
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+ __u64 version;
+ int cpt;
+ int cpt2;
+
+ cpt = lnet_net_lock_current();
+rescan:
+ version = the_lnet.ln_routers_version;
+
+ list_for_each(entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
+ if (cpt != cpt2) {
+ lnet_net_unlock(cpt);
+ cpt = cpt2;
+ lnet_net_lock(cpt);
+ /* the routers list has changed */
+ if (version != the_lnet.ln_routers_version)
+ goto rescan;
+ }
+
+ lnet_ping_router_locked(rtr);
+
+ /* NB dropped lock */
+ if (version != the_lnet.ln_routers_version) {
+ /* the routers list has changed */
+ goto rescan;
+ }
+ }
+
+ if (the_lnet.ln_routing)
+ lnet_update_ni_status_locked();
+
+ lnet_net_unlock(cpt);
+
+ lnet_prune_rc_data(0); /* don't wait for UNLINK */
+
+ /* Call cfs_pause() here always adds 1 to load average
+ * because kernel counts # active tasks as nr_running
+ * + nr_uninterruptible. */
+ schedule_timeout_and_set_state(TASK_INTERRUPTIBLE,
+ cfs_time_seconds(1));
+ }
+
+ LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
+
+ lnet_prune_rc_data(1); /* wait for UNLINK */
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ up(&the_lnet.ln_rc_signal);
+ /* The unlink event callback will signal final completion */
+ return 0;
+}
+
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+
+ while (--npages >= 0)
+ __free_page(rb->rb_kiov[npages].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+}
+
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
+{
+ int npages = rbp->rbp_npages;
+ int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+ struct page *page;
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
+ if (rb == NULL)
+ return NULL;
+
+ rb->rb_pool = rbp;
+
+ for (i = 0; i < npages; i++) {
+ page = alloc_pages_node(
+ cfs_cpt_spread_node(lnet_cpt_table(), cpt),
+ __GFP_ZERO | GFP_IOFS, 0);
+ if (page == NULL) {
+ while (--i >= 0)
+ __free_page(rb->rb_kiov[i].kiov_page);
+
+ LIBCFS_FREE(rb, sz);
+ return NULL;
+ }
+
+ rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+ rb->rb_kiov[i].kiov_offset = 0;
+ rb->rb_kiov[i].kiov_page = page;
+ }
+
+ return rb;
+}
+
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+ int npages = rbp->rbp_npages;
+ int nbuffers = 0;
+ lnet_rtrbuf_t *rb;
+
+ if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
+ return;
+
+ LASSERT (list_empty(&rbp->rbp_msgs));
+ LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+ while (!list_empty(&rbp->rbp_bufs)) {
+ LASSERT (rbp->rbp_credits > 0);
+
+ rb = list_entry(rbp->rbp_bufs.next,
+ lnet_rtrbuf_t, rb_list);
+ list_del(&rb->rb_list);
+ lnet_destroy_rtrbuf(rb, npages);
+ nbuffers++;
+ }
+
+ LASSERT (rbp->rbp_nbuffers == nbuffers);
+ LASSERT (rbp->rbp_credits == nbuffers);
+
+ rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
+{
+ lnet_rtrbuf_t *rb;
+ int i;
+
+ if (rbp->rbp_nbuffers != 0) {
+ LASSERT (rbp->rbp_nbuffers == nbufs);
+ return 0;
+ }
+
+ for (i = 0; i < nbufs; i++) {
+ rb = lnet_new_rtrbuf(rbp, cpt);
+
+ if (rb == NULL) {
+ CERROR("Failed to allocate %d router bufs of %d pages\n",
+ nbufs, rbp->rbp_npages);
+ return -ENOMEM;
+ }
+
+ rbp->rbp_nbuffers++;
+ rbp->rbp_credits++;
+ rbp->rbp_mincredits++;
+ list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+ /* No allocation "under fire" */
+ /* Otherwise we'd need code to schedule blocked msgs etc */
+ LASSERT (!the_lnet.ln_routing);
+ }
+
+ LASSERT (rbp->rbp_credits == nbufs);
+ return 0;
+}
+
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+ INIT_LIST_HEAD(&rbp->rbp_msgs);
+ INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+ rbp->rbp_npages = npages;
+ rbp->rbp_credits = 0;
+ rbp->rbp_mincredits = 0;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int i;
+
+ if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
+ return;
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_free_bufs(&rtrp[0]);
+ lnet_rtrpool_free_bufs(&rtrp[1]);
+ lnet_rtrpool_free_bufs(&rtrp[2]);
+ }
+
+ cfs_percpt_free(the_lnet.ln_rtrpools);
+ the_lnet.ln_rtrpools = NULL;
+}
+
+static int
+lnet_nrb_tiny_calculate(int npages)
+{
+ int nrbs = LNET_NRB_TINY;
+
+ if (tiny_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "tiny_router_buffers=%d invalid when "
+ "routing enabled\n", tiny_router_buffers);
+ return -1;
+ }
+
+ if (tiny_router_buffers > 0)
+ nrbs = tiny_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_TINY_MIN);
+}
+
+static int
+lnet_nrb_small_calculate(int npages)
+{
+ int nrbs = LNET_NRB_SMALL;
+
+ if (small_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "small_router_buffers=%d invalid when "
+ "routing enabled\n", small_router_buffers);
+ return -1;
+ }
+
+ if (small_router_buffers > 0)
+ nrbs = small_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_SMALL_MIN);
+}
+
+static int
+lnet_nrb_large_calculate(int npages)
+{
+ int nrbs = LNET_NRB_LARGE;
+
+ if (large_router_buffers < 0) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "large_router_buffers=%d invalid when "
+ "routing enabled\n", large_router_buffers);
+ return -1;
+ }
+
+ if (large_router_buffers > 0)
+ nrbs = large_router_buffers;
+
+ nrbs /= LNET_CPT_NUMBER;
+ return max(nrbs, LNET_NRB_LARGE_MIN);
+}
+
+int
+lnet_rtrpools_alloc(int im_a_router)
+{
+ lnet_rtrbufpool_t *rtrp;
+ int large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int small_pages = 1;
+ int nrb_tiny;
+ int nrb_small;
+ int nrb_large;
+ int rc;
+ int i;
+
+ if (!strcmp(forwarding, "")) {
+ /* not set either way */
+ if (!im_a_router)
+ return 0;
+ } else if (!strcmp(forwarding, "disabled")) {
+ /* explicitly disabled */
+ return 0;
+ } else if (!strcmp(forwarding, "enabled")) {
+ /* explicitly enabled */
+ } else {
+ LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either "
+ "'enabled' or 'disabled'\n");
+ return -EINVAL;
+ }
+
+ nrb_tiny = lnet_nrb_tiny_calculate(0);
+ if (nrb_tiny < 0)
+ return -EINVAL;
+
+ nrb_small = lnet_nrb_small_calculate(small_pages);
+ if (nrb_small < 0)
+ return -EINVAL;
+
+ nrb_large = lnet_nrb_large_calculate(large_pages);
+ if (nrb_large < 0)
+ return -EINVAL;
+
+ the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
+ LNET_NRBPOOLS *
+ sizeof(lnet_rtrbufpool_t));
+ if (the_lnet.ln_rtrpools == NULL) {
+ LCONSOLE_ERROR_MSG(0x10c,
+ "Failed to initialize router buffe pool\n");
+ return -ENOMEM;
+ }
+
+ cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
+ lnet_rtrpool_init(&rtrp[0], 0);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[1], small_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
+ if (rc != 0)
+ goto failed;
+
+ lnet_rtrpool_init(&rtrp[2], large_pages);
+ rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
+ if (rc != 0)
+ goto failed;
+ }
+
+ lnet_net_lock(LNET_LOCK_EX);
+ the_lnet.ln_routing = 1;
+ lnet_net_unlock(LNET_LOCK_EX);
+
+ return 0;
+
+ failed:
+ lnet_rtrpools_free();
+ return rc;
+}
+
+int
+lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+ struct lnet_peer *lp = NULL;
+ cfs_time_t now = cfs_time_current();
+ int cpt = lnet_cpt_of_nid(nid);
+
+ LASSERT (!in_interrupt ());
+
+ CDEBUG (D_NET, "%s notifying %s: %s\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid),
+ alive ? "up" : "down");
+
+ if (ni != NULL &&
+ LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+ CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+ libcfs_nid2str(nid), alive ? "birth" : "death",
+ libcfs_nid2str(ni->ni_nid));
+ return -EINVAL;
+ }
+
+ /* can't do predictions... */
+ if (cfs_time_after(when, now)) {
+ CWARN ("Ignoring prediction from %s of %s %s "
+ "%ld seconds in the future\n",
+ (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+ libcfs_nid2str(nid), alive ? "up" : "down",
+ cfs_duration_sec(cfs_time_sub(when, now)));
+ return -EINVAL;
+ }
+
+ if (ni != NULL && !alive && /* LND telling me she's down */
+ !auto_down) { /* auto-down disabled */
+ CDEBUG(D_NET, "Auto-down disabled\n");
+ return 0;
+ }
+
+ lnet_net_lock(cpt);
+
+ if (the_lnet.ln_shutdown) {
+ lnet_net_unlock(cpt);
+ return -ESHUTDOWN;
+ }
+
+ lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
+ if (lp == NULL) {
+ /* nid not found */
+ lnet_net_unlock(cpt);
+ CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+ return 0;
+ }
+
+ /* We can't fully trust LND on reporting exact peer last_alive
+ * if he notifies us about dead peer. For example ksocklnd can
+ * call us with when == _time_when_the_node_was_booted_ if
+ * no connections were successfully established */
+ if (ni != NULL && !alive && when < lp->lp_last_alive)
+ when = lp->lp_last_alive;
+
+ lnet_notify_locked(lp, ni == NULL, alive, when);
+
+ lnet_ni_notify_locked(ni, lp);
+
+ lnet_peer_decref_locked(lp);
+
+ lnet_net_unlock(cpt);
+ return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+void
+lnet_get_tunables (void)
+{
+ return;
+}
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
+{
+ return -EOPNOTSUPP;
+}
+
+void
+lnet_router_checker (void)
+{
+ static time_t last = 0;
+ static int running = 0;
+
+ time_t now = cfs_time_current_sec();
+ int interval = now - last;
+ int rc;
+ __u64 version;
+ lnet_peer_t *rtr;
+
+ /* It's no use to call me again within a sec - all intervals and
+ * timeouts are measured in seconds */
+ if (last != 0 && interval < 2)
+ return;
+
+ if (last != 0 &&
+ interval > MAX(live_router_check_interval,
+ dead_router_check_interval))
+ CNETERR("Checker(%d/%d) not called for %d seconds\n",
+ live_router_check_interval, dead_router_check_interval,
+ interval);
+
+ LASSERT(LNET_CPT_NUMBER == 1);
+
+ lnet_net_lock(0);
+ LASSERT(!running); /* recursion check */
+ running = 1;
+ lnet_net_unlock(0);
+
+ last = now;
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
+ lnet_prune_rc_data(0); /* unlink all rcd and nowait */
+
+ /* consume all pending events */
+ while (1) {
+ int i;
+ lnet_event_t ev;
+
+ /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
+ * recursion breaker in LNetEQPoll would fail */
+ rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
+ if (rc == 0) /* no event pending */
+ break;
+
+ /* NB a lost SENT prevents me from pinging a router again */
+ if (rc == -EOVERFLOW) {
+ CERROR("Dropped an event!!!\n");
+ abort();
+ }
+
+ LASSERT (rc == 1);
+
+ lnet_router_checker_event(&ev);
+ }
+
+ if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
+ lnet_prune_rc_data(1); /* release rcd */
+ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+ running = 0;
+ return;
+ }
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+ lnet_net_lock(0);
+
+ version = the_lnet.ln_routers_version;
+ list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) {
+ lnet_ping_router_locked(rtr);
+ LASSERT (version == the_lnet.ln_routers_version);
+ }
+
+ lnet_net_unlock(0);
+
+ running = 0; /* lock only needed for the recursion check */
+ return;
+}
+
+/* NB lnet_peers_start_down depends on me,
+ * so must be called before any peer creation */
+void
+lnet_get_tunables (void)
+{
+ char *s;
+
+ s = getenv("LNET_ROUTER_PING_TIMEOUT");
+ if (s != NULL) router_ping_timeout = atoi(s);
+
+ s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
+ if (s != NULL) live_router_check_interval = atoi(s);
+
+ s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
+ if (s != NULL) dead_router_check_interval = atoi(s);
+
+ /* This replaces old lnd_notify mechanism */
+ check_routers_before_use = 1;
+ if (dead_router_check_interval <= 0)
+ dead_router_check_interval = 30;
+}
+
+void
+lnet_rtrpools_free(void)
+{
+}
+
+int
+lnet_rtrpools_alloc(int im_a_arouter)
+{
+ return 0;
+}
+
+#endif
diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c
new file mode 100644
index 000000000000..3084b0c75983
--- /dev/null
+++ b/drivers/staging/lustre/lnet/lnet/router_proc.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ *
+ * This file is part of Portals
+ * http://sourceforge.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <linux/libcfs/libcfs.h>
+#include <linux/lnet/lib-lnet.h>
+
+#if defined(LNET_ROUTER)
+
+/* This is really lnet_proc.c. You might need to update sanity test 215
+ * if any file format is changed. */
+
+static ctl_table_header_t *lnet_table_header = NULL;
+
+#define CTL_LNET (0x100)
+enum {
+ PSDEV_LNET_STATS = 100,
+ PSDEV_LNET_ROUTES,
+ PSDEV_LNET_ROUTERS,
+ PSDEV_LNET_PEERS,
+ PSDEV_LNET_BUFFERS,
+ PSDEV_LNET_NIS,
+ PSDEV_LNET_PTL_ROTOR,
+};
+
+#define LNET_LOFFT_BITS (sizeof(loff_t) * 8)
+/*
+ * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system
+ */
+#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1)
+/* change version, 16 bits or 8 bits */
+#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8)
+
+#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS
+/*
+ * bits for peer hash offset
+ * NB: we don't use the highest bit of *ppos because it's signed
+ */
+#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \
+ LNET_PROC_CPT_BITS - \
+ LNET_PROC_VER_BITS - \
+ LNET_PROC_HASH_BITS - 1)
+/* bits for hash index + position */
+#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS)
+/* bits for peer hash table + hash version */
+#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS)
+
+#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1)
+#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1)
+#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1)
+#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1)
+
+#define LNET_PROC_CPT_GET(pos) \
+ (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK)
+
+#define LNET_PROC_VER_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK)
+
+#define LNET_PROC_HASH_GET(pos) \
+ (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK)
+
+#define LNET_PROC_HOFF_GET(pos) \
+ (int)((pos) & LNET_PROC_HOFF_MASK)
+
+#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \
+ (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \
+ ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \
+ ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \
+ ((off) & LNET_PROC_HOFF_MASK))
+
+#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK))
+
+static int __proc_lnet_stats(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ int rc;
+ lnet_counters_t *ctrs;
+ int len;
+ char *tmpstr;
+ const int tmpsiz = 256; /* 7 %u and 4 LPU64 */
+
+ if (write) {
+ lnet_counters_reset();
+ return 0;
+ }
+
+ /* read */
+
+ LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
+ if (ctrs == NULL)
+ return -ENOMEM;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL) {
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return -ENOMEM;
+ }
+
+ lnet_counters_get(ctrs);
+
+ len = snprintf(tmpstr, tmpsiz,
+ "%u %u %u %u %u %u %u "LPU64" "LPU64" "
+ LPU64" "LPU64,
+ ctrs->msgs_alloc, ctrs->msgs_max,
+ ctrs->errors,
+ ctrs->send_count, ctrs->recv_count,
+ ctrs->route_count, ctrs->drop_count,
+ ctrs->send_length, ctrs->recv_length,
+ ctrs->route_length, ctrs->drop_length);
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, "\n");
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ LIBCFS_FREE(ctrs, sizeof(*ctrs));
+ return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_stats);
+
+int LL_PROC_PROTO(proc_lnet_routes)
+{
+ const int tmpsiz = 256;
+ char *tmpstr;
+ char *s;
+ int rc = 0;
+ int len;
+ int ver;
+ int off;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ CLASSERT(sizeof(loff_t) >= 4);
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n",
+ the_lnet.ln_routing ? "enabled" : "disabled");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n",
+ "net", "hops", "state", "router");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_remote_nets_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *n;
+ struct list_head *r;
+ lnet_route_t *route = NULL;
+ lnet_remotenet_t *rnet = NULL;
+ int skip = off - 1;
+ struct list_head *rn_list;
+ int i;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) {
+ lnet_net_unlock(0);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL;
+ i++) {
+ rn_list = &the_lnet.ln_remote_nets_hash[i];
+
+ n = rn_list->next;
+
+ while (n != rn_list && route == NULL) {
+ rnet = list_entry(n, lnet_remotenet_t,
+ lrn_list);
+
+ r = rnet->lrn_routes.next;
+
+ while (r != &rnet->lrn_routes) {
+ lnet_route_t *re =
+ list_entry(r, lnet_route_t,
+ lr_list);
+ if (skip == 0) {
+ route = re;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ n = n->next;
+ }
+ }
+
+ if (route != NULL) {
+ __u32 net = rnet->lrn_net;
+ unsigned int hops = route->lr_hops;
+ lnet_nid_t nid = route->lr_gateway->lp_nid;
+ int alive = route->lr_gateway->lp_alive;
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-8s %4u %7s %s\n",
+ libcfs_net2str(net), hops,
+ alive ? "up" : "down",
+ libcfs_nid2str(nid));
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_routers)
+{
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ const int tmpsiz = 256;
+ int len;
+ int ver;
+ int off;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ off = LNET_PROC_HOFF_GET(*ppos);
+ ver = LNET_PROC_VER_GET(*ppos);
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n",
+ "ref", "rtr_ref", "alive_cnt", "state",
+ "last_ping", "ping_sent", "deadline",
+ "down_ni", "router");
+ LASSERT(tmpstr + tmpsiz - s > 0);
+
+ lnet_net_lock(0);
+ ver = (unsigned int)the_lnet.ln_routers_version;
+ lnet_net_unlock(0);
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ } else {
+ struct list_head *r;
+ struct lnet_peer *peer = NULL;
+ int skip = off - 1;
+
+ lnet_net_lock(0);
+
+ if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) {
+ lnet_net_unlock(0);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ r = the_lnet.ln_routers.next;
+
+ while (r != &the_lnet.ln_routers) {
+ lnet_peer_t *lp = list_entry(r, lnet_peer_t,
+ lp_rtr_list);
+
+ if (skip == 0) {
+ peer = lp;
+ break;
+ }
+
+ skip--;
+ r = r->next;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ cfs_time_t now = cfs_time_current();
+ cfs_time_t deadline = peer->lp_ping_deadline;
+ int nrefs = peer->lp_refcount;
+ int nrtrrefs = peer->lp_rtr_refcount;
+ int alive_cnt = peer->lp_alive_count;
+ int alive = peer->lp_alive;
+ int pingsent = !peer->lp_ping_notsent;
+ int last_ping = cfs_duration_sec(cfs_time_sub(now,
+ peer->lp_ping_timestamp));
+ int down_ni = 0;
+ lnet_route_t *rtr;
+
+ if ((peer->lp_ping_feats &
+ LNET_PING_FEAT_NI_STATUS) != 0) {
+ list_for_each_entry(rtr, &peer->lp_routes,
+ lr_gwlist) {
+ /* downis on any route should be the
+ * number of downis on the gateway */
+ if (rtr->lr_downis != 0) {
+ down_ni = rtr->lr_downis;
+ break;
+ }
+ }
+ }
+
+ if (deadline == 0)
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent, "NA", down_ni,
+ libcfs_nid2str(nid));
+ else
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n",
+ nrefs, nrtrrefs, alive_cnt,
+ alive ? "up" : "down", last_ping,
+ pingsent,
+ cfs_duration_sec(cfs_time_sub(deadline, now)),
+ down_ni, libcfs_nid2str(nid));
+ LASSERT (tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else {
+ off += 1;
+ *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off);
+ }
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+int LL_PROC_PROTO(proc_lnet_peers)
+{
+ const int tmpsiz = 256;
+ struct lnet_peer_table *ptable;
+ char *tmpstr;
+ char *s;
+ int cpt = LNET_PROC_CPT_GET(*ppos);
+ int ver = LNET_PROC_VER_GET(*ppos);
+ int hash = LNET_PROC_HASH_GET(*ppos);
+ int hoff = LNET_PROC_HOFF_GET(*ppos);
+ int rc = 0;
+ int len;
+
+ CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS);
+ LASSERT(!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ if (cpt >= LNET_CPT_NUMBER) {
+ *lenp = 0;
+ return 0;
+ }
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n",
+ "nid", "refs", "state", "last", "max",
+ "rtr", "min", "tx", "min", "queue");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ hoff++;
+ } else {
+ struct lnet_peer *peer;
+ struct list_head *p;
+ int skip;
+ again:
+ p = NULL;
+ peer = NULL;
+ skip = hoff - 1;
+
+ lnet_net_lock(cpt);
+ ptable = the_lnet.ln_peer_tables[cpt];
+ if (hoff == 1)
+ ver = LNET_PROC_VERSION(ptable->pt_version);
+
+ if (ver != LNET_PROC_VERSION(ptable->pt_version)) {
+ lnet_net_unlock(cpt);
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return -ESTALE;
+ }
+
+ while (hash < LNET_PEER_HASH_SIZE) {
+ if (p == NULL)
+ p = ptable->pt_hash[hash].next;
+
+ while (p != &ptable->pt_hash[hash]) {
+ lnet_peer_t *lp = list_entry(p, lnet_peer_t,
+ lp_hashlist);
+ if (skip == 0) {
+ peer = lp;
+
+ /* minor optimization: start from idx+1
+ * on next iteration if we've just
+ * drained lp_hashlist */
+ if (lp->lp_hashlist.next ==
+ &ptable->pt_hash[hash]) {
+ hoff = 1;
+ hash++;
+ } else {
+ hoff++;
+ }
+
+ break;
+ }
+
+ skip--;
+ p = lp->lp_hashlist.next;
+ }
+
+ if (peer != NULL)
+ break;
+
+ p = NULL;
+ hoff = 1;
+ hash++;
+ }
+
+ if (peer != NULL) {
+ lnet_nid_t nid = peer->lp_nid;
+ int nrefs = peer->lp_refcount;
+ int lastalive = -1;
+ char *aliveness = "NA";
+ int maxcr = peer->lp_ni->ni_peertxcredits;
+ int txcr = peer->lp_txcredits;
+ int mintxcr = peer->lp_mintxcredits;
+ int rtrcr = peer->lp_rtrcredits;
+ int minrtrcr = peer->lp_minrtrcredits;
+ int txqnob = peer->lp_txqnob;
+
+ if (lnet_isrouter(peer) ||
+ lnet_peer_aliveness_enabled(peer))
+ aliveness = peer->lp_alive ? "up" : "down";
+
+ if (lnet_peer_aliveness_enabled(peer)) {
+ cfs_time_t now = cfs_time_current();
+ cfs_duration_t delta;
+
+ delta = cfs_time_sub(now, peer->lp_last_alive);
+ lastalive = cfs_duration_sec(delta);
+
+ /* No need to mess up peers contents with
+ * arbitrarily long integers - it suffices to
+ * know that lastalive is more than 10000s old
+ */
+ if (lastalive >= 10000)
+ lastalive = 9999;
+ }
+
+ lnet_net_unlock(cpt);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n",
+ libcfs_nid2str(nid), nrefs, aliveness,
+ lastalive, maxcr, rtrcr, minrtrcr, txcr,
+ mintxcr, txqnob);
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ } else { /* peer is NULL */
+ lnet_net_unlock(cpt);
+ }
+
+ if (hash == LNET_PEER_HASH_SIZE) {
+ cpt++;
+ hash = 0;
+ hoff = 1;
+ if (peer == NULL && cpt < LNET_CPT_NUMBER)
+ goto again;
+ }
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff);
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+static int __proc_lnet_buffers(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ char *s;
+ char *tmpstr;
+ int tmpsiz;
+ int idx;
+ int len;
+ int rc;
+ int i;
+
+ LASSERT(!write);
+
+ /* (4 %d) * 4 * LNET_CPT_NUMBER */
+ tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER;
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5s %5s %7s %7s\n",
+ "pages", "count", "credits", "min");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+
+ if (the_lnet.ln_rtrpools == NULL)
+ goto out; /* I'm not a router */
+
+ for (idx = 0; idx < LNET_NRBPOOLS; idx++) {
+ lnet_rtrbufpool_t *rbp;
+
+ lnet_net_lock(LNET_LOCK_EX);
+ cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%5d %5d %7d %7d\n",
+ rbp[idx].rbp_npages,
+ rbp[idx].rbp_nbuffers,
+ rbp[idx].rbp_credits,
+ rbp[idx].rbp_mincredits);
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+ lnet_net_unlock(LNET_LOCK_EX);
+ }
+
+ out:
+ len = s - tmpstr;
+
+ if (pos >= min_t(int, len, strlen(tmpstr)))
+ rc = 0;
+ else
+ rc = cfs_trace_copyout_string(buffer, nob,
+ tmpstr + pos, NULL);
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+ return rc;
+}
+
+DECLARE_PROC_HANDLER(proc_lnet_buffers);
+
+int LL_PROC_PROTO(proc_lnet_nis)
+{
+ int tmpsiz = 128 * LNET_CPT_NUMBER;
+ int rc = 0;
+ char *tmpstr;
+ char *s;
+ int len;
+
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ LASSERT (!write);
+
+ if (*lenp == 0)
+ return 0;
+
+ LIBCFS_ALLOC(tmpstr, tmpsiz);
+ if (tmpstr == NULL)
+ return -ENOMEM;
+
+ s = tmpstr; /* points to current position in tmpstr[] */
+
+ if (*ppos == 0) {
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n",
+ "nid", "status", "alive", "refs", "peer",
+ "rtr", "max", "tx", "min");
+ LASSERT (tmpstr + tmpsiz - s > 0);
+ } else {
+ struct list_head *n;
+ lnet_ni_t *ni = NULL;
+ int skip = *ppos - 1;
+
+ lnet_net_lock(0);
+
+ n = the_lnet.ln_nis.next;
+
+ while (n != &the_lnet.ln_nis) {
+ lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list);
+
+ if (skip == 0) {
+ ni = a_ni;
+ break;
+ }
+
+ skip--;
+ n = n->next;
+ }
+
+ if (ni != NULL) {
+ struct lnet_tx_queue *tq;
+ char *stat;
+ long now = cfs_time_current_sec();
+ int last_alive = -1;
+ int i;
+ int j;
+
+ if (the_lnet.ln_routing)
+ last_alive = now - ni->ni_last_alive;
+
+ /* @lo forever alive */
+ if (ni->ni_lnd->lnd_type == LOLND)
+ last_alive = 0;
+
+ lnet_ni_lock(ni);
+ LASSERT(ni->ni_status != NULL);
+ stat = (ni->ni_status->ns_status ==
+ LNET_NI_STATUS_UP) ? "up" : "down";
+ lnet_ni_unlock(ni);
+
+ /* we actually output credits information for
+ * TX queue of each partition */
+ cfs_percpt_for_each(tq, i, ni->ni_tx_queues) {
+ for (j = 0; ni->ni_cpts != NULL &&
+ j < ni->ni_ncpts; j++) {
+ if (i == ni->ni_cpts[j])
+ break;
+ }
+
+ if (j == ni->ni_ncpts)
+ continue;
+
+ if (i != 0)
+ lnet_net_lock(i);
+
+ s += snprintf(s, tmpstr + tmpsiz - s,
+ "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n",
+ libcfs_nid2str(ni->ni_nid), stat,
+ last_alive, *ni->ni_refs[i],
+ ni->ni_peertxcredits,
+ ni->ni_peerrtrcredits,
+ tq->tq_credits_max,
+ tq->tq_credits, tq->tq_credits_min);
+ if (i != 0)
+ lnet_net_unlock(i);
+ }
+ LASSERT(tmpstr + tmpsiz - s > 0);
+ }
+
+ lnet_net_unlock(0);
+ }
+
+ len = s - tmpstr; /* how many bytes was written */
+
+ if (len > *lenp) { /* linux-supplied buffer is too small */
+ rc = -EINVAL;
+ } else if (len > 0) { /* wrote something */
+ if (copy_to_user(buffer, tmpstr, len))
+ rc = -EFAULT;
+ else
+ *ppos += 1;
+ }
+
+ LIBCFS_FREE(tmpstr, tmpsiz);
+
+ if (rc == 0)
+ *lenp = len;
+
+ return rc;
+}
+
+struct lnet_portal_rotors {
+ int pr_value;
+ const char *pr_name;
+ const char *pr_desc;
+};
+
+static struct lnet_portal_rotors portal_rotors[] = {
+ {
+ .pr_value = LNET_PTL_ROTOR_OFF,
+ .pr_name = "OFF",
+ .pr_desc = "Turn off message rotor for wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_ON,
+ .pr_name = "ON",
+ .pr_desc = "round-robin dispatch all PUT messages for "
+ "wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_RR_RT,
+ .pr_name = "RR_RT",
+ .pr_desc = "round-robin dispatch routed PUT message for "
+ "wildcard portals"
+ },
+ {
+ .pr_value = LNET_PTL_ROTOR_HASH_RT,
+ .pr_name = "HASH_RT",
+ .pr_desc = "dispatch routed PUT message by hashing source "
+ "NID for wildcard portals"
+ },
+ {
+ .pr_value = -1,
+ .pr_name = NULL,
+ .pr_desc = NULL
+ },
+};
+
+extern int portal_rotor;
+
+static int __proc_lnet_portal_rotor(void *data, int write,
+ loff_t pos, void *buffer, int nob)
+{
+ const int buf_len = 128;
+ char *buf;
+ char *tmp;
+ int rc;
+ int i;
+
+ LIBCFS_ALLOC(buf, buf_len);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ if (!write) {
+ lnet_res_lock(0);
+
+ for (i = 0; portal_rotors[i].pr_value >= 0; i++) {
+ if (portal_rotors[i].pr_value == portal_rotor)
+ break;
+ }
+
+ LASSERT(portal_rotors[i].pr_value == portal_rotor);
+ lnet_res_unlock(0);
+
+ rc = snprintf(buf, buf_len,
+ "{\n\tportals: all\n"
+ "\trotor: %s\n\tdescription: %s\n}",
+ portal_rotors[i].pr_name,
+ portal_rotors[i].pr_desc);
+
+ if (pos >= min_t(int, rc, buf_len)) {
+ rc = 0;
+ } else {
+ rc = cfs_trace_copyout_string(buffer, nob,
+ buf + pos, "\n");
+ }
+ goto out;
+ }
+
+ rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob);
+ if (rc < 0)
+ goto out;
+
+ tmp = cfs_trimwhite(buf);
+
+ rc = -EINVAL;
+ lnet_res_lock(0);
+ for (i = 0; portal_rotors[i].pr_name != NULL; i++) {
+ if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp,
+ strlen(portal_rotors[i].pr_name)) == 0) {
+ portal_rotor = portal_rotors[i].pr_value;
+ rc = 0;
+ break;
+ }
+ }
+ lnet_res_unlock(0);
+out:
+ LIBCFS_FREE(buf, buf_len);
+ return rc;
+}
+DECLARE_PROC_HANDLER(proc_lnet_portal_rotor);
+
+static ctl_table_t lnet_table[] = {
+ /*
+ * NB No .strategy entries have been provided since sysctl(8) prefers
+ * to go via /proc for portability.
+ */
+ {
+ INIT_CTL_NAME(PSDEV_LNET_STATS)
+ .procname = "stats",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_stats,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_ROUTES)
+ .procname = "routes",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routes,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_ROUTERS)
+ .procname = "routers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_routers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PEERS)
+ .procname = "peers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_peers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PEERS)
+ .procname = "buffers",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_buffers,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_NIS)
+ .procname = "nis",
+ .mode = 0444,
+ .proc_handler = &proc_lnet_nis,
+ },
+ {
+ INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR)
+ .procname = "portal_rotor",
+ .mode = 0644,
+ .proc_handler = &proc_lnet_portal_rotor,
+ },
+ {
+ INIT_CTL_NAME(0)
+ }
+};
+
+static ctl_table_t top_table[] = {
+ {
+ INIT_CTL_NAME(CTL_LNET)
+ .procname = "lnet",
+ .mode = 0555,
+ .data = NULL,
+ .maxlen = 0,
+ .child = lnet_table,
+ },
+ {
+ INIT_CTL_NAME(0)
+ }
+};
+
+void
+lnet_proc_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (lnet_table_header == NULL)
+ lnet_table_header = cfs_register_sysctl_table(top_table, 0);
+#endif
+}
+
+void
+lnet_proc_fini(void)
+{
+#ifdef CONFIG_SYSCTL
+ if (lnet_table_header != NULL)
+ unregister_sysctl_table(lnet_table_header);
+
+ lnet_table_header = NULL;
+#endif
+}
+
+#else
+
+void
+lnet_proc_init(void)
+{
+}
+
+void
+lnet_proc_fini(void)
+{
+}
+
+#endif