diff options
Diffstat (limited to 'drivers/staging/lustre/lnet/lnet')
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/Makefile | 8 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/acceptor.c | 527 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/api-errno.c | 39 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/api-ni.c | 1941 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/config.c | 1264 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-eq.c | 447 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-md.c | 451 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-me.c | 297 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-move.c | 2441 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-msg.c | 650 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lib-ptl.c | 938 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/lo.c | 120 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/module.c | 154 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/peer.c | 337 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/router.c | 1694 | ||||
-rw-r--r-- | drivers/staging/lustre/lnet/lnet/router_proc.c | 950 |
16 files changed, 12258 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile new file mode 100644 index 000000000000..1bd9ef774208 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LNET) += lnet.o + +lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \ + lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o \ + router_proc.o acceptor.o peer.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c new file mode 100644 index 000000000000..81ef28bbcba0 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/acceptor.c @@ -0,0 +1,527 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +struct { + int pta_shutdown; + socket_t *pta_sock; + struct completion pta_signal; +} lnet_acceptor_state; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + + +EXPORT_SYMBOL(lnet_acceptor_port); + +static char *accept = "secure"; + +CFS_MODULE_PARM(accept, "s", charp, 0444, + "Accept connections (secure|all|none)"); +CFS_MODULE_PARM(accept_port, "i", int, 0444, + "Acceptor's port (same on all nodes)"); +CFS_MODULE_PARM(accept_backlog, "i", int, 0444, + "Acceptor's listen backlog"); +CFS_MODULE_PARM(accept_timeout, "i", int, 0644, + "Acceptor's timeout (seconds)"); + +static char *accept_type = NULL; + +int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. */ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error (int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was " + "refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %u.%u.%u.%u " + "was unreachable: the network or that node may " + "be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), HIPQUAD(peer_ip)); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %u.%u.%u.%u on " + "port %d took too long: that node may be hung " + "or experiencing high load.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u" + " on port %d was reset: " + "is it running a compatible version of " + "Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port, + libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at " + "host %u.%u.%u.%u on port %d: is it running " + "a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to " + "connect to %s at host %u.%u.%u.%u on port " + "%d\n", libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s" + " at host %u.%u.%u.%u on port %d\n", rc, + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(socket_t **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + lnet_acceptor_connreq_t cr; + socket_t *sock; + int rc; + int port; + int fatal; + + CLASSERT (sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. */ + + rc = libcfs_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + + failed_sock: + libcfs_sock_release(sock); + failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + + +/* Below is the code common for both kernel and MT user-space */ + +int +lnet_accept(socket_t *sock, __u32 magic) +{ + lnet_acceptor_connreq_t cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + lnet_ni_t *ni; + char *str; + + LASSERT (sizeof(cr) <= 16); /* not too big for the stack */ + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to LNET magic from %u.%u.%u.%u: %d\n", + HIPQUAD(peer_ip), rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) + str = "'old' ranal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u" + " magic %08x: %s acceptor protocol\n", + HIPQUAD(peer_ip), magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = libcfs_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". */ + int peer_version = cr.acr_version; + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to version %d from %u.%u.%u.%u: %d\n", + peer_version, HIPQUAD(peer_ip), rc); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(lnet_acceptor_connreq_t, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + + ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u" + " for %s: No matching NI\n", + HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + if (ni->ni_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u" + " for %s: NI doesn not accept IP connections\n", + HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n", + libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip)); + + rc = ni->ni_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +int +lnet_acceptor(void *arg) +{ + socket_t *newsock; + int rc; + __u32 magic; + __u32 peer_ip; + int peer_port; + int secure = (int)((long_ptr_t)arg); + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + + cfs_block_allsigs(); + + rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port" + " %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port " + "%d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + cfs_pause(cfs_time_seconds(1)); + } + continue; + } + + /* maybe we're waken up with libcfs_sock_abort_accept() */ + if (lnet_acceptor_state.pta_shutdown) { + libcfs_sock_release(newsock); + break; + } + + rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %u.%u.%u.%u: " + "insecure port %d\n", + HIPQUAD(peer_ip), peer_port); + goto failed; + } + + rc = libcfs_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + + failed: + libcfs_sock_release(newsock); + } + + libcfs_sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + int rc; + long rc2; + long secure; + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) { + fini_completion(&lnet_acceptor_state.pta_signal); + return rc; + } + + if (lnet_count_acceptor_nis() == 0) /* not required */ + return 0; + + rc2 = PTR_ERR(kthread_run(lnet_acceptor, + (void *)(ulong_ptr_t)secure, + "acceptor_%03ld", secure)); + if (IS_ERR_VALUE(rc2)) { + CERROR("Can't start acceptor thread: %ld\n", rc2); + fini_completion(&lnet_acceptor_state.pta_signal); + + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + fini_completion(&lnet_acceptor_state.pta_signal); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_sock == NULL) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + fini_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c new file mode 100644 index 000000000000..695b27265e23 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/api-errno.c @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/api-errno.c + * + * Instantiate the string table of errors + */ + +/* If you change these, you must update the number table in portals/errno.h */ diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c new file mode 100644 index 000000000000..e88bee362497 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -0,0 +1,1941 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> +#include <linux/log2.h> + +#define D_LNI D_CONSOLE + +lnet_t the_lnet; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + + +static char *ip2nets = ""; +CFS_MODULE_PARM(ip2nets, "s", charp, 0444, + "LNET network <- IP table"); + +static char *networks = ""; +CFS_MODULE_PARM(networks, "s", charp, 0444, + "local networks"); + +static char *routes = ""; +CFS_MODULE_PARM(routes, "s", charp, 0444, + "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444, + "size of remote network hash table"); + +char * +lnet_get_routes(void) +{ + return routes; +} + +char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); +} + +void +lnet_fini_locks(void) +{ +} + + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +void lnet_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... */ + CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT (LNET_MSG_ACK == 0); + CLASSERT (LNET_MSG_PUT == 1); + CLASSERT (LNET_MSG_GET == 2); + CLASSERT (LNET_MSG_REPLY == 3); + CLASSERT (LNET_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct lnet_magicversion_t */ + CLASSERT ((int)sizeof(lnet_magicversion_t) == 8); + CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct lnet_hdr_t */ + CLASSERT ((int)sizeof(lnet_hdr_t) == 72); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40); + + /* Ack */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4); +} + +lnd_t * +lnet_find_lnd_by_type (int type) +{ + lnd_t *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each (tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, lnd_t, lnd_list); + + if ((int)lnd->lnd_type == type) + return lnd; + } + + return NULL; +} + +void +lnet_register_lnd (lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd (lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT (lnd->lnd_refcount == 0); + + list_del (&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get(lnet_counters_t *counters) +{ + lnet_counters_t *ctr; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_length += ctr->drop_length; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + lnet_counters_t *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(lnet_counters_t)); + + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_reset); + +#ifdef LNET_USE_LIB_FREELIST + +int +lnet_freelist_init (lnet_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lnet_freeobj_t, fo_contents); + + LIBCFS_ALLOC(space, n * size); + if (space == NULL) + return (-ENOMEM); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (0); +} + +void +lnet_freelist_fini (lnet_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); + + LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset (fl, 0, sizeof (*fl)); +} + +#endif /* LNET_USE_LIB_FREELIST */ + +__u64 +lnet_create_interface_cookie (void) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. Initialisation time, + * even if it's only implemented to millisecond resolution is probably + * easily good enough. */ + struct timeval tv; + __u64 cookie; + do_gettimeofday(&tv); + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return cookie; +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, lnet_eq_t, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, lnet_libmd_t, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&rec->rec_freelist); +#endif + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +int +lnet_res_container_setup(struct lnet_res_container *rec, + int cpt, int type, int objnum, int objsz) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + +#ifdef LNET_USE_LIB_FREELIST + memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist)); + rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz); + if (rc != 0) + goto out; +#endif + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type, int objnum, int objsz) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type, objnum, objsz); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +lnet_libhandle_t * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + lnet_libhandle_t *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + + +int lnet_unprepare(void); + +int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + LASSERT (the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_nis); + INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); + INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); + INIT_LIST_HEAD(&the_lnet.ln_routers); + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(lnet_counters_t)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS, + sizeof(lnet_eq_t)); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES, + sizeof(lnet_me_t)); + if (recs == NULL) + goto failed; + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS, + sizeof(lnet_libmd_t)); + if (recs == NULL) + goto failed; + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +int +lnet_unprepare (void) +{ + /* NB no LNET_LOCK since this is the last reference. All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nis)); + LASSERT(list_empty(&the_lnet.ln_nis_cpt)); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_tables_destroy(); + lnet_rtrpools_free(); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + + return 0; +} + +lnet_ni_t * +lnet_net2ni_locked(__u32 net, int cpt) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +lnet_ni_t * +lnet_net2ni(__u32 net) +{ + lnet_ni_t *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni); + +static unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = cfs_hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* take lnet_net_lock(any) would be OK */ + if (!list_empty(&the_lnet.ln_nis_cpt)) { + list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) { + if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) + continue; + + LASSERT(ni->ni_cpts != NULL); + return ni->ni_cpts[lnet_nid_cpt_hash + (nid, ni->ni_ncpts)]; + } + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + if (list_empty(&the_lnet.ln_nis_cpt)) + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + cpt = lnet_net_lock_current(); + cpt2 = lnet_cpt_of_nid_locked(nid); + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + + ni = lnet_net2ni_locked(net, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + + lnet_net_unlock(cpt); + + return ni != NULL; +} + +lnet_ni_t * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_ni *ni; + struct list_head *tmp; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_nid == nid) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +int +lnet_islocalnid(lnet_nid_t nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nis (void) +{ + /* Return the # of NIs that need the acceptor. */ + int count = 0; + struct list_head *tmp; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +static int +lnet_ni_tq_credits(lnet_ni_t *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_maxtxcredits; + + credits = ni->ni_maxtxcredits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_peertxcredits); + credits = min(credits, ni->ni_maxtxcredits); + + return credits; +} + +void +lnet_shutdown_lndnis (void) +{ + int i; + int islo; + lnet_ni_t *ni; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_shutdown = 1; /* flag shutdown */ + + /* Unlink NIs from the global table */ + while (!list_empty(&the_lnet.ln_nis)) { + ni = list_entry(the_lnet.ln_nis.next, + lnet_ni_t, ni_list); + /* move it to zombie list and nobody can find it anymore */ + list_move(&ni->ni_list, &the_lnet.ln_nis_zombie); + lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */ + + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + } + + /* Drop the cached eqwait NI. */ + if (the_lnet.ln_eq_waitni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0); + the_lnet.ln_eq_waitni = NULL; + } + + /* Drop the cached loopback NI. */ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Clear lazy portals and drop delayed messages which hold refs + * on their lnet_msg_t::msg_rxpeer */ + for (i = 0; i < the_lnet.ln_nportals; i++) + LNetClearLazyPortal(i); + + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + lnet_peer_tables_cleanup(); + + lnet_net_lock(LNET_LOCK_EX); + /* Now wait for the NI's I just nuked to show up on ln_zombie_nis + * and shut them down in guaranteed thread context */ + i = 2; + while (!list_empty(&the_lnet.ln_nis_zombie)) { + int *ref; + int j; + + ni = list_entry(the_lnet.ln_nis_zombie.next, + lnet_ni_t, ni_list); + list_del_init(&ni->ni_list); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_list, &the_lnet.ln_nis_zombie); + break; + } + + while (!list_empty(&ni->ni_list)) { + lnet_net_unlock(LNET_LOCK_EX); + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, + "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + cfs_pause(cfs_time_seconds(1)); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + ni->ni_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_lnd->lnd_type == LOLND; + + LASSERT (!in_interrupt ()); + (ni->ni_lnd->lnd_shutdown)(ni); + + /* can't deref lnd anymore now; it might have unregistered + * itself... */ + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + lnet_net_lock(LNET_LOCK_EX); + } + + the_lnet.ln_shutdown = 0; + lnet_net_unlock(LNET_LOCK_EX); + + if (the_lnet.ln_network_tokens != NULL) { + LIBCFS_FREE(the_lnet.ln_network_tokens, + the_lnet.ln_network_tokens_nob); + the_lnet.ln_network_tokens = NULL; + } +} + +int +lnet_startup_lndnis (void) +{ + lnd_t *lnd; + struct lnet_ni *ni; + struct lnet_tx_queue *tq; + struct list_head nilist; + int i; + int rc = 0; + int lnd_type; + int nicount = 0; + char *nets = lnet_get_networks(); + + INIT_LIST_HEAD(&nilist); + + if (nets == NULL) + goto failed; + + rc = lnet_parse_networks(&nilist, nets); + if (rc != 0) + goto failed; + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + LASSERT (libcfs_isknown_lnd(lnd_type)); + + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); + + ni->ni_lnd = lnd; + + rc = (lnd->lnd_startup)(ni); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" + "\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } + + LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + + list_del(&ni->ni_list); + + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ + lnet_ni_addref_locked(ni, 0); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); + lnet_ni_addref_locked(ni, 0); + } + + lnet_net_unlock(LNET_LOCK_EX); + + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT (the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; + } + + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? + "" : "per-peer "); + goto failed; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); + + nicount++; + } + + if (the_lnet.ln_eq_waitni != NULL && nicount > 1) { + lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type; + LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network" + "\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + return 0; + + failed: + lnet_shutdown_lndnis(); + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + return -ENETDOWN; +} + +/** + * Initialize LNet library. + * + * Only userspace program needs to call this function - it's automatically + * called in the kernel at module loading time. Caller has to call LNetFini() + * after a call to LNetInit(), if and only if the latter returned 0. It must + * be called exactly once. + * + * \return 0 on success, and -ve on failures. + */ +int +LNetInit(void) +{ + int rc; + + lnet_assert_wire_constants(); + LASSERT(!the_lnet.ln_init); + + memset(&the_lnet, 0, sizeof(the_lnet)); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -1; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return -1; + } + + the_lnet.ln_refcount = 0; + the_lnet.ln_init = 1; + LNetInvalidateHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} +EXPORT_SYMBOL(LNetInit); + +/** + * Finalize LNet library. + * + * Only userspace program needs to call this function. It can be called + * at most once. + * + * \pre LNetInit() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void +LNetFini(void) +{ + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + lnd_t, lnd_list)); + lnet_destroy_locks(); + + the_lnet.ln_init = 0; +} +EXPORT_SYMBOL(LNetFini); + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Userspace program should call this after a successful call to LNetInit(). + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + goto out; + } + + lnet_get_tunables(); + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + rc = -ENETDOWN; + goto failed0; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) + goto failed0; + + rc = lnet_startup_lndnis(); + if (rc != 0) + goto failed1; + + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_check_routes(); + if (rc != 0) + goto failed2; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_acceptor_start(); + if (rc != 0) + goto failed2; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + /* NB router checker needs the_lnet.ln_ping_info in + * lnet_router_checker -> lnet_update_ni_status_locked */ + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed3; + + rc = lnet_router_checker_start(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); + goto out; + + failed4: + lnet_ping_target_fini(); + failed3: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); + failed2: + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + failed1: + lnet_unprepare(); + failed0: + LASSERT (rc < 0); + out: + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini() +{ + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT (!the_lnet.ln_niinit_self); + + lnet_proc_fini(); + lnet_router_checker_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + lnet_unprepare(); + } + + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and + * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet + * internal ioctl handler. + * + * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it. + * + * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer. + * The data will be printed to system console. Don't use it excessively. + * \param arg A pointer to lnet_process_id_t, process ID of the peer. + * + * \return Always return 0 when called by users directly (i.e., not via ioctl). + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + lnet_process_id_t id = {0}; + lnet_ni_t *ni; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + rc = lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid); + return (rc != 0) ? rc : lnet_check_routes(); + + case IOC_LIBCFS_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + + case IOC_LIBCFS_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags); + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + cfs_time_current() - + cfs_time_seconds(cfs_time_current_sec() - + (time_t)data->ioc_u64[0])); + + case IOC_LIBCFS_PORTALS_COMPATIBILITY: + /* This can be removed once lustre stops calling it */ + return 0; + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_PING: + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + rc = lnet_ping(id, data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + + case IOC_LIBCFS_DEBUG_PEER: { + /* CAVEAT EMPTOR: this one designed for calling directly; not + * via an ioctl */ + id = *((lnet_process_id_t *) arg); + + lnet_debug_peer(id.nid); + + ni = lnet_net2ni(LNET_NIDNET(id.nid)); + if (ni == NULL) { + CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id)); + } else { + if (ni->ni_lnd->lnd_ctl == NULL) { + CDEBUG(D_WARNING, "No ctl for %s\n", + libcfs_id2str(id)); + } else { + (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg); + } + + lnet_ni_decref(ni); + } + return 0; + } + + default: + ni = lnet_net2ni(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +/** + * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that + * all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * lnet_process_id_t ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, lnet_process_id_t *id) +{ + struct lnet_ni *ni; + struct list_head *tmp; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each(tmp, &the_lnet.ln_nis) { + if (index-- != 0) + continue; + + ni = list_entry(tmp, lnet_ni_t, ni_list); + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +/** + * Print a string representation of handle \a h into buffer \a str of + * \a len bytes. + */ +void +LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) +{ + snprintf(str, len, LPX64, h.cookie); +} +EXPORT_SYMBOL(LNetSnprintHandle); + +static int +lnet_create_ping_info(void) +{ + int i; + int n; + int rc; + unsigned int infosz; + lnet_ni_t *ni; + lnet_process_id_t id; + lnet_ping_info_t *pinfo; + + for (n = 0; ; n++) { + rc = LNetGetId(n, &id); + if (rc == -ENOENT) + break; + + LASSERT (rc == 0); + } + + infosz = offsetof(lnet_ping_info_t, pi_ni[n]); + LIBCFS_ALLOC(pinfo, infosz); + if (pinfo == NULL) { + CERROR("Can't allocate ping info[%d]\n", n); + return -ENOMEM; + } + + pinfo->pi_nnis = n; + pinfo->pi_pid = the_lnet.ln_pid; + pinfo->pi_magic = LNET_PROTO_PING_MAGIC; + pinfo->pi_features = LNET_PING_FEAT_NI_STATUS; + + for (i = 0; i < n; i++) { + lnet_ni_status_t *ns = &pinfo->pi_ni[i]; + + rc = LNetGetId(i, &id); + LASSERT (rc == 0); + + ns->ns_nid = id.nid; + ns->ns_status = LNET_NI_STATUS_UP; + + lnet_net_lock(0); + + ni = lnet_nid2ni_locked(id.nid, 0); + LASSERT(ni != NULL); + + lnet_ni_lock(ni); + LASSERT(ni->ni_status == NULL); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + } + + the_lnet.ln_ping_info = pinfo; + return 0; +} + +static void +lnet_destroy_ping_info(void) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + + lnet_net_unlock(0); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis])); + the_lnet.ln_ping_info = NULL; + return; +} + +int +lnet_ping_target_init(void) +{ + lnet_md_t md = {0}; + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int infosz; + + rc = lnet_create_ping_info(); + if (rc != 0) + return rc; + + /* We can have a tiny EQ since we only need to see the unlink event on + * teardown, which by definition is the last one! */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + goto failed_0; + } + + memset(&id, 0, sizeof(lnet_process_id_t)); + id.nid = LNET_NID_ANY; + id.pid = LNET_PID_ANY; + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + /* initialize md content */ + infosz = offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis]); + md.start = the_lnet.ln_ping_info; + md.length = infosz; + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; + md.eq_handle = the_lnet.ln_ping_target_eq; + + rc = LNetMDAttach(meh, md, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + + failed_2: + rc2 = LNetMEUnlink(meh); + LASSERT (rc2 == 0); + failed_1: + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc2 == 0); + failed_0: + lnet_destroy_ping_info(); + return rc; +} + +void +lnet_ping_target_fini(void) +{ + lnet_event_t event; + int rc; + int which; + int timeout_ms = 1000; + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(the_lnet.ln_ping_target_md); + /* NB md could be busy; this just starts the unlink */ + + for (;;) { + rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1, + timeout_ms, &event, &which); + + /* I expect overflow... */ + LASSERT (rc >= 0 || rc == -EOVERFLOW); + + if (rc == 0) { + /* timed out: provide a diagnostic */ + CWARN("Still waiting for ping MD to unlink\n"); + timeout_ms *= 2; + continue; + } + + /* Got a valid event */ + if (event.unlinked) + break; + } + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc == 0); + lnet_destroy_ping_info(); + cfs_restore_sigs(blocked); +} + +int +lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + lnet_md_t md = {0}; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]); + lnet_ping_info_t *info; + lnet_process_id_t tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + /* initialize md content */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT (rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; + } + + nob = rc; + LASSERT (nob >= 0 && nob <= infosz); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), info->pi_features); + goto out_1; + } + + if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[0])); + goto out_1; + } + + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; + + if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... */ + + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; + } + rc = info->pi_nnis; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT (rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; +} diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c new file mode 100644 index 000000000000..28711e6e8b03 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -0,0 +1,1264 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + +typedef struct { /* tmp struct for parsing routes */ + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +} lnet_text_buf_t; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +void +lnet_syntax(char *name, char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +int +lnet_net_unique(__u32 net, struct list_head *nilist) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each (tmp, nilist) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) + return 0; + } + + return 1; +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +lnet_ni_t * +lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int rc; + int i; + + if (!lnet_net_unique(net, nilist)) { + LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n", + libcfs_net2str(net)); + return NULL; + } + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net)); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + if (el == NULL) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s: %d\n", + libcfs_net2str(net), rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net, 0); + ni->ni_last_alive = cfs_time_current_sec(); + list_add_tail(&ni->ni_list, nilist); + return ni; + failed: + lnet_ni_free(ni); + return NULL; +} + +int +lnet_parse_networks(struct list_head *nilist, char *networks) +{ + struct cfs_expr_list *el = NULL; + int tokensize = strlen(networks) + 1; + char *tokens; + char *str; + char *tmp; + struct lnet_ni *ni; + __u32 net; + int nnets = 0; + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too " + "long\n"); + return -EINVAL; + } + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + the_lnet.ln_network_tokens = tokens; + the_lnet.ln_network_tokens_nob = tokensize; + memcpy (tokens, networks, tokensize); + str = tmp = tokens; + + /* Add in the loopback network */ + ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist); + if (ni == NULL) + goto failed; + + while (str != NULL && *str != 0) { + char *comma = strchr(str, ','); + char *bracket = strchr(str, '('); + char *square = strchr(str, '['); + char *iface; + int niface; + int rc; + + /* NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) */ + + if (square != NULL && (comma == NULL || square < comma)) { + /* i.e: o2ib0(ib0)[1,2], number between square + * brackets are CPTs this NI needs to be bond */ + if (bracket != NULL && bracket > square) { + tmp = square; + goto failed_syntax; + } + + tmp = strchr(square, ']'); + if (tmp == NULL) { + tmp = square; + goto failed_syntax; + } + + rc = cfs_expr_list_parse(square, tmp - square + 1, + 0, LNET_CPT_NUMBER - 1, &el); + if (rc != 0) { + tmp = square; + goto failed_syntax; + } + + while (square <= tmp) + *square++ = ' '; + } + + if (bracket == NULL || + (comma != NULL && comma < bracket)) { + + /* no interface list specified */ + + if (comma != NULL) + *comma++ = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + + if (net == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, "Unrecognised network" + " type\n"); + tmp = str; + goto failed_syntax; + } + + if (LNET_NETTYP(net) != LOLND && /* LO is implicit */ + lnet_ni_alloc(net, el, nilist) == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + str = comma; + continue; + } + + *bracket = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + tmp = str; + goto failed_syntax; + } + + nnets++; + ni = lnet_ni_alloc(net, el, nilist); + if (ni == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + niface = 0; + iface = bracket + 1; + + bracket = strchr(iface, ')'); + if (bracket == NULL) { + tmp = iface; + goto failed_syntax; + } + + *bracket = 0; + do { + comma = strchr(iface, ','); + if (comma != NULL) + *comma++ = 0; + + iface = cfs_trimwhite(iface); + if (*iface == 0) { + tmp = iface; + goto failed_syntax; + } + + if (niface == LNET_MAX_INTERFACES) { + LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " + "for net %s\n", + libcfs_net2str(net)); + goto failed; + } + + ni->ni_interfaces[niface++] = iface; + iface = comma; + } while (iface != NULL); + + str = bracket + 1; + comma = strchr(bracket + 1, ','); + if (comma != NULL) { + *comma = 0; + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + str = comma + 1; + continue; + } + + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + } + + LASSERT(!list_empty(nilist)); + return 0; + + failed_syntax: + lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); + failed: + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + if (el != NULL) + cfs_expr_list_free(el); + + LIBCFS_FREE(tokens, tokensize); + the_lnet.ln_network_tokens = NULL; + + return -EINVAL; +} + +lnet_text_buf_t * +lnet_new_text_buf (int str_len) +{ + lnet_text_buf_t *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." */ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +void +lnet_free_text_buf (lnet_text_buf_t *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +void +lnet_free_text_bufs(struct list_head *tbs) +{ + lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +void +lnet_print_text_bufs(struct list_head *tbs) +{ + struct list_head *tmp; + lnet_text_buf_t *ltb; + + list_for_each (tmp, tbs) { + ltb = list_entry(tmp, lnet_text_buf_t, ltb_list); + + CDEBUG(D_WARNING, "%s\n", ltb->ltb_text); + } + + CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob); +} + +int +lnet_str2tbs_sep (struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + lnet_text_buf_t *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (cfs_iswhite(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -1; + } + + for (i = 0; i < nob; i++) + if (cfs_iswhite(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +int +lnet_expand1tb (struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + lnet_text_buf_t *ltb; + + LASSERT (*sep1 == '['); + LASSERT (*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +int +lnet_str2tbs_expand (struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb(&pending, str, sep, sep2, + parsed, (int)(enditem - parsed)) != 0) + goto failed; + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if (enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -1; +} + +int +lnet_parse_hops (char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + + +int +lnet_parse_route (char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + lnet_text_buf_t *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + unsigned int hops; + int got_hops = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd) - 1); + cmd[sizeof(cmd) - 1] = 0; + + sep = str; + for (;;) { + /* scan for token start */ + while (cfs_iswhite(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !cfs_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! */ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + if (!got_hops) + hops = 1; + + LASSERT (!list_empty(&nets)); + LASSERT (!list_empty(&gateways)); + + list_for_each (tmp1, &nets) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT (net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each (tmp2, &gateways) { + ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT (nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route (net, hops, nid); + if (rc != 0) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + + token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); + out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes (char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + LIST_HEAD (list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_ip_addr_free(&list); + + return rc; +} + +int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT (strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (cfs_iswhite(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !cfs_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +__u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + lnet_text_buf_t *tb; + lnet_text_buf_t *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT (!list_empty(nets)); + LASSERT (nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, lnet_text_buf_t, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, lnet_text_buf_t, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + tb2 = lnet_new_text_buf(strlen(sep)); + if (tb2 == NULL) + return -ENOMEM; + + strcpy(tb2->ltb_text, sep); + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +int +lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + lnet_text_buf_t *tb; + lnet_text_buf_t *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT (lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, lnet_text_buf_t, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)-1); + source[sizeof(source)-1] = 0; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each (t, ¤t_nets) { + tb = list_entry(t, lnet_text_buf_t, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, lnet_text_buf_t, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, lnet_text_buf_t, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? "" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT (lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +int +lnet_ipaddr_enumerate (__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = libcfs_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + libcfs_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = libcfs_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + libcfs_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets (char **networksp, char *ip2nets) +{ + __u32 *ipaddrs; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP " + "interfaces for ip2nets to match\n", nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces " + "for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} + +int +lnet_set_ip_niaddr (lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + for (i = 0; i < n; i++) { + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return -ENOENT; +} +EXPORT_SYMBOL(lnet_set_ip_niaddr); diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c new file mode 100644 index 000000000000..78297a7d94e8 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -0,0 +1,447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. + * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. + */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + lnet_handle_eq_t *handle) +{ + lnet_eq_t *eq; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + count = cfs_power2_roundup(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) { + CWARN("EQ callback is guaranteed to get every event, " + "do you still want to set eqcount %d for polling " + "event which will have locking overhead? " + "Please contact with developer to confirm\n", count); + } + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. + */ +int +LNetEQFree(lnet_handle_eq_t eqh) +{ + struct lnet_eq *eq; + lnet_event_t *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free_locked(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(lnet_event_t)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +int +lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + lnet_event_t *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + RETURN(0); + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + + +static int +lnet_eq_wait_locked(int *timeout_ms) +{ + int tms = *timeout_ms; + int wait; + wait_queue_t wl; + cfs_time_t now; + + if (tms == 0) + return -1; /* don't want to wait and no new event */ + + init_waitqueue_entry_current(&wl); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + if (tms < 0) { + waitq_wait(&wl, TASK_INTERRUPTIBLE); + + } else { + struct timeval tv; + + now = cfs_time_current(); + waitq_timedwait(&wl, TASK_INTERRUPTIBLE, + cfs_time_seconds(tms) / 1000); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); + tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + if (tms < 0) /* no more wait but may have new event */ + tms = 0; + } + + wait = tms != 0; /* might need to call here again */ + *timeout_ms = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + + + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout_ms Time in milliseconds to wait for an event to occur on + * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. + */ +int +LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms, + lnet_event_t *event, int *which) +{ + int wait = 1; + int rc; + int i; + ENTRY; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (neq < 1) + RETURN(-ENOENT); + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + RETURN(-ENOENT); + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + RETURN(rc); + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout_ms); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + RETURN(0); +} diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c new file mode 100644 index 000000000000..ae643f26933b --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-md.c @@ -0,0 +1,451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(lnet_libmd_t *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... */ + lnet_me_t *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free_locked(md); +} + +static int +lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof (lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof (lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE ) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) // illegal max_size + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +int +lnet_md_validate(lnet_md_t *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. + */ +int +LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, + lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + LIST_HEAD (matches); + LIST_HEAD (drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + if (rc != 0) + goto failed; + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto failed; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + lnet_libmd_t *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + + cpt = lnet_res_lock_current(); + if (rc != 0) + goto failed; + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto failed; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink (lnet_handle_md_t mdh) +{ + lnet_event_t ev; + lnet_libmd_t *md; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the NAL is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + + if (md->md_eq != NULL && + md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c new file mode 100644 index 000000000000..0081075cabee --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-me.c @@ -0,0 +1,297 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the lnet_process_id_t + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. + */ +int +LNetMEInsert(lnet_handle_me_t current_meh, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free_locked(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free_locked(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. + */ +int +LNetMEUnlink(lnet_handle_me_t meh) +{ + lnet_me_t *me; + lnet_libmd_t *md; + lnet_event_t ev; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL && + md->md_eq != NULL && + md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(lnet_me_t *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + lnet_libmd_t *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free_locked(me); +} + +#if 0 +static void +lib_me_dump(lnet_me_t *me) +{ + CWARN("Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lnet_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lnet_me_t, me_list)); +} +#endif diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c new file mode 100644 index 000000000000..49b0f1287a69 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -0,0 +1,2441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +static int local_nid_dist_zero = 1; +CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444, + "Reserved"); + +int +lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + LASSERT (the_lnet.ln_init); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) /* matched this one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); + + list_del (&tp->tp_list); + LIBCFS_FREE(tp, sizeof (*tp)); + } + return 0; +} + +static int +fail_peer (lnet_nid_t nid, int outgoing) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); + list_del (&tp->tp_list); + + LIBCFS_FREE(tp, sizeof (*tp)); + } + + return (fail); +} + +unsigned int +lnet_iov_nob (unsigned int niov, struct iovec *iov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset, + unsigned int nsiov, struct iovec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT (ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT (nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); + } + + do { + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy ((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT ((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); + } + + LASSERT (nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); + } + + do { + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. + * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + + do { + LASSERT (niov > 0); + LASSERT (nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + do { + LASSERT (nkiov > 0); + LASSERT (niov > 0); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT ((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + return (niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + unsigned int niov = 0; + struct iovec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT (msg->msg_len > 0); + LASSERT (!msg->msg_routing); + LASSERT (md != NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +int +lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ +void +lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + cfs_time_t last_alive = 0; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_lnd->lnd_query != NULL); + + lnet_net_unlock(lp->lp_cpt); + (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); + lnet_net_lock(lp->lp_cpt); + + lp->lp_last_query = cfs_time_current(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lp_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) +{ + int alive; + cfs_time_t deadline; + + LASSERT (lnet_peer_aliveness_enabled(lp)); + + /* Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + if (!lp->lp_alive && lp->lp_alive_count > 0 && + cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) + return 0; + + deadline = cfs_time_add(lp->lp_last_alive, + cfs_time_seconds(lp->lp_ni->ni_peertimeout)); + alive = cfs_time_after(deadline, now); + + /* Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lp_last_alive at peer creation is assumed. + */ + if (alive && !lp->lp_alive && + !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +int +lnet_peer_alive_locked (lnet_peer_t *lp) +{ + cfs_time_t now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). */ + if (lp->lp_last_query != 0) { + static const int lnet_queryinterval = 1; + + cfs_time_t next_query = + cfs_time_add(lp->lp_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (cfs_time_before(now, next_query)) { + if (lp->lp_alive) + CWARN("Unexpected aliveness of peer %s: " + "%d < %d (%d/%d)\n", + libcfs_nid2str(lp->lp_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lp_ni->ni_peertimeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(lp->lp_ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); + return 0; +} + +int +lnet_post_send_locked(lnet_msg_t *msg, int do_send) +{ + /* lnet_send is going to lnet_net_unlock immediately after this, + * so it sets do_send FALSE and I don't do the unlock/send/lock bit. + * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer + * appears dead, and 0 if sent or OK to send */ + struct lnet_peer *lp = msg->msg_txpeer; + struct lnet_ni *ni = lp->lp_ni; + struct lnet_tx_queue *tq; + int cpt; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + cpt = msg->msg_tx_cpt; + tq = ni->ni_tx_queues[cpt]; + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_net_unlock(cpt); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return EHOSTUNREACH; + } + + if (!msg->msg_peertxcredit) { + LASSERT ((lp->lp_txcredits < 0) == + !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_txq); + return EAGAIN; + } + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return EAGAIN; + } + } + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return 0; +} + + +lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +int +lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. I + * return EAGAIN if msg blocked and 0 if received or OK to receive */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + LASSERT ((lp->lp_rtrcredits < 0) == + !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + LASSERT ((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT (!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return 0; +} + +void +lnet_return_tx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_msg_t *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = txpeer->lp_ni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == + !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT (txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } +} + +void +lnet_return_rx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT (msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT (rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == + !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +} + +static int +lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) +{ + lnet_peer_t *p1 = r1->lr_gateway; + lnet_peer_t *p2 = r2->lr_gateway; + + if (r1->lr_hops < r2->lr_hops) + return 1; + + if (r1->lr_hops > r2->lr_hops) + return -1; + + if (p1->lp_txqnob < p2->lp_txqnob) + return 1; + + if (p1->lp_txqnob > p2->lp_txqnob) + return -1; + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return -1; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static lnet_peer_t * +lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) +{ + lnet_remotenet_t *rnet; + lnet_route_t *rtr; + lnet_route_t *rtr_best; + lnet_route_t *rtr_last; + struct lnet_peer *lp_best; + struct lnet_peer *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_net_locked(LNET_NIDNET(target)); + if (rnet == NULL) + return NULL; + + lp_best = NULL; + rtr_best = rtr_last = NULL; + list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) { + lp = rtr->lr_gateway; + + if (!lp->lp_alive || /* gateway is down */ + ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 && + rtr->lr_downis != 0)) /* NI to target is down */ + continue; + + if (ni != NULL && lp->lp_ni != ni) + continue; + + if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lp_best == NULL) { + rtr_best = rtr_last = rtr; + lp_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (rtr_last->lr_seq - rtr->lr_seq < 0) + rtr_last = rtr; + + rc = lnet_compare_routes(rtr, rtr_best); + if (rc < 0) + continue; + + rtr_best = rtr; + lp_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (rtr_best != NULL) + rtr_best->lr_seq = rtr_last->lr_seq + 1; + return lp_best; +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + struct lnet_ni *src_ni; + struct lnet_ni *local_ni; + struct lnet_peer *lp; + int cpt; + int cpt2; + int rc; + + /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); + again: + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + if (src_nid == LNET_NID_ANY) { + src_ni = NULL; + } else { + src_ni = lnet_nid2ni_locked(src_nid, cpt); + if (src_ni == NULL) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + LASSERT (!msg->msg_routing); + } + + /* Is this for someone on a local network? */ + local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); + + if (local_ni != NULL) { + if (src_ni == NULL) { + src_ni = local_ni; + src_nid = src_ni->ni_nid; + } else if (src_ni == local_ni) { + lnet_ni_decref_locked(local_ni, cpt); + } else { + lnet_ni_decref_locked(local_ni, cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to %s via from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_net_unlock(cpt); + lnet_ni_send(src_ni, msg); + + lnet_net_lock(cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + return 0; + } + + rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); + /* lp has ref on src_ni; lose mine */ + lnet_ni_decref_locked(src_ni, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + LASSERT (lp->lp_ni == src_ni); + } else { + /* sending to a remote network */ + lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); + if (lp == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + LCONSOLE_WARN("No route to %s via %s " + "(all routers down)\n", + libcfs_id2str(msg->msg_target), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, + * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't + * pre-determined router, this can happen if router table + * was changed when we release the lock */ + if (rtr_nid != lp->lp_nid) { + cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); + if (cpt2 != cpt) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + rtr_nid = lp->lp_nid; + cpt = cpt2; + goto again; + } + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + if (src_ni == NULL) { + src_ni = lp->lp_ni; + src_nid = src_ni->ni_nid; + } else { + LASSERT (src_ni == lp->lp_ni); + lnet_ni_decref_locked(src_ni, cpt); + } + + lnet_peer_addref_locked(lp); + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) { + /* I'm the source and now I know which NI to send on */ + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_LNET_PID; + } + + /* 'lp' is our best choice of peer */ + + LASSERT (!msg->msg_peertxcredit); + LASSERT (!msg->msg_txcredit); + LASSERT (msg->msg_txpeer == NULL); + + msg->msg_txpeer = lp; /* msg takes my ref on lp */ + + rc = lnet_post_send_locked(msg, 0); + lnet_net_unlock(cpt); + + if (rc == EHOSTUNREACH) + return -EHOSTUNREACH; + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; +} + +static void +lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) +{ + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (msg->msg_rx_delayed) /* attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) + goto again; + /* fall through */ + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match "LPU64 + " offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return ENOENT; /* +ve: OK but no match */ + } +} + +static int +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) +{ + struct lnet_match_info info; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_handle_wire_t reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match "LPU64 + " offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(ni, msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int rlength; + int mlength; + int cpt; + + cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT (md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = MIN(rlength, (int)md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD "LPX64" would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve! */ + } + + CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +static int +lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc = 0; + + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return ("<UNKNOWN>"); + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +void +lnet_print_hdr(lnet_hdr_t * hdr) +{ + lnet_process_id_t src = {0}; + lnet_process_id_t dst = {0}; + char *type_str = lnet_msgtyp2str (hdr->type); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + dst.nid = hdr->dest_nid; + dst.pid = hdr->dest_pid; + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPU64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data "LPX64"\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPU64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; + + LASSERT (!in_interrupt ()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + cpt = lnet_cpt_of_nid(from_nid); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != cfs_time_current_sec()) { + lnet_ni_lock(ni); + + /* NB: so far here is the only place to set NI status to "up */ + ni->ni_last_alive = cfs_time_current_sec(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR ("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR ("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR ("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR ("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (src_nid, 0)) /* shall we now? */ + { + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s " + "(error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + goto drop; + } + + lnet_msg_commit(msg, cpt); + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + if (rc == 0) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + switch (type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, rdma_req); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: + LASSERT(0); + rc = -EPROTO; + goto free_drop; /* prevent an unused label if !kernel */ + } + + if (rc == 0) + return 0; + + LASSERT (rc == ENOENT); + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(ni, msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + lnet_process_id_t id = {0}; + lnet_msg_t *msg; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match "LPU64 + " offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxpeer->lp_ni, + msg->msg_rxpeer->lp_cpt, + msg->msg_private, msg->msg_len); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + lnet_msg_t *msg; + lnet_process_id_t id; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match "LPU64" offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); + } +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see lnet_event_t::hdr_data and lnet_event_kind_t. + */ +int +LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR( "Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +lnet_msg_t * +lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) +{ + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. + * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first */ + + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + lnet_process_id_t peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT (getmd->md_refcount > 0); + + if (msg == NULL) { + CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + if (getmd->md_threshold == 0) { + CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT (reply != NULL); + LASSERT (reply->msg_type == LNET_MSG_GET); + LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT (len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR( "Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. + */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni; + lnet_remotenet_t *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each (e, &the_lnet.ln_nis) { + ni = list_entry(e, lnet_ni_t, ni_list); + + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == dstnet) { + lnet_route_t *route; + lnet_route_t *shortest = NULL; + + LASSERT (!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + if (shortest == NULL || + route->lr_hops < shortest->lr_hops) + shortest = route; + } + + LASSERT (shortest != NULL); + hops = shortest->lr_hops; + if (srcnidp != NULL) + *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); + +/** + * Set the number of asynchronous messages expected from a target process. + * + * This function is only meaningful for userspace callers. It's a no-op when + * called from kernel. + * + * Asynchronous messages are those that can come from a target when the + * userspace process is not waiting for IO to complete; e.g., AST callbacks + * from Lustre servers. Specifying the expected number of such messages + * allows them to be eagerly received when user process is not running in + * LNet; otherwise network errors may occur. + * + * \param id Process ID of the target process. + * \param nasync Number of asynchronous messages expected from the target. + * + * \return 0 on success, and an error code otherwise. + */ +int +LNetSetAsync(lnet_process_id_t id, int nasync) +{ + return 0; +} +EXPORT_SYMBOL(LNetSetAsync); diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c new file mode 100644 index 000000000000..8f3a50bd5f69 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c @@ -0,0 +1,650 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +void +lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev) +{ + ENTRY; + + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); + EXIT; +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + ev->initiator.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(lnet_msg_t *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + lnet_counters_t *counters = the_lnet.ln_counters[cpt]; + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add(&msg->msg_activelist, &container->msc_active); + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; +} + +static void +lnet_msg_decommit_tx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + counters->route_length += msg->msg_len; + counters->route_count++; + goto out; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + counters->send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + counters->send_count++; + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto out; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + counters->send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + counters->recv_count++; + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + counters->recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* commited for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +void +lnet_msg_detach_md(lnet_msg_t *msg, int status) +{ + lnet_libmd_t *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static int +lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) +{ + lnet_handle_wire_t ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT (msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. + */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free_locked(msg); + return 0; +} + +void +lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; + + LASSERT (!in_interrupt ()); + + if (msg == NULL) + return; +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + msg->msg_ev.status = status; + + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, status); + lnet_res_unlock(cpt); + } + + again: + rc = 0; + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not commited to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be commited for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); + + /* Recursion breaker. Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } + + container->msc_finalizers[my_slot] = current; + + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + lnet_msg_t, msg_list); + + list_del(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + lnet_msg_t *msg = list_entry(container->msc_active.next, + lnet_msg_t, msg_activelist); + + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&container->msc_freelist); +#endif + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + +#ifdef LNET_USE_LIB_FREELIST + memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t)); + + rc = lnet_freelist_init(&container->msc_freelist, + LNET_FL_MAX_MSGS, sizeof(lnet_msg_t)); + if (rc != 0) { + CERROR("Failed to init freelist for message container\n"); + lnet_msg_container_cleanup(container); + return rc; + } +#else + rc = 0; +#endif + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c new file mode 100644 index 000000000000..9b9e7d3139b0 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c @@ -0,0 +1,938 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +CFS_MODULE_PARM(portal_rotor, "i", int, 0644, + "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(lnet_libmd_t *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + lnet_me_t *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match "LPU64 + " length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md "LPX64" [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? + ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, lnet_process_id_t id, + __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int nmaps; + int rotor; + int routed; + int cpt; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid); + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? */ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = cfs_hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + lnet_me_t *me; + lnet_me_t *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? */ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* steal buffer from other CPTs, and delay it if nothing to steal, + * this function is more expensive than a regular match, but we + * don't expect it can happen a lot */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { /* the first try, attach on stealing list */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { /* on stealing list */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) + list_del_init(&msg->msg_list); + + } else { + /* could be matched by lnet_ptl_attach_md() + * which is called by another thread */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + if (!list_empty(&msg->msg_list) && /* not matched yet */ + (i == LNET_CPT_NUMBER - 1 || /* the last CPT */ + ptl->ptl_mt_nmaps == 0 || /* no active CPT */ + (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */ + ptl->ptl_mt_maps[0] == cpt))) { + /* nothing to steal, delay or drop */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d " + "MB="LPX64"\n", libcfs_id2str(info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_shutdown) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + if (msg->msg_rx_delayed) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + lnet_msg_t *tmp; + lnet_msg_t *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + lnet_hdr_t *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match "LPU64" offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + lnet_me_t *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + lnet_me_t, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + struct lnet_portal *ptl; + LIST_HEAD (zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (the_lnet.ln_shutdown) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr"); + + return 0; +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c new file mode 100644 index 000000000000..670dae34107c --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lo.c @@ -0,0 +1,120 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + +int +lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + LASSERT (!lntmsg->msg_routing); + LASSERT (!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +int +lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + lnet_msg_t *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(ni, lntmsg, 0); + } + + lnet_finalize(ni, sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +void +lolnd_shutdown(lnet_ni_t *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT (lolnd_instanced); + + lolnd_instanced = 0; +} + +int +lolnd_startup (lnet_ni_t *ni) +{ + LASSERT (ni->ni_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +lnd_t the_lolnd = { + /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, + /* .lnd_refcount = */ 0, + /* .lnd_type = */ LOLND, + /* .lnd_startup = */ lolnd_startup, + /* .lnd_shutdown = */ lolnd_shutdown, + /* .lnt_ctl = */ NULL, + /* .lnd_send = */ lolnd_send, + /* .lnd_recv = */ lolnd_recv, + /* .lnd_eager_recv = */ NULL, + /* .lnd_notify = */ NULL, + /* .lnd_accept = */ NULL +}; diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c new file mode 100644 index 000000000000..c8323854580a --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/module.c @@ -0,0 +1,154 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + +static int config_on_load = 0; +CFS_MODULE_PARM(config_on_load, "i", int, 0444, + "configure network at module load"); + +static struct mutex lnet_config_mutex; + +int +lnet_configure (void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } + } + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return rc; +} + +int +lnet_unconfigure (void) +{ + int refcount; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + } + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return (refcount == 0) ? 0 : -EBUSY; +} + +int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: + return lnet_configure(NULL); + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, data); + LNetNIFini(); + } + return rc; + } +} + +DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); + +int +init_lnet(void) +{ + int rc; + ENTRY; + + mutex_init(&lnet_config_mutex); + + rc = LNetInit(); + if (rc != 0) { + CERROR("LNetInit: error %d\n", rc); + RETURN(rc); + } + + rc = libcfs_register_ioctl(&lnet_ioctl_handler); + LASSERT (rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void) kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + RETURN(0); +} + +void +fini_lnet(void) +{ + int rc; + + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + LASSERT (rc == 0); + + LNetFini(); +} + +MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>"); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +cfs_module(lnet, "1.0.0", init_lnet, fini_lnet); diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c new file mode 100644 index 000000000000..286977691393 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/peer.c @@ -0,0 +1,337 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include <linux/lnet/lib-lnet.h> + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + INIT_LIST_HEAD(&ptable->pt_deathrow); + + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (the_lnet.ln_peer_tables == NULL) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (hash == NULL) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_deathrow)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +void +lnet_peer_tables_cleanup(void) +{ + struct lnet_peer_table *ptable; + int i; + int j; + + LASSERT(the_lnet.ln_shutdown); /* i.e. no new peers */ + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) { + struct list_head *peers = &ptable->pt_hash[j]; + + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, + lp_hashlist); + list_del_init(&lp->lp_hashlist); + /* lose hash table's ref */ + lnet_peer_decref_locked(lp); + } + } + + lnet_net_unlock(i); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIST_HEAD (deathrow); + lnet_peer_t *lp; + + lnet_net_lock(i); + + for (j = 3; ptable->pt_number != 0; j++) { + lnet_net_unlock(i); + + if ((j & (j - 1)) == 0) { + CDEBUG(D_WARNING, + "Waiting for %d peers on peer table\n", + ptable->pt_number); + } + cfs_pause(cfs_time_seconds(1) / 2); + lnet_net_lock(i); + } + list_splice_init(&ptable->pt_deathrow, &deathrow); + + lnet_net_unlock(i); + + while (!list_empty(&deathrow)) { + lp = list_entry(deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + LIBCFS_FREE(lp, sizeof(*lp)); + } + } +} + +void +lnet_destroy_peer_locked(lnet_peer_t *lp) +{ + struct lnet_peer_table *ptable; + + LASSERT(lp->lp_refcount == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_txq)); + LASSERT(list_empty(&lp->lp_hashlist)); + LASSERT(lp->lp_txqnob == 0); + + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; + + lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt); + lp->lp_ni = NULL; + + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); +} + +lnet_peer_t * +lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + lnet_peer_t *lp; + + LASSERT(!the_lnet.ln_shutdown); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lp_hashlist) { + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) +{ + struct lnet_peer_table *ptable; + lnet_peer_t *lp = NULL; + lnet_peer_t *lp2; + int cpt2; + int rc = 0; + + *lpp = NULL; + if (the_lnet.ln_shutdown) /* it's shutting down */ + return -ESHUTDOWN; + + /* cpt can be LNET_LOCK_EX if it's called from router functions */ + cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); + + ptable = the_lnet.ln_peer_tables[cpt2]; + lp = lnet_find_peer_locked(ptable, nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + if (!list_empty(&ptable->pt_deathrow)) { + lp = list_entry(ptable->pt_deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + } + + /* + * take extra refcount in case another thread has shutdown LNet + * and destroyed locks and peer-table before I finish the allocation + */ + ptable->pt_number++; + lnet_net_unlock(cpt); + + if (lp != NULL) + memset(lp, 0, sizeof(*lp)); + else + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp)); + + if (lp == NULL) { + rc = -ENOMEM; + lnet_net_lock(cpt); + goto out; + } + + INIT_LIST_HEAD(&lp->lp_txq); + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + + lp->lp_notify = 0; + lp->lp_notifylnd = 0; + lp->lp_notifying = 0; + lp->lp_alive_count = 0; + lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ + lp->lp_last_alive = cfs_time_current(); /* assumes alive */ + lp->lp_last_query = 0; /* haven't asked NI yet */ + lp->lp_ping_timestamp = 0; + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; + lp->lp_nid = nid; + lp->lp_cpt = cpt2; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + lp->lp_rtr_refcount = 0; + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + lp2 = lnet_find_peer_locked(ptable, nid); + if (lp2 != NULL) { + *lpp = lp2; + goto out; + } + + lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2); + if (lp->lp_ni == NULL) { + rc = -EHOSTUNREACH; + goto out; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + lp->lp_rtrcredits = + lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni); + + list_add_tail(&lp->lp_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + *lpp = lp; + + return 0; +out: + if (lp != NULL) + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); + ptable->pt_number--; + return rc; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + lnet_peer_t *lp; + int rc; + int cpt; + + cpt = lnet_cpt_of_nid(nid); + lnet_net_lock(cpt); + + rc = lnet_nid2peer_locked(&lp, nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lp_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + aliveness, lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, + lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); +} diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c new file mode 100644 index 000000000000..a326ce06bc76 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/router.c @@ -0,0 +1,1694 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/lnet/lib-lnet.h> + +#if defined(LNET_ROUTER) + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) + +static char *forwarding = ""; +CFS_MODULE_PARM(forwarding, "s", charp, 0444, + "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444, + "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +CFS_MODULE_PARM(small_router_buffers, "i", int, 0444, + "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +CFS_MODULE_PARM(large_router_buffers, "i", int, 0444, + "# of large messages to buffer in the router"); +static int peer_buffer_credits = 0; +CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444, + "# router buffer credits per peer"); + +static int auto_down = 1; +CFS_MODULE_PARM(auto_down, "i", int, 0444, + "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + /* NI option overrides LNet default */ + if (ni->ni_peerrtrcredits > 0) + return ni->ni_peerrtrcredits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return ni->ni_peertxcredits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); +#else + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + return 0; +} + +#endif + +static int check_routers_before_use = 0; +CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, + "Assume routers are down and ping them before use"); + +static int avoid_asym_router_failure = 1; +CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644, + "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644, + "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644, + "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644, + "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when) +{ + if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + lp->lp_timestamp = when; /* update timestamp */ + lp->lp_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lp_alive_count != 0 && /* got old news */ + (!lp->lp_alive) == (!alive)) { /* new date for old news */ + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + lp->lp_alive_count++; + lp->lp_alive = !(!alive); /* 1 bit! */ + lp->lp_notify = 1; + lp->lp_notifylnd |= notifylnd; + if (lp->lp_alive) + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); +} + +void +lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + if (lp->lp_notifying) + return; + + lp->lp_notifying = 1; + + while (lp->lp_notify) { + alive = lp->lp_alive; + notifylnd = lp->lp_notifylnd; + + lp->lp_notifylnd = 0; + lp->lp_notify = 0; + + if (notifylnd && ni->ni_lnd->lnd_notify != NULL) { + lnet_net_unlock(lp->lp_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + + lnet_net_lock(lp->lp_cpt); + } + } + + lp->lp_notifying = 0; +} + + +static void +lnet_rtr_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, + lp_rtr_list); + + if (rtr->lp_nid < lp->lp_nid) + break; + } + + list_add(&lp->lp_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + if (lp->lp_rcd != NULL) { + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +lnet_remotenet_t * +lnet_find_net_locked (__u32 net) +{ + lnet_remotenet_t *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(!the_lnet.ln_shutdown); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded = 0; + int lnd_type, seed[2]; + struct timeval tv; + lnet_ni_t *ni; + struct list_head *tmp; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + seeded = 1; + return; +} + +/* NB expects LNET_LOCK held */ +void +lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each (e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each (e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) +{ + struct list_head *e; + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_ni_t *ni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + hops < 1 || hops > 255) + return (-EINVAL); + + if (lnet_islocalnet(net)) /* it's a local network */ + return 0; /* ignore the route entry */ + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); + if (rc != 0) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */ + return 0; /* ignore the route entry */ + } else { + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + } + return rc; + } + + LASSERT (!the_lnet.ln_shutdown); + + rnet2 = lnet_find_net_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each (e, &rnet2->lrn_routes) { + lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list); + + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT (route2->lr_gateway->lp_nid != gateway); + } + + if (add_route) { + lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = route->lr_gateway->lp_ni; + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_lnd->lnd_notify != NULL) + (ni->ni_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + if (!add_route) + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + return 0; +} + +int +lnet_check_routes(void) +{ + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, lnet_route_t, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lp_ni == + route2->lr_gateway->lp_ni) + continue; + + nid1 = route->lr_gateway->lp_nid; + nid2 = route2->lr_gateway->lp_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not " + "supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer *gateway; + lnet_remotenet_t *rnet; + lnet_route_t *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + + again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lp_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes (void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) +{ + struct list_head *e1; + struct list_head *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(lnet_ping_info_t *info) +{ + int i; + lnet_ni_status_t *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(lnet_rc_data_t *rcd) +{ + lnet_ping_info_t *info = rcd->rcd_pinginfo; + struct lnet_peer *gw = rcd->rcd_gateway; + lnet_route_t *rtr; + + if (!gw->lp_alive) + return; + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); + + /* NB always racing with network! */ + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lp_nid), info->pi_magic); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + gw->lp_ping_feats = info->pi_features; + if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); + return; /* nothing I can understand */ + } + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + return; /* can't carry NI status info */ + + list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) { + int ptl_status = LNET_NI_STATUS_INVALID; + int down = 0; + int up = 0; + int i; + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + lnet_ni_status_t *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lp_nid)); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND) + down++; + else if (ptl_status != LNET_NI_STATUS_UP) + ptl_status = LNET_NI_STATUS_DOWN; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rtr->lr_net) { + up = 1; + break; + } + /* ptl NIs are considered down only when + * they're all down */ + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_status = LNET_NI_STATUS_UP; + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lp_nid), stat->ns_status); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rtr->lr_downis = 0; + continue; + } + rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN); + } +} + +static void +lnet_router_checker_event(lnet_event_t *event) +{ + lnet_rc_data_t *rcd = event->md.user_ptr; + struct lnet_peer *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lp_cpt); + if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lp_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). */ + + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! */ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lp_cpt); +} + +void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + cfs_pause(cfs_time_seconds(1)); + } +} + +void +lnet_update_ni_status_locked(void) +{ + lnet_ni_t *ni; + long now; + int timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); + + now = cfs_time_current_sec(); + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + if (ni->ni_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +void +lnet_destroy_rc_data(lnet_rc_data_t *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lp_cpt; + + lnet_net_lock(cpt); + lnet_peer_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +lnet_rc_data_t * +lnet_create_rc_data_locked(lnet_peer_t *gateway) +{ + lnet_rc_data_t *rcd = NULL; + lnet_ping_info_t *pi; + int rc; + int i; + + lnet_net_unlock(gateway->lp_cpt); + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) + goto out; + + memset(pi, 0, LNET_PINGINFO_SIZE); + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + + LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((lnet_md_t){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out; + } + LASSERT(rc == 0); + + lnet_net_lock(gateway->lp_cpt); + /* router table changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) { + lnet_net_unlock(gateway->lp_cpt); + goto out; + } + + lnet_peer_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lp_rcd = rcd; + gateway->lp_ping_notsent = 0; + + return rcd; + + out: + if (rcd != NULL) { + if (!LNetHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } + lnet_destroy_rc_data(rcd); + } + + lnet_net_lock(gateway->lp_cpt); + return gateway->lp_rcd; +} + +static int +lnet_router_check_interval (lnet_peer_t *rtr) +{ + int secs; + + secs = rtr->lp_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked (lnet_peer_t *rtr) +{ + lnet_rc_data_t *rcd = NULL; + cfs_time_t now = cfs_time_current(); + int secs; + + lnet_peer_addref_locked(rtr); + + if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ + cfs_time_after(now, rtr->lp_ping_deadline)) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + lnet_ni_notify_locked(rtr->lp_ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_decref_locked(rtr); + return; + } + + rcd = rtr->lp_rcd != NULL ? + rtr->lp_rcd : lnet_create_rc_data_locked(rtr); + + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d " + "alive_count %d lp_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lp_nid), secs, + rtr->lp_ping_deadline, rtr->lp_ping_notsent, + rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); + + if (secs != 0 && !rtr->lp_ping_notsent && + cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + lnet_process_id_t id; + lnet_handle_md_t mdh; + + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lp_ping_notsent = 1; + rtr->lp_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lp_ping_deadline == 0) { + rtr->lp_ping_deadline = + cfs_time_shift(router_ping_timeout); + } + + lnet_net_unlock(rtr->lp_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + lnet_net_lock(rtr->lp_cpt); + if (rc != 0) + rtr->lp_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_decref_locked(rtr); + return; +} + +int +lnet_router_checker_start(void) +{ + int rc; + int eqsz; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be" + " set if 'check_routers_before_use' is set" + "\n"); + return -EINVAL; + } + + if (!the_lnet.ln_routing && + live_router_check_interval <= 0 && + dead_router_check_interval <= 0) + return 0; + + sema_init(&the_lnet.ln_rc_signal, 0); + /* EQ size doesn't matter; the callback is guaranteed to get every + * event */ + eqsz = 0; + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + return -ENOMEM; + } + + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + rc = PTR_ERR(kthread_run(lnet_router_checker, + NULL, "router_checker")); + if (IS_ERR_VALUE(rc)) { + CERROR("Can't start router checker thread: %d\n", rc); + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. */ + lnet_wait_known_routerstate(); + } + + return 0; +} + +void +lnet_router_checker_stop (void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); + return; +} + +static void +lnet_prune_rc_data(int wait_unlink) +{ + lnet_rc_data_t *rcd; + lnet_rc_data_t *tmp; + lnet_peer_t *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lp_rtr_list) { + if (lp->lp_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lp_rcd->rcd_list)); + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + lnet_rc_data_t, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + cfs_pause(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + + +#if defined(LNET_ROUTER) + +static int +lnet_router_checker(void *arg) +{ + lnet_peer_t *rtr; + struct list_head *entry; + + cfs_block_allsigs(); + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ + + /* Call cfs_pause() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING); + + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; +} + +void +lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) +{ + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = alloc_pages_node( + cfs_cpt_spread_node(lnet_cpt_table(), cpt), + __GFP_ZERO | GFP_IOFS, 0); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +void +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + LASSERT (list_empty(&rbp->rbp_msgs)); + LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT (rbp->rbp_credits > 0); + + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + nbuffers++; + } + + LASSERT (rbp->rbp_nbuffers == nbuffers); + LASSERT (rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} + +int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt) +{ + lnet_rtrbuf_t *rb; + int i; + + if (rbp->rbp_nbuffers != 0) { + LASSERT (rbp->rbp_nbuffers == nbufs); + return 0; + } + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp, cpt); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + rbp->rbp_credits++; + rbp->rbp_mincredits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* No allocation "under fire" */ + /* Otherwise we'd need code to schedule blocked msgs etc */ + LASSERT (!the_lnet.ln_routing); + } + + LASSERT (rbp->rbp_credits == nbufs); + return 0; +} + +void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(void) +{ + lnet_rtrbufpool_t *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[0]); + lnet_rtrpool_free_bufs(&rtrp[1]); + lnet_rtrpool_free_bufs(&rtrp[2]); + } + + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; +} + +static int +lnet_nrb_tiny_calculate(int npages) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -1; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(int npages) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -1; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(int npages) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -1; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + lnet_rtrbufpool_t *rtrp; + int large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int small_pages = 1; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either " + "'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(0); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(small_pages); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(large_pages); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(lnet_rtrbufpool_t)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[0], 0); + rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[1], small_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[2], large_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; + + failed: + lnet_rtrpools_free(); + return rc; +} + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) +{ + struct lnet_peer *lp = NULL; + cfs_time_t now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN ("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (cfs_time_after(when, now)) { + CWARN ("Ignoring prediction from %s of %s %s " + "%ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + lnet_ni_notify_locked(ni, lp); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); + +void +lnet_get_tunables (void) +{ + return; +} + +#else + +int +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) +{ + return -EOPNOTSUPP; +} + +void +lnet_router_checker (void) +{ + static time_t last = 0; + static int running = 0; + + time_t now = cfs_time_current_sec(); + int interval = now - last; + int rc; + __u64 version; + lnet_peer_t *rtr; + + /* It's no use to call me again within a sec - all intervals and + * timeouts are measured in seconds */ + if (last != 0 && interval < 2) + return; + + if (last != 0 && + interval > MAX(live_router_check_interval, + dead_router_check_interval)) + CNETERR("Checker(%d/%d) not called for %d seconds\n", + live_router_check_interval, dead_router_check_interval, + interval); + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + LASSERT(!running); /* recursion check */ + running = 1; + lnet_net_unlock(0); + + last = now; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) + lnet_prune_rc_data(0); /* unlink all rcd and nowait */ + + /* consume all pending events */ + while (1) { + int i; + lnet_event_t ev; + + /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the + * recursion breaker in LNetEQPoll would fail */ + rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i); + if (rc == 0) /* no event pending */ + break; + + /* NB a lost SENT prevents me from pinging a router again */ + if (rc == -EOVERFLOW) { + CERROR("Dropped an event!!!\n"); + abort(); + } + + LASSERT (rc == 1); + + lnet_router_checker_event(&ev); + } + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) { + lnet_prune_rc_data(1); /* release rcd */ + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + running = 0; + return; + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + lnet_net_lock(0); + + version = the_lnet.ln_routers_version; + list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ping_router_locked(rtr); + LASSERT (version == the_lnet.ln_routers_version); + } + + lnet_net_unlock(0); + + running = 0; /* lock only needed for the recursion check */ + return; +} + +/* NB lnet_peers_start_down depends on me, + * so must be called before any peer creation */ +void +lnet_get_tunables (void) +{ + char *s; + + s = getenv("LNET_ROUTER_PING_TIMEOUT"); + if (s != NULL) router_ping_timeout = atoi(s); + + s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL"); + if (s != NULL) live_router_check_interval = atoi(s); + + s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL"); + if (s != NULL) dead_router_check_interval = atoi(s); + + /* This replaces old lnd_notify mechanism */ + check_routers_before_use = 1; + if (dead_router_check_interval <= 0) + dead_router_check_interval = 30; +} + +void +lnet_rtrpools_free(void) +{ +} + +int +lnet_rtrpools_alloc(int im_a_arouter) +{ + return 0; +} + +#endif diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c new file mode 100644 index 000000000000..3084b0c75983 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/router_proc.c @@ -0,0 +1,950 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include <linux/libcfs/libcfs.h> +#include <linux/lnet/lib-lnet.h> + +#if defined(LNET_ROUTER) + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ + +static ctl_table_header_t *lnet_table_header = NULL; + +#define CTL_LNET (0x100) +enum { + PSDEV_LNET_STATS = 100, + PSDEV_LNET_ROUTES, + PSDEV_LNET_ROUTERS, + PSDEV_LNET_PEERS, + PSDEV_LNET_BUFFERS, + PSDEV_LNET_NIS, + PSDEV_LNET_PTL_ROTOR, +}; + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int __proc_lnet_stats(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + int rc; + lnet_counters_t *ctrs; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 LPU64 */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u "LPU64" "LPU64" " + LPU64" "LPU64, + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +DECLARE_PROC_HANDLER(proc_lnet_stats); + +int LL_PROC_PROTO(proc_lnet_routes) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + DECLARE_LL_PROC_PPOS_DECL; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT (tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n", + "net", "hops", "state", "router"); + LASSERT (tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + lnet_route_t *route = NULL; + lnet_remotenet_t *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, lnet_remotenet_t, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + unsigned int hops = route->lr_hops; + lnet_nid_t nid = route->lr_gateway->lp_nid; + int alive = route->lr_gateway->lp_alive; + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4u %7s %s\n", + libcfs_net2str(net), hops, + alive ? "up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +int LL_PROC_PROTO(proc_lnet_routers) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + DECLARE_LL_PROC_PPOS_DECL; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + lnet_peer_t *lp = list_entry(r, lnet_peer_t, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + cfs_time_t now = cfs_time_current(); + cfs_time_t deadline = peer->lp_ping_deadline; + int nrefs = peer->lp_refcount; + int nrtrrefs = peer->lp_rtr_refcount; + int alive_cnt = peer->lp_alive_count; + int alive = peer->lp_alive; + int pingsent = !peer->lp_ping_notsent; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lp_ping_timestamp)); + int down_ni = 0; + lnet_route_t *rtr; + + if ((peer->lp_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lp_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, + cfs_duration_sec(cfs_time_sub(deadline, now)), + down_ni, libcfs_nid2str(nid)); + LASSERT (tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +int LL_PROC_PROTO(proc_lnet_peers) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + LASSERT(!write); + + if (*lenp == 0) + return 0; + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT (tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer *peer; + struct list_head *p; + int skip; + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lp_hashlist */ + if (lp->lp_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lp_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + int nrefs = peer->lp_refcount; + int lastalive = -1; + char *aliveness = "NA"; + int maxcr = peer->lp_ni->ni_peertxcredits; + int txcr = peer->lp_txcredits; + int mintxcr = peer->lp_mintxcredits; + int rtrcr = peer->lp_rtrcredits; + int minrtrcr = peer->lp_minrtrcredits; + int txqnob = peer->lp_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lp_alive ? "up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; + + delta = cfs_time_sub(now, peer->lp_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT (tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + lnet_rtrbufpool_t *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +DECLARE_PROC_HANDLER(proc_lnet_buffers); + +int LL_PROC_PROTO(proc_lnet_nis) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + DECLARE_LL_PROC_PPOS_DECL; + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct list_head *n; + lnet_ni_t *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list); + + if (skip == 0) { + ni = a_ni; + break; + } + + skip--; + n = n->next; + } + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + long now = cfs_time_current_sec(); + int last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_peertxcredits, + ni->ni_peerrtrcredits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +extern int portal_rotor; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + const int buf_len = 128; + char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + LIBCFS_FREE(buf, buf_len); + return rc; +} +DECLARE_PROC_HANDLER(proc_lnet_portal_rotor); + +static ctl_table_t lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME(PSDEV_LNET_STATS) + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + INIT_CTL_NAME(PSDEV_LNET_ROUTES) + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + INIT_CTL_NAME(PSDEV_LNET_ROUTERS) + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PEERS) + .procname = "peers", + .mode = 0444, + .proc_handler = &proc_lnet_peers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PEERS) + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_NIS) + .procname = "nis", + .mode = 0444, + .proc_handler = &proc_lnet_nis, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR) + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + INIT_CTL_NAME(0) + } +}; + +static ctl_table_t top_table[] = { + { + INIT_CTL_NAME(CTL_LNET) + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + INIT_CTL_NAME(0) + } +}; + +void +lnet_proc_init(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = cfs_register_sysctl_table(top_table, 0); +#endif +} + +void +lnet_proc_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif +} + +#else + +void +lnet_proc_init(void) +{ +} + +void +lnet_proc_fini(void) +{ +} + +#endif |