aboutsummaryrefslogtreecommitdiffstats
path: root/net/smc
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/smc/Kconfig4
-rw-r--r--net/smc/Makefile5
-rw-r--r--net/smc/af_smc.c2147
-rw-r--r--net/smc/smc.h113
-rw-r--r--net/smc/smc_cdc.c180
-rw-r--r--net/smc/smc_cdc.h23
-rw-r--r--net/smc/smc_clc.c964
-rw-r--r--net/smc/smc_clc.h325
-rw-r--r--net/smc/smc_close.c41
-rw-r--r--net/smc/smc_core.c2071
-rw-r--r--net/smc/smc_core.h348
-rw-r--r--net/smc/smc_diag.c74
-rw-r--r--net/smc/smc_ib.c590
-rw-r--r--net/smc/smc_ib.h47
-rw-r--r--net/smc/smc_ism.c193
-rw-r--r--net/smc/smc_ism.h31
-rw-r--r--net/smc/smc_llc.c2163
-rw-r--r--net/smc/smc_llc.h80
-rw-r--r--net/smc/smc_netlink.c157
-rw-r--r--net/smc/smc_netlink.h34
-rw-r--r--net/smc/smc_netns.h1
-rw-r--r--net/smc/smc_pnet.c857
-rw-r--r--net/smc/smc_pnet.h24
-rw-r--r--net/smc/smc_rx.c115
-rw-r--r--net/smc/smc_stats.c413
-rw-r--r--net/smc/smc_stats.h266
-rw-r--r--net/smc/smc_sysctl.c111
-rw-r--r--net/smc/smc_sysctl.h33
-rw-r--r--net/smc/smc_tracepoint.c9
-rw-r--r--net/smc/smc_tracepoint.h125
-rw-r--r--net/smc/smc_tx.c248
-rw-r--r--net/smc/smc_tx.h3
-rw-r--r--net/smc/smc_wr.c387
-rw-r--r--net/smc/smc_wr.h35
34 files changed, 10139 insertions, 2078 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index f54a70b8da82..1ab3c5a2c5ad 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -2,7 +2,7 @@
config SMC
tristate "SMC socket protocol family"
depends on INET && INFINIBAND
- ---help---
+ help
SMC-R provides a "sockets over RDMA" solution making use of
RDMA over Converged Ethernet (RoCE) technology to upgrade
AF_INET TCP connections transparently.
@@ -14,7 +14,7 @@ config SMC
config SMC_DIAG
tristate "SMC: socket monitoring interface"
depends on SMC
- ---help---
+ help
Support for SMC socket monitoring interface used by tools such as
smcss.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index cb1254541f37..875efcd126a2 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+ccflags-y += -I$(src)
obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
+smc-y += smc_tracepoint.o
+smc-$(CONFIG_SYSCTL) += smc_sysctl.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6fd44bdb0fc3..e12d4fa5aece 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -26,6 +26,7 @@
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
#include <linux/rcupdate_wait.h>
+#include <linux/ctype.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -44,9 +45,13 @@
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
+#include "smc_netlink.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
+#include "smc_sysctl.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@@ -55,9 +60,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
+static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
+struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+struct workqueue_struct *smc_close_wq; /* wq for close work */
+
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ void *hdr;
+
+ if (cb_ctx->pos[0])
+ goto out;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_HS_LIMITATION);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
+ sock_net(skb->sk)->smc.limit_smc_hs))
+ goto err;
+
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0] = 1;
+out:
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = true;
+ return 0;
+}
+
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
+{
+ sock_net(skb->sk)->smc.limit_smc_hs = false;
+ return 0;
+}
+
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
@@ -65,6 +113,61 @@ static void smc_set_keepalive(struct sock *sk, int val)
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
+static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct smc_sock *smc;
+ struct sock *child;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
+ sk->sk_max_ack_backlog)
+ goto drop;
+
+ if (sk_acceptq_is_full(&smc->sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ /* passthrough to original syn recv sock fct */
+ child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
+ own_req);
+ /* child must not inherit smc or its ops */
+ if (child) {
+ rcu_assign_sk_user_data(child, NULL);
+
+ /* v4-mapped sockets don't inherit parent ops. Don't restore. */
+ if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
+ inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
+ }
+ return child;
+
+drop:
+ dst_release(dst);
+ tcp_listendrop(sk);
+ return NULL;
+}
+
+static bool smc_hs_congested(const struct sock *sk)
+{
+ const struct smc_sock *smc;
+
+ smc = smc_clcsock_user_data(sk);
+
+ if (!smc)
+ return true;
+
+ if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+ return true;
+
+ return false;
+}
+
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
@@ -82,8 +185,8 @@ int smc_hash_sk(struct sock *sk)
write_lock_bh(&h->lock);
sk_add_node(sk, head);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -100,12 +203,27 @@ void smc_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
+/* This will be called before user really release sock_lock. So do the
+ * work which we didn't do because of user hold the sock_lock in the
+ * BH context
+ */
+static void smc_release_cb(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ if (smc->conn.tx_in_release_sock) {
+ smc_tx_pending(&smc->conn);
+ smc->conn.tx_in_release_sock = false;
+ }
+}
+
struct proto smc_proto = {
.name = "SMC",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -118,16 +236,35 @@ struct proto smc_proto6 = {
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
+ .release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v6_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
+static void smc_fback_restore_callbacks(struct smc_sock *smc)
+{
+ struct sock *clcsk = smc->clcsock->sk;
+
+ write_lock_bh(&clcsk->sk_callback_lock);
+ clcsk->sk_user_data = NULL;
+
+ smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change);
+ smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready);
+ smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space);
+ smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report);
+
+ write_unlock_bh(&clcsk->sk_callback_lock);
+}
+
static void smc_restore_fallback_changes(struct smc_sock *smc)
{
- smc->clcsock->file->private_data = smc->sk.sk_socket;
- smc->clcsock->file = NULL;
+ if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
+ smc->clcsock->file->private_data = smc->sk.sk_socket;
+ smc->clcsock->file = NULL;
+ smc_fback_restore_callbacks(smc);
+ }
}
static int __smc_release(struct smc_sock *smc)
@@ -140,14 +277,18 @@ static int __smc_release(struct smc_sock *smc)
sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
} else {
- if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
- sock_put(sk); /* passive closing */
- if (sk->sk_state == SMC_LISTEN) {
- /* wake up clcsock accept */
- rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ if (sk->sk_state != SMC_CLOSED) {
+ if (sk->sk_state != SMC_LISTEN &&
+ sk->sk_state != SMC_INIT)
+ sock_put(sk); /* passive closing */
+ if (sk->sk_state == SMC_LISTEN) {
+ /* wake up clcsock accept */
+ rc = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ }
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
}
- sk->sk_state = SMC_CLOSED;
- sk->sk_state_change(sk);
smc_restore_fallback_changes(smc);
}
@@ -170,7 +311,7 @@ static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
- int rc = 0;
+ int old_state, rc = 0;
if (!sk)
goto out;
@@ -178,10 +319,14 @@ static int smc_release(struct socket *sock)
sock_hold(sk); /* sock_put below */
smc = smc_sk(sk);
+ old_state = sk->sk_state;
+
/* cleanup for a dangling non-blocking connect */
- if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
+ if (smc->connect_nonblock && old_state == SMC_INIT)
tcp_abort(smc->clcsock->sk, ECONNABORTED);
- flush_work(&smc->connect_work);
+
+ if (cancel_work_sync(&smc->connect_work))
+ sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
if (sk->sk_state == SMC_LISTEN)
/* smc_close_non_accepted() is called and acquires
@@ -191,6 +336,10 @@ static int smc_release(struct socket *sock)
else
lock_sock(sk);
+ if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
+ !smc->use_fallback)
+ smc_close_active_abort(smc);
+
rc = __smc_release(smc);
/* detach socket */
@@ -230,6 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct;
sk->sk_protocol = protocol;
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem));
smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_WORK(&smc->connect_work, smc_connect_work);
@@ -240,6 +391,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
sk->sk_prot->hash(sk);
sk_refcnt_debug_inc(sk);
mutex_init(&smc->clcsock_release_lock);
+ smc_init_saved_callbacks(smc);
return sk;
}
@@ -277,6 +429,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
goto out_rel;
smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+ smc->clcsock->sk->sk_reuseport = sk->sk_reuseport;
rc = kernel_bind(smc->clcsock, uaddr, addr_len);
out_rel:
@@ -337,50 +490,84 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
-/* register a new rmb, send confirm_rkey msg to register with peer */
-static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
- bool conf_rkey)
+/* register the new vzalloced sndbuf on all links */
+static int smcr_lgr_reg_sndbufs(struct smc_link *link,
+ struct smc_buf_desc *snd_desc)
{
- if (!rmb_desc->wr_reg) {
- /* register memory region for new rmb */
- if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
- rmb_desc->regerr = 1;
- return -EFAULT;
- }
- rmb_desc->wr_reg = 1;
+ struct smc_link_group *lgr = link->lgr;
+ int i, rc = 0;
+
+ if (!snd_desc->is_vm)
+ return -EINVAL;
+
+ /* protect against parallel smcr_link_reg_buf() */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc);
+ if (rc)
+ break;
}
- if (!conf_rkey)
- return 0;
+ mutex_unlock(&lgr->llc_conf_mutex);
+ return rc;
+}
+
+/* register the new rmb on all links */
+static int smcr_lgr_reg_rmbs(struct smc_link *link,
+ struct smc_buf_desc *rmb_desc)
+{
+ struct smc_link_group *lgr = link->lgr;
+ int i, rc = 0;
+
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (rc)
+ return rc;
+ /* protect against parallel smc_llc_cli_rkey_exchange() and
+ * parallel smcr_link_reg_buf()
+ */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
+ if (rc)
+ goto out;
+ }
+
/* exchange confirm_rkey msg with peer */
- if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
- rmb_desc->regerr = 1;
- return -EFAULT;
+ rc = smc_llc_do_confirm_rkey(link, rmb_desc);
+ if (rc) {
+ rc = -EFAULT;
+ goto out;
}
- return 0;
+ rmb_desc->is_conf_rkey = true;
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ return rc;
}
-static int smc_clnt_conf_first_link(struct smc_sock *smc)
+static int smcr_clnt_conf_first_link(struct smc_sock *smc)
{
- struct net *net = sock_net(smc->clcsock->sk);
- struct smc_link_group *lgr = smc->conn.lgr;
- struct smc_link *link;
- int rest;
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
int rc;
- link = &lgr->lnk[SMC_SINGLE_LINK];
/* receive CONFIRM LINK request from server over RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(
- &link->llc_confirm,
- SMC_LLC_WAIT_FIRST_TIME);
- if (rest <= 0) {
+ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
-
- if (link->llc_confirm_rc)
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
return SMC_CLC_DECL_RMBE_EC;
rc = smc_ib_modify_qp_rts(link);
@@ -389,44 +576,92 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
smc_wr_remember_qp_attr(link);
- if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_ERR_REGRMB;
+ /* reg the sndbuf if it was vzalloced */
+ if (smc->conn.sndbuf_desc->is_vm) {
+ if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+ }
+
+ /* reg the rmb */
+ if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
/* send CONFIRM LINK response over RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
if (rc < 0)
return SMC_CLC_DECL_TIMEOUT_CL;
- /* receive ADD LINK request from server over RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(&link->llc_add,
- SMC_LLC_WAIT_TIME);
- if (rest <= 0) {
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+
+ /* optional 2nd link, receive ADD LINK request from server */
+ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
- return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
+ if (rc == -EAGAIN)
+ rc = 0; /* no DECLINE received, go with one link */
+ return rc;
}
+ smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
+ smc_llc_cli_add_link(link, qentry);
+ return 0;
+}
- /* send add link reject message, only one link supported for now */
- rc = smc_llc_send_add_link(link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_RESP);
- if (rc < 0)
- return SMC_CLC_DECL_TIMEOUT_AL;
+static bool smc_isascii(char *hostname)
+{
+ int i;
- smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
+ for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+ if (!isascii(hostname[i]))
+ return false;
+ return true;
+}
- return 0;
+static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)clc;
+ struct smc_clc_first_contact_ext *fce;
+ int clc_v2_len;
+
+ if (clc->hdr.version == SMC_V1 ||
+ !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
+ return;
+
+ if (smc->conn.lgr->is_smcd) {
+ memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
+ SMC_MAX_EID_LEN);
+ clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+ d1);
+ } else {
+ memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
+ SMC_MAX_EID_LEN);
+ clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+ r1);
+ }
+ fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
+ smc->conn.lgr->peer_os = fce->os_type;
+ smc->conn.lgr->peer_smc_release = fce->release;
+ if (smc_isascii(fce->hostname))
+ memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+ SMC_MAX_HOSTNAME_LEN);
}
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
- smc->conn.peer_rmbe_idx = clc->rmbe_idx;
- smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
@@ -435,10 +670,10 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc,
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
- int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+ int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
- smc->conn.peer_rmbe_idx = clc->dmbe_idx;
- smc->conn.peer_token = clc->token;
+ smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
+ smc->conn.peer_token = clc->d0.token;
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
@@ -452,34 +687,222 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
smcd_conn_save_peer_info(smc, clc);
else
smcr_conn_save_peer_info(smc, clc);
+ smc_conn_save_peer_info_fce(smc, clc);
}
static void smc_link_save_peer_info(struct smc_link *link,
- struct smc_clc_msg_accept_confirm *clc)
+ struct smc_clc_msg_accept_confirm *clc,
+ struct smc_init_info *ini)
+{
+ link->peer_qpn = ntoh24(clc->r0.qpn);
+ memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->r0.psn);
+ link->peer_mtu = clc->r0.qp_mtu;
+}
+
+static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
+ struct smc_stats_fback *fback_arr)
+{
+ int cnt;
+
+ for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
+ if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
+ fback_arr[cnt].count++;
+ break;
+ }
+ if (!fback_arr[cnt].fback_code) {
+ fback_arr[cnt].fback_code = smc->fallback_rsn;
+ fback_arr[cnt].count++;
+ break;
+ }
+ }
+}
+
+static void smc_stat_fallback(struct smc_sock *smc)
+{
+ struct net *net = sock_net(&smc->sk);
+
+ mutex_lock(&net->smc.mutex_fback_rsn);
+ if (smc->listen_smc) {
+ smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
+ net->smc.fback_rsn->srv_fback_cnt++;
+ } else {
+ smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
+ net->smc.fback_rsn->clnt_fback_cnt++;
+ }
+ mutex_unlock(&net->smc.mutex_fback_rsn);
+}
+
+/* must be called under rcu read lock */
+static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
{
- link->peer_qpn = ntoh24(clc->qpn);
- memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
- memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
- link->peer_psn = ntoh24(clc->psn);
- link->peer_mtu = clc->qp_mtu;
+ struct socket_wq *wq;
+ __poll_t flags;
+
+ wq = rcu_dereference(smc->sk.sk_wq);
+ if (!skwq_has_sleeper(wq))
+ return;
+
+ /* wake up smc sk->sk_wq */
+ if (!key) {
+ /* sk_state_change */
+ wake_up_interruptible_all(&wq->wait);
+ } else {
+ flags = key_to_poll(key);
+ if (flags & (EPOLLIN | EPOLLOUT))
+ /* sk_data_ready or sk_write_space */
+ wake_up_interruptible_sync_poll(&wq->wait, flags);
+ else if (flags & EPOLLERR)
+ /* sk_error_report */
+ wake_up_interruptible_poll(&wq->wait, flags);
+ }
}
-static void smc_switch_to_fallback(struct smc_sock *smc)
+static int smc_fback_mark_woken(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *key)
{
+ struct smc_mark_woken *mark =
+ container_of(wait, struct smc_mark_woken, wait_entry);
+
+ mark->woken = true;
+ mark->key = key;
+ return 0;
+}
+
+static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
+ void (*clcsock_callback)(struct sock *sk))
+{
+ struct smc_mark_woken mark = { .woken = false };
+ struct socket_wq *wq;
+
+ init_waitqueue_func_entry(&mark.wait_entry,
+ smc_fback_mark_woken);
+ rcu_read_lock();
+ wq = rcu_dereference(clcsk->sk_wq);
+ if (!wq)
+ goto out;
+ add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
+ clcsock_callback(clcsk);
+ remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
+
+ if (mark.woken)
+ smc_fback_wakeup_waitqueue(smc, mark.key);
+out:
+ rcu_read_unlock();
+}
+
+static void smc_fback_state_change(struct sock *clcsk)
+{
+ struct smc_sock *smc;
+
+ read_lock_bh(&clcsk->sk_callback_lock);
+ smc = smc_clcsock_user_data(clcsk);
+ if (smc)
+ smc_fback_forward_wakeup(smc, clcsk,
+ smc->clcsk_state_change);
+ read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+static void smc_fback_data_ready(struct sock *clcsk)
+{
+ struct smc_sock *smc;
+
+ read_lock_bh(&clcsk->sk_callback_lock);
+ smc = smc_clcsock_user_data(clcsk);
+ if (smc)
+ smc_fback_forward_wakeup(smc, clcsk,
+ smc->clcsk_data_ready);
+ read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+static void smc_fback_write_space(struct sock *clcsk)
+{
+ struct smc_sock *smc;
+
+ read_lock_bh(&clcsk->sk_callback_lock);
+ smc = smc_clcsock_user_data(clcsk);
+ if (smc)
+ smc_fback_forward_wakeup(smc, clcsk,
+ smc->clcsk_write_space);
+ read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+static void smc_fback_error_report(struct sock *clcsk)
+{
+ struct smc_sock *smc;
+
+ read_lock_bh(&clcsk->sk_callback_lock);
+ smc = smc_clcsock_user_data(clcsk);
+ if (smc)
+ smc_fback_forward_wakeup(smc, clcsk,
+ smc->clcsk_error_report);
+ read_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+static void smc_fback_replace_callbacks(struct smc_sock *smc)
+{
+ struct sock *clcsk = smc->clcsock->sk;
+
+ write_lock_bh(&clcsk->sk_callback_lock);
+ clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+
+ smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
+ &smc->clcsk_state_change);
+ smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready,
+ &smc->clcsk_data_ready);
+ smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space,
+ &smc->clcsk_write_space);
+ smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report,
+ &smc->clcsk_error_report);
+
+ write_unlock_bh(&clcsk->sk_callback_lock);
+}
+
+static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
+{
+ int rc = 0;
+
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ rc = -EBADF;
+ goto out;
+ }
+
smc->use_fallback = true;
+ smc->fallback_rsn = reason_code;
+ smc_stat_fallback(smc);
+ trace_smc_switch_to_fallback(smc, reason_code);
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
smc->clcsock->file = smc->sk.sk_socket->file;
smc->clcsock->file->private_data = smc->clcsock;
smc->clcsock->wq.fasync_list =
smc->sk.sk_socket->wq.fasync_list;
+
+ /* There might be some wait entries remaining
+ * in smc sk->sk_wq and they should be woken up
+ * as clcsock's wait queue is woken up.
+ */
+ smc_fback_replace_callbacks(smc);
}
+out:
+ mutex_unlock(&smc->clcsock_release_lock);
+ return rc;
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
- smc_switch_to_fallback(smc);
- smc->fallback_rsn = reason_code;
+ struct net *net = sock_net(&smc->sk);
+ int rc = 0;
+
+ rc = smc_switch_to_fallback(smc, reason_code);
+ if (rc) { /* fallback fails */
+ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
+ return rc;
+ }
smc_copy_sock_settings_to_clc(smc);
smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
@@ -488,18 +911,22 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
}
/* decline and fall back during connect */
-static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
+ u8 version)
{
+ struct net *net = sock_net(&smc->sk);
int rc;
if (reason_code < 0) { /* error, fallback is not possible */
+ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return reason_code;
}
if (reason_code != SMC_CLC_DECL_PEERDECL) {
- rc = smc_clc_send_decline(smc, reason_code);
+ rc = smc_clc_send_decline(smc, reason_code, version);
if (rc < 0) {
+ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return rc;
@@ -508,24 +935,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
return smc_connect_fallback(smc, reason_code);
}
-/* abort connecting */
-static int smc_connect_abort(struct smc_sock *smc, int reason_code,
- int local_contact)
+static void smc_conn_abort(struct smc_sock *smc, int local_first)
{
- bool is_smcd = smc->conn.lgr->is_smcd;
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ bool lgr_valid = false;
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&smc->conn);
- else
- smc_conn_free(&smc->conn);
- if (is_smcd)
- /* there is only one lgr role for SMC-D; use server lock */
- mutex_unlock(&smc_server_lgr_pending);
- else
- mutex_unlock(&smc_client_lgr_pending);
+ if (smc_conn_lgr_valid(conn))
+ lgr_valid = true;
- smc->connect_nonblock = 0;
- return reason_code;
+ smc_conn_free(conn);
+ if (local_first && lgr_valid)
+ smc_lgr_cleanup_early(lgr);
}
/* check if there is a rdma device available for this connection. */
@@ -537,7 +958,9 @@ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
* used for the internal TCP socket
*/
smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
- if (!ini->ib_dev)
+ if (!ini->check_smcrv2 && !ini->ib_dev)
+ return SMC_CLC_DECL_NOSMCRDEV;
+ if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
return SMC_CLC_DECL_NOSMCRDEV;
return 0;
}
@@ -548,47 +971,210 @@ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
- if (!ini->ism_dev)
+ if (!ini->ism_dev[0])
return SMC_CLC_DECL_NOSMCDDEV;
+ else
+ ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
return 0;
}
+/* is chid unique for the ism devices that are already determined? */
+static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
+ int cnt)
+{
+ int i = (!ini->ism_dev[0]) ? 1 : 0;
+
+ for (; i < cnt; i++)
+ if (ini->ism_chid[i] == chid)
+ return false;
+ return true;
+}
+
+/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
+ * PNETID matching net_device)
+ */
+static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = SMC_CLC_DECL_NOSMCDDEV;
+ struct smcd_dev *smcd;
+ int i = 1;
+ u16 chid;
+
+ if (smcd_indicated(ini->smc_type_v1))
+ rc = 0; /* already initialized for V1 */
+ mutex_lock(&smcd_dev_list.mutex);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away || smcd == ini->ism_dev[0])
+ continue;
+ chid = smc_ism_get_chid(smcd);
+ if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
+ continue;
+ if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
+ smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
+ ini->ism_dev[i] = smcd;
+ ini->ism_chid[i] = chid;
+ ini->is_smcd = true;
+ rc = 0;
+ i++;
+ if (i > SMC_MAX_ISM_DEVS)
+ break;
+ }
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+ ini->ism_offered_cnt = i - 1;
+ if (!ini->ism_dev[0] && !ini->ism_dev[1])
+ ini->smcd_version = 0;
+
+ return rc;
+}
+
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
struct smc_init_info *ini)
{
- if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
+ if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_ISMVLANERR;
return 0;
}
+static int smc_find_proposal_devices(struct smc_sock *smc,
+ struct smc_init_info *ini)
+{
+ int rc = 0;
+
+ /* check if there is an ism device available */
+ if (!(ini->smcd_version & SMC_V1) ||
+ smc_find_ism_device(smc, ini) ||
+ smc_connect_ism_vlan_setup(smc, ini))
+ ini->smcd_version &= ~SMC_V1;
+ /* else ISM V1 is supported for this connection */
+
+ /* check if there is an rdma device available */
+ if (!(ini->smcr_version & SMC_V1) ||
+ smc_find_rdma_device(smc, ini))
+ ini->smcr_version &= ~SMC_V1;
+ /* else RDMA is supported for this connection */
+
+ ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
+ ini->smcr_version & SMC_V1);
+
+ /* check if there is an ism v2 device available */
+ if (!(ini->smcd_version & SMC_V2) ||
+ !smc_ism_is_v2_capable() ||
+ smc_find_ism_v2_device_clnt(smc, ini))
+ ini->smcd_version &= ~SMC_V2;
+
+ /* check if there is an rdma v2 device available */
+ ini->check_smcrv2 = true;
+ ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
+ if (!(ini->smcr_version & SMC_V2) ||
+ smc->clcsock->sk->sk_family != AF_INET ||
+ !smc_clc_ueid_count() ||
+ smc_find_rdma_device(smc, ini))
+ ini->smcr_version &= ~SMC_V2;
+ ini->check_smcrv2 = false;
+
+ ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
+ ini->smcr_version & SMC_V2);
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+ rc = SMC_CLC_DECL_NOSMCDEV;
+
+ return rc;
+}
+
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
-static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
struct smc_init_info *ini)
{
- if (!is_smcd)
+ if (!smcd_indicated(ini->smc_type_v1))
return 0;
- if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
+ if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
+#define SMC_CLC_MAX_ACCEPT_LEN \
+ (sizeof(struct smc_clc_msg_accept_confirm_v2) + \
+ sizeof(struct smc_clc_first_contact_ext) + \
+ sizeof(struct smc_clc_msg_trail))
+
/* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc, int smc_type,
- struct smc_clc_msg_accept_confirm *aclc,
+static int smc_connect_clc(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *aclc2,
struct smc_init_info *ini)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, smc_type, ini);
+ rc = smc_clc_send_proposal(smc, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
- return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
- CLC_WAIT_TIME);
+ return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
+ SMC_CLC_ACCEPT, CLC_WAIT_TIME);
+}
+
+void smc_fill_gid_list(struct smc_link_group *lgr,
+ struct smc_gidlist *gidlist,
+ struct smc_ib_device *known_dev, u8 *known_gid)
+{
+ struct smc_init_info *alt_ini = NULL;
+
+ memset(gidlist, 0, sizeof(*gidlist));
+ memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);
+
+ alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
+ if (!alt_ini)
+ goto out;
+
+ alt_ini->vlan_id = lgr->vlan_id;
+ alt_ini->check_smcrv2 = true;
+ alt_ini->smcrv2.saddr = lgr->saddr;
+ smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);
+
+ if (!alt_ini->smcrv2.ib_dev_v2)
+ goto out;
+
+ memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
+ SMC_GID_SIZE);
+
+out:
+ kfree(alt_ini);
+}
+
+static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+ struct smc_clc_first_contact_ext *fce =
+ (struct smc_clc_first_contact_ext *)
+ (((u8 *)clc_v2) + sizeof(*clc_v2));
+
+ if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
+ return 0;
+
+ if (fce->v2_direct) {
+ memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
+ ini->smcrv2.uses_gateway = false;
+ } else {
+ if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr,
+ smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
+ ini->smcrv2.nexthop_mac,
+ &ini->smcrv2.uses_gateway))
+ return SMC_CLC_DECL_NOROUTE;
+ if (!ini->smcrv2.uses_gateway) {
+ /* mismatch: peer claims indirect, but its direct */
+ return SMC_CLC_DECL_NOINDIRECT;
+ }
+ }
+ return 0;
}
/* setup for RDMA connection of client */
@@ -596,13 +1182,20 @@ static int smc_connect_rdma(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
+ int i, reason_code = 0;
struct smc_link *link;
- int reason_code = 0;
+ u8 *eid = NULL;
ini->is_smcd = false;
- ini->ib_lcl = &aclc->lcl;
- ini->ib_clcqpn = ntoh24(aclc->qpn);
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+ memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
+ memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
+ memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
+
+ reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
+ if (reason_code)
+ return reason_code;
mutex_lock(&smc_client_lgr_pending);
reason_code = smc_conn_create(smc, ini);
@@ -610,49 +1203,94 @@ static int smc_connect_rdma(struct smc_sock *smc,
mutex_unlock(&smc_client_lgr_pending);
return reason_code;
}
- link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
smc_conn_save_peer_info(smc, aclc);
+ if (ini->first_contact_local) {
+ link = smc->conn.lnk;
+ } else {
+ /* set link that was assigned by server */
+ link = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *l = &smc->conn.lgr->lnk[i];
+
+ if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
+ !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
+ SMC_GID_SIZE) &&
+ (aclc->hdr.version > SMC_V1 ||
+ !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+ sizeof(l->peer_mac)))) {
+ link = l;
+ break;
+ }
+ }
+ if (!link) {
+ reason_code = SMC_CLC_DECL_NOSRVLINK;
+ goto connect_abort;
+ }
+ smc_switch_link_and_count(&smc->conn, link);
+ }
+
/* create send buffer and rmb */
- if (smc_buf_create(smc, false))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ if (smc_buf_create(smc, false)) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_link_save_peer_info(link, aclc);
+ if (ini->first_contact_local)
+ smc_link_save_peer_info(link, aclc, ini);
- if (smc_rmb_rtoken_handling(&smc->conn, aclc))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
- ini->cln_first_contact);
+ if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
+ goto connect_abort;
+ }
smc_close_init(smc);
smc_rx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
- ini->cln_first_contact);
+ if (ini->first_contact_local) {
+ if (smc_ib_ready_link(link)) {
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
+ goto connect_abort;
+ }
} else {
- if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
- ini->cln_first_contact);
+ /* reg sendbufs if they were vzalloced */
+ if (smc->conn.sndbuf_desc->is_vm) {
+ if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGBUF;
+ goto connect_abort;
+ }
+ }
+ if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
+ reason_code = SMC_CLC_DECL_ERR_REGBUF;
+ goto connect_abort;
+ }
}
- smc_rmb_sync_sg_for_device(&smc->conn);
- reason_code = smc_clc_send_confirm(smc);
+ if (aclc->hdr.version > SMC_V1) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ eid = clc_v2->r1.eid;
+ if (ini->first_contact_local)
+ smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
+ link->smcibdev, link->gid);
+ }
+
+ reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version, eid, ini);
if (reason_code)
- return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ goto connect_abort;
smc_tx_init(smc);
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
/* QP confirmation over RoCE fabric */
- reason_code = smc_clnt_conf_first_link(smc);
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_clnt_conf_first_link(smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
if (reason_code)
- return smc_connect_abort(smc, reason_code,
- ini->cln_first_contact);
+ goto connect_abort;
}
mutex_unlock(&smc_client_lgr_pending);
@@ -662,6 +1300,31 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc->sk.sk_state = SMC_ACTIVE;
return 0;
+connect_abort:
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_client_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return reason_code;
+}
+
+/* The server has chosen one of the proposed ISM devices for the communication.
+ * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
+ */
+static int
+smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
+ struct smc_init_info *ini)
+{
+ int i;
+
+ for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
+ if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
+ ini->ism_selected = i;
+ return 0;
+ }
+ }
+
+ return -EPROTO;
}
/* setup for ISM connection of client */
@@ -669,11 +1332,21 @@ static int smc_connect_ism(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
+ u8 *eid = NULL;
int rc = 0;
ini->is_smcd = true;
- ini->ism_gid = aclc->gid;
- ini->srv_first_contact = aclc->hdr.flag;
+ ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+
+ if (aclc->hdr.version == SMC_V2) {
+ struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
+ if (rc)
+ return rc;
+ }
+ ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
@@ -684,18 +1357,28 @@ static int smc_connect_ism(struct smc_sock *smc,
}
/* Create send and receive buffers */
- if (smc_buf_create(smc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
- ini->cln_first_contact);
+ rc = smc_buf_create(smc, true);
+ if (rc) {
+ rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
+ goto connect_abort;
+ }
smc_conn_save_peer_info(smc, aclc);
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
- rc = smc_clc_send_confirm(smc);
+ if (aclc->hdr.version > SMC_V1) {
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+ (struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+ eid = clc_v2->d1.eid;
+ }
+
+ rc = smc_clc_send_confirm(smc, ini->first_contact_local,
+ aclc->hdr.version, eid, NULL);
if (rc)
- return smc_connect_abort(smc, rc, ini->cln_first_contact);
+ goto connect_abort;
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
@@ -704,15 +1387,47 @@ static int smc_connect_ism(struct smc_sock *smc,
smc->sk.sk_state = SMC_ACTIVE;
return 0;
+connect_abort:
+ smc_conn_abort(smc, ini->first_contact_local);
+ mutex_unlock(&smc_server_lgr_pending);
+ smc->connect_nonblock = 0;
+
+ return rc;
+}
+
+/* check if received accept type and version matches a proposed one */
+static int smc_connect_check_aclc(struct smc_init_info *ini,
+ struct smc_clc_msg_accept_confirm *aclc)
+{
+ if (aclc->hdr.typev1 != SMC_TYPE_R &&
+ aclc->hdr.typev1 != SMC_TYPE_D)
+ return SMC_CLC_DECL_MODEUNSUPP;
+
+ if (aclc->hdr.version >= SMC_V2) {
+ if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+ !smcr_indicated(ini->smc_type_v2)) ||
+ (aclc->hdr.typev1 == SMC_TYPE_D &&
+ !smcd_indicated(ini->smc_type_v2)))
+ return SMC_CLC_DECL_MODEUNSUPP;
+ } else {
+ if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+ !smcr_indicated(ini->smc_type_v1)) ||
+ (aclc->hdr.typev1 == SMC_TYPE_D &&
+ !smcd_indicated(ini->smc_type_v1)))
+ return SMC_CLC_DECL_MODEUNSUPP;
+ }
+
+ return 0;
}
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
- bool ism_supported = false, rdma_supported = false;
- struct smc_clc_msg_accept_confirm aclc;
- struct smc_init_info ini = {0};
- int smc_type;
+ u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
+ struct smc_clc_msg_accept_confirm_v2 *aclc2;
+ struct smc_clc_msg_accept_confirm *aclc;
+ struct smc_init_info *ini = NULL;
+ u8 *buf = NULL;
int rc = 0;
if (smc->use_fallback)
@@ -722,58 +1437,84 @@ static int __smc_connect(struct smc_sock *smc)
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
- /* IPSec connections opt out of SMC-R optimizations */
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(smc))
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
+ version);
- /* get vlan id from IP device */
- if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
- return smc_connect_decline_fallback(smc,
- SMC_CLC_DECL_GETVLANERR);
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
+ version);
- /* check if there is an ism device available */
- if (!smc_find_ism_device(smc, &ini) &&
- !smc_connect_ism_vlan_setup(smc, &ini)) {
- /* ISM is supported for this connection */
- ism_supported = true;
- smc_type = SMC_TYPE_D;
- }
-
- /* check if there is a rdma device available */
- if (!smc_find_rdma_device(smc, &ini)) {
- /* RDMA is supported for this connection */
- rdma_supported = true;
- if (ism_supported)
- smc_type = SMC_TYPE_B; /* both */
- else
- smc_type = SMC_TYPE_R; /* only RDMA */
+ ini->smcd_version = SMC_V1 | SMC_V2;
+ ini->smcr_version = SMC_V1 | SMC_V2;
+ ini->smc_type_v1 = SMC_TYPE_B;
+ ini->smc_type_v2 = SMC_TYPE_B;
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
+ ini->smcd_version &= ~SMC_V1;
+ ini->smcr_version = 0;
+ ini->smc_type_v1 = SMC_TYPE_N;
+ if (!ini->smcd_version) {
+ rc = SMC_CLC_DECL_GETVLANERR;
+ goto fallback;
+ }
}
- /* if neither ISM nor RDMA are supported, fallback */
- if (!rdma_supported && !ism_supported)
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
+ rc = smc_find_proposal_devices(smc, ini);
+ if (rc)
+ goto fallback;
+
+ buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto fallback;
+ }
+ aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
+ aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
/* perform CLC handshake */
- rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
+ rc = smc_connect_clc(smc, aclc2, ini);
if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
- return smc_connect_decline_fallback(smc, rc);
+ /* -EAGAIN on timeout, see tcp_recvmsg() */
+ if (rc == -EAGAIN) {
+ rc = -ETIMEDOUT;
+ smc->sk.sk_err = ETIMEDOUT;
+ }
+ goto vlan_cleanup;
}
+ /* check if smc modes and versions of CLC proposal and accept match */
+ rc = smc_connect_check_aclc(ini, aclc);
+ version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
+ if (rc)
+ goto vlan_cleanup;
+
/* depending on previous steps, connect using rdma or ism */
- if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
- rc = smc_connect_rdma(smc, &aclc, &ini);
- else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
- rc = smc_connect_ism(smc, &aclc, &ini);
- else
- rc = SMC_CLC_DECL_MODEUNSUPP;
- if (rc) {
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
- return smc_connect_decline_fallback(smc, rc);
+ if (aclc->hdr.typev1 == SMC_TYPE_R) {
+ ini->smcr_version = version;
+ rc = smc_connect_rdma(smc, aclc, ini);
+ } else if (aclc->hdr.typev1 == SMC_TYPE_D) {
+ ini->smcd_version = version;
+ rc = smc_connect_ism(smc, aclc, ini);
}
+ if (rc)
+ goto vlan_cleanup;
- smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
+ SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+ kfree(ini);
return 0;
+
+vlan_cleanup:
+ smc_connect_ism_vlan_cleanup(smc, ini);
+ kfree(buf);
+fallback:
+ kfree(ini);
+ return smc_connect_decline_fallback(smc, rc, version);
}
static void smc_connect_work(struct work_struct *work)
@@ -789,7 +1530,7 @@ static void smc_connect_work(struct work_struct *work)
if (smc->clcsock->sk->sk_err) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
} else if ((1 << smc->clcsock->sk->sk_state) &
- (TCPF_SYN_SENT | TCP_SYN_RECV)) {
+ (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
if ((rc == -EPIPE) &&
((1 << smc->clcsock->sk->sk_state) &
@@ -802,6 +1543,8 @@ static void smc_connect_work(struct work_struct *work)
smc->sk.sk_state = SMC_CLOSED;
if (rc == -EPIPE || rc == -EAGAIN)
smc->sk.sk_err = EPIPE;
+ else if (rc == -ECONNREFUSED)
+ smc->sk.sk_err = ECONNREFUSED;
else if (signal_pending(current))
smc->sk.sk_err = -sock_intr_errno(timeo);
sock_put(&smc->sk); /* passive closing */
@@ -840,14 +1583,33 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
goto out_err;
lock_sock(sk);
+ switch (sock->state) {
+ default:
+ rc = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
+ goto out;
+ case SS_CONNECTING:
+ if (sk->sk_state == SMC_ACTIVE)
+ goto connected;
+ break;
+ case SS_UNCONNECTED:
+ sock->state = SS_CONNECTING;
+ break;
+ }
+
switch (sk->sk_state) {
default:
goto out;
+ case SMC_CLOSED:
+ rc = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ goto out;
case SMC_ACTIVE:
rc = -EISCONN;
goto out;
case SMC_INIT:
- rc = 0;
break;
}
@@ -861,21 +1623,25 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
if (rc && rc != -EINPROGRESS)
goto out;
- sock_hold(&smc->sk); /* sock put in passive closing */
- if (smc->use_fallback)
+ if (smc->use_fallback) {
+ sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
goto out;
+ }
+ sock_hold(&smc->sk); /* sock put in passive closing */
if (flags & O_NONBLOCK) {
- if (schedule_work(&smc->connect_work))
+ if (queue_work(smc_hs_wq, &smc->connect_work))
smc->connect_nonblock = 1;
rc = -EINPROGRESS;
+ goto out;
} else {
rc = __smc_connect(smc);
if (rc < 0)
goto out;
- else
- rc = 0; /* success cases including fallback */
}
+connected:
+ rc = 0;
+ sock->state = SS_CONNECTED;
out:
release_sock(sk);
out_err:
@@ -902,10 +1668,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
- rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
- if (rc < 0)
+ if (rc < 0 && rc != -EAGAIN)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
@@ -918,6 +1684,23 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
goto out;
}
+ /* new clcsock has inherited the smc listen-specific sk_data_ready
+ * function; switch it back to the original sk_data_ready function
+ */
+ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
+
+ /* if new clcsock has also inherited the fallback-specific callback
+ * functions, switch them back to the original ones.
+ */
+ if (lsmc->use_fallback) {
+ if (lsmc->clcsk_state_change)
+ new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
+ if (lsmc->clcsk_write_space)
+ new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
+ if (lsmc->clcsk_error_report)
+ new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
+ }
+
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
@@ -973,6 +1756,7 @@ struct sock *smc_accept_dequeue(struct sock *parent,
}
if (new_sock) {
sock_graft(new_sk, new_sock);
+ new_sock->state = SS_CONNECTED;
if (isk->use_fallback) {
smc_sk(new_sk)->clcsock->file = new_sock->file;
isk->clcsock->file->private_data = isk->clcsock;
@@ -999,18 +1783,21 @@ void smc_close_non_accepted(struct sock *sk)
sock_put(sk); /* final sock_put */
}
-static int smc_serv_conf_first_link(struct smc_sock *smc)
+static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
- struct net *net = sock_net(smc->clcsock->sk);
- struct smc_link_group *lgr = smc->conn.lgr;
- struct smc_link *link;
- int rest;
+ struct smc_link *link = smc->conn.lnk;
+ struct smc_llc_qentry *qentry;
int rc;
- link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* reg the sndbuf if it was vzalloced*/
+ if (smc->conn.sndbuf_desc->is_vm) {
+ if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+ }
- if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_ERR_REGRMB;
+ /* reg the rmb */
+ if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
/* send CONFIRM LINK request to client over the RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
@@ -1018,40 +1805,29 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
return SMC_CLC_DECL_TIMEOUT_CL;
/* receive CONFIRM LINK response from client over the RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(
- &link->llc_confirm_resp,
- SMC_LLC_WAIT_FIRST_TIME);
- if (rest <= 0) {
+ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_LINK);
+ if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
-
- if (link->llc_confirm_resp_rc)
+ smc_llc_save_peer_uid(qentry);
+ rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
+ smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
+ if (rc)
return SMC_CLC_DECL_RMBE_EC;
- /* send ADD LINK request to client over the RoCE fabric */
- rc = smc_llc_send_add_link(link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_REQ);
- if (rc < 0)
- return SMC_CLC_DECL_TIMEOUT_AL;
+ /* confirm_rkey is implicit on 1st contact */
+ smc->conn.rmb_desc->is_conf_rkey = true;
- /* receive ADD LINK response from client over the RoCE fabric */
- rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
- SMC_LLC_WAIT_TIME);
- if (rest <= 0) {
- struct smc_clc_msg_decline dclc;
-
- rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
- SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
- return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
- }
-
- smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
+ smc_llc_link_active(link);
+ smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
+ /* initial contact - try to establish second link */
+ smc_llc_srv_add_link(link, NULL);
return 0;
}
@@ -1061,6 +1837,9 @@ static void smc_listen_out(struct smc_sock *new_smc)
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_dec(&lsmc->queued_smc_hs);
+
if (lsmc->sk.sk_state == SMC_LISTEN) {
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -1079,7 +1858,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc)
{
struct sock *newsmcsk = &new_smc->sk;
- sk_refcnt_debug_inc(newsmcsk);
if (newsmcsk->sk_state == SMC_INIT)
newsmcsk->sk_state = SMC_ACTIVE;
@@ -1090,7 +1868,9 @@ static void smc_listen_out_connected(struct smc_sock *new_smc)
static void smc_listen_out_err(struct smc_sock *new_smc)
{
struct sock *newsmcsk = &new_smc->sk;
+ struct net *net = sock_net(newsmcsk);
+ this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
if (newsmcsk->sk_state == SMC_INIT)
sock_put(&new_smc->sk); /* passive closing */
newsmcsk->sk_state = SMC_CLOSED;
@@ -1100,21 +1880,18 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
- int local_contact)
+ int local_first, u8 version)
{
/* RDMA setup failed, switch back to TCP */
- if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
- if (reason_code < 0) { /* error, no fallback possible */
+ smc_conn_abort(new_smc, local_first);
+ if (reason_code < 0 ||
+ smc_switch_to_fallback(new_smc, reason_code)) {
+ /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
- smc_switch_to_fallback(new_smc);
- new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
- if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+ if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
smc_listen_out_err(new_smc);
return;
}
@@ -1122,6 +1899,64 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
smc_listen_out_connected(new_smc);
}
+/* listen worker: version checking */
+static int smc_listen_v2_check(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
+ struct smc_clc_v2_extension *pclc_v2_ext;
+ int rc = SMC_CLC_DECL_PEERNOSMC;
+
+ ini->smc_type_v1 = pclc->hdr.typev1;
+ ini->smc_type_v2 = pclc->hdr.typev2;
+ ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+ ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+ if (pclc->hdr.version > SMC_V1) {
+ if (smcd_indicated(ini->smc_type_v2))
+ ini->smcd_version |= SMC_V2;
+ if (smcr_indicated(ini->smc_type_v2))
+ ini->smcr_version |= SMC_V2;
+ }
+ if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
+ rc = SMC_CLC_DECL_PEERNOSMC;
+ goto out;
+ }
+ pclc_v2_ext = smc_get_clc_v2_ext(pclc);
+ if (!pclc_v2_ext) {
+ ini->smcd_version &= ~SMC_V2;
+ ini->smcr_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOV2EXT;
+ goto out;
+ }
+ pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
+ if (ini->smcd_version & SMC_V2) {
+ if (!smc_ism_is_v2_capable()) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOISM2SUPP;
+ } else if (!pclc_smcd_v2_ext) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOV2DEXT;
+ } else if (!pclc_v2_ext->hdr.eid_cnt &&
+ !pclc_v2_ext->hdr.flag.seid) {
+ ini->smcd_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOUEID;
+ }
+ }
+ if (ini->smcr_version & SMC_V2) {
+ if (!pclc_v2_ext->hdr.eid_cnt) {
+ ini->smcr_version &= ~SMC_V2;
+ rc = SMC_CLC_DECL_NOUEID;
+ }
+ }
+
+out:
+ if (!ini->smcd_version && !ini->smcr_version)
+ return rc;
+
+ return 0;
+}
+
/* listen worker: check prefixes */
static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc)
@@ -1129,6 +1964,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal_prefix *pclc_prfx;
struct socket *newclcsock = new_smc->clcsock;
+ if (pclc->hdr.typev1 == SMC_TYPE_N)
+ return 0;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
return SMC_CLC_DECL_DIFFPREFIX;
@@ -1156,99 +1993,338 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
- struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
- struct smc_clc_msg_smcd *pclc_smcd;
int rc;
- pclc_smcd = smc_get_clc_msg_smcd(pclc);
- ini->ism_gid = pclc_smcd->gid;
rc = smc_conn_create(new_smc, ini);
if (rc)
return rc;
- /* Check if peer can be reached via ISM device */
- if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
- new_smc->conn.lgr->vlan_id,
- new_smc->conn.lgr->smcd)) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_SMCDNOTALK;
- }
-
/* Create send and receive buffers */
- if (smc_buf_create(new_smc, true)) {
- if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_cleanup_early(&new_smc->conn);
- else
- smc_conn_free(&new_smc->conn);
- return SMC_CLC_DECL_MEM;
+ rc = smc_buf_create(new_smc, true);
+ if (rc) {
+ smc_conn_abort(new_smc, ini->first_contact_local);
+ return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
+ SMC_CLC_DECL_MEM;
}
return 0;
}
+static bool smc_is_already_selected(struct smcd_dev *smcd,
+ struct smc_init_info *ini,
+ int matches)
+{
+ int i;
+
+ for (i = 0; i < matches; i++)
+ if (smcd == ini->ism_dev[i])
+ return true;
+
+ return false;
+}
+
+/* check for ISM devices matching proposed ISM devices */
+static void smc_check_ism_v2_match(struct smc_init_info *ini,
+ u16 proposed_chid, u64 proposed_gid,
+ unsigned int *matches)
+{
+ struct smcd_dev *smcd;
+
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ if (smcd->going_away)
+ continue;
+ if (smc_is_already_selected(smcd, ini, *matches))
+ continue;
+ if (smc_ism_get_chid(smcd) == proposed_chid &&
+ !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
+ ini->ism_peer_gid[*matches] = proposed_gid;
+ ini->ism_dev[*matches] = smcd;
+ (*matches)++;
+ break;
+ }
+ }
+}
+
+static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
+{
+ if (!ini->rc)
+ ini->rc = rc;
+}
+
+static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_v2_extension *smc_v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ unsigned int matches = 0;
+ u8 smcd_version;
+ u8 *eid = NULL;
+ int i, rc;
+
+ if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
+ goto not_found;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ smc_v2_ext = smc_get_clc_v2_ext(pclc);
+ smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (pclc_smcd->ism.chid)
+ /* check for ISM device matching proposed native ISM device */
+ smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
+ ntohll(pclc_smcd->ism.gid), &matches);
+ for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
+ /* check for ISM devices matching proposed non-native ISM
+ * devices
+ */
+ smc_check_ism_v2_match(ini,
+ ntohs(smcd_v2_ext->gidchid[i - 1].chid),
+ ntohll(smcd_v2_ext->gidchid[i - 1].gid),
+ &matches);
+ }
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ if (!ini->ism_dev[0]) {
+ smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
+ goto not_found;
+ }
+
+ smc_ism_get_system_eid(&eid);
+ if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
+ smcd_v2_ext->system_eid, eid))
+ goto not_found;
+
+ /* separate - outside the smcd_dev_list.lock */
+ smcd_version = ini->smcd_version;
+ for (i = 0; i < matches; i++) {
+ ini->smcd_version = SMC_V2;
+ ini->is_smcd = true;
+ ini->ism_selected = i;
+ rc = smc_listen_ism_init(new_smc, ini);
+ if (rc) {
+ smc_find_ism_store_rc(rc, ini);
+ /* try next active ISM device */
+ continue;
+ }
+ return; /* matching and usable V2 ISM device found */
+ }
+ /* no V2 ISM device could be initialized */
+ ini->smcd_version = smcd_version; /* restore original value */
+ ini->negotiated_eid[0] = 0;
+
+not_found:
+ ini->smcd_version &= ~SMC_V2;
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
+static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ int rc = 0;
+
+ /* check if ISM V1 is available */
+ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
+ goto not_found;
+ ini->is_smcd = true; /* prepare ISM check */
+ ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
+ rc = smc_find_ism_device(new_smc, ini);
+ if (rc)
+ goto not_found;
+ ini->ism_selected = 0;
+ rc = smc_listen_ism_init(new_smc, ini);
+ if (!rc)
+ return; /* V1 ISM device found */
+
+not_found:
+ smc_find_ism_store_rc(rc, ini);
+ ini->smcd_version &= ~SMC_V1;
+ ini->ism_dev[0] = NULL;
+ ini->is_smcd = false;
+}
+
/* listen worker: register buffers */
-static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
{
- struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+ struct smc_connection *conn = &new_smc->conn;
- if (local_contact != SMC_FIRST_CONTACT) {
- if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
- return SMC_CLC_DECL_ERR_REGRMB;
+ if (!local_first) {
+ /* reg sendbufs if they were vzalloced */
+ if (conn->sndbuf_desc->is_vm) {
+ if (smcr_lgr_reg_sndbufs(conn->lnk,
+ conn->sndbuf_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
+ }
+ if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
+ return SMC_CLC_DECL_ERR_REGBUF;
}
- smc_rmb_sync_sg_for_device(&new_smc->conn);
return 0;
}
+static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ struct smc_clc_v2_extension *smc_v2_ext;
+ u8 smcr_version;
+ int rc;
+
+ if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
+ goto not_found;
+
+ smc_v2_ext = smc_get_clc_v2_ext(pclc);
+ if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
+ goto not_found;
+
+ /* prepare RDMA check */
+ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+ memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
+ memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
+ ini->check_smcrv2 = true;
+ ini->smcrv2.clc_sk = new_smc->clcsock->sk;
+ ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
+ ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
+ rc = smc_find_rdma_device(new_smc, ini);
+ if (rc) {
+ smc_find_ism_store_rc(rc, ini);
+ goto not_found;
+ }
+ if (!ini->smcrv2.uses_gateway)
+ memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
+
+ smcr_version = ini->smcr_version;
+ ini->smcr_version = SMC_V2;
+ rc = smc_listen_rdma_init(new_smc, ini);
+ if (!rc)
+ rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+ if (!rc)
+ return;
+ ini->smcr_version = smcr_version;
+ smc_find_ism_store_rc(rc, ini);
+
+not_found:
+ ini->smcr_version &= ~SMC_V2;
+ ini->smcrv2.ib_dev_v2 = NULL;
+ ini->check_smcrv2 = false;
+}
+
+static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int rc;
+
+ if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
+ return SMC_CLC_DECL_NOSMCDEV;
+
+ /* prepare RDMA check */
+ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+ memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
+ memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
+ rc = smc_find_rdma_device(new_smc, ini);
+ if (rc) {
+ /* no RDMA device found */
+ return SMC_CLC_DECL_NOSMCDEV;
+ }
+ rc = smc_listen_rdma_init(new_smc, ini);
+ if (rc)
+ return rc;
+ return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+}
+
+/* determine the local device matching to proposal */
+static int smc_listen_find_device(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smc_init_info *ini)
+{
+ int prfx_rc;
+
+ /* check for ISM device matching V2 proposed device */
+ smc_find_ism_v2_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ /* check for matching IP prefix and subnet length (V1) */
+ prfx_rc = smc_listen_prfx_check(new_smc, pclc);
+ if (prfx_rc)
+ smc_find_ism_store_rc(prfx_rc, ini);
+
+ /* get vlan id from IP device */
+ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
+ return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
+
+ /* check for ISM device matching V1 proposed device */
+ if (!prfx_rc)
+ smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+ if (ini->ism_dev[0])
+ return 0;
+
+ if (!smcr_indicated(pclc->hdr.typev1) &&
+ !smcr_indicated(pclc->hdr.typev2))
+ /* skip RDMA and decline */
+ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
+
+ /* check if RDMA V2 is available */
+ smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
+ if (ini->smcrv2.ib_dev_v2)
+ return 0;
+
+ /* check if RDMA V1 is available */
+ if (!prfx_rc) {
+ int rc;
+
+ rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+ smc_find_ism_store_rc(rc, ini);
+ return (!rc) ? 0 : ini->rc;
+ }
+ return SMC_CLC_DECL_NOSMCDEV;
+}
+
/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
struct smc_clc_msg_accept_confirm *cclc,
- int local_contact)
+ bool local_first,
+ struct smc_init_info *ini)
{
- struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+ struct smc_link *link = new_smc->conn.lnk;
int reason_code = 0;
- if (local_contact == SMC_FIRST_CONTACT)
- smc_link_save_peer_info(link, cclc);
+ if (local_first)
+ smc_link_save_peer_info(link, cclc, ini);
- if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
- reason_code = SMC_CLC_DECL_ERR_RTOK;
- goto decline;
- }
+ if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
+ return SMC_CLC_DECL_ERR_RTOK;
- if (local_contact == SMC_FIRST_CONTACT) {
- if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_ERR_RDYLNK;
- goto decline;
- }
+ if (local_first) {
+ if (smc_ib_ready_link(link))
+ return SMC_CLC_DECL_ERR_RDYLNK;
/* QP confirmation over RoCE fabric */
- reason_code = smc_serv_conf_first_link(new_smc);
- if (reason_code)
- goto decline;
+ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
+ reason_code = smcr_serv_conf_first_link(new_smc);
+ smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
}
- return 0;
-
-decline:
- smc_listen_decline(new_smc, reason_code, local_contact);
return reason_code;
}
-/* setup for RDMA connection of server */
+/* setup for connection of server */
static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
struct socket *newclcsock = new_smc->clcsock;
- struct smc_clc_msg_accept_confirm cclc;
+ struct smc_clc_msg_accept_confirm *cclc;
+ struct smc_clc_msg_proposal_area *buf;
struct smc_clc_msg_proposal *pclc;
- struct smc_init_info ini = {0};
- bool ism_supported = false;
- u8 buf[SMC_CLC_MAX_LEN];
+ struct smc_init_info *ini = NULL;
+ u8 proposal_version = SMC_V1;
+ u8 accept_version;
int rc = 0;
if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
@@ -1261,111 +2337,101 @@ static void smc_listen_work(struct work_struct *work)
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
- smc_switch_to_fallback(new_smc);
- new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
- smc_listen_out_connected(new_smc);
+ rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
+ if (rc)
+ smc_listen_out_err(new_smc);
+ else
+ smc_listen_out_connected(new_smc);
return;
}
/* do inband token exchange -
* wait for and receive SMC Proposal CLC message
*/
- pclc = (struct smc_clc_msg_proposal *)&buf;
- rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf) {
+ rc = SMC_CLC_DECL_MEM;
+ goto out_decl;
+ }
+ pclc = (struct smc_clc_msg_proposal *)buf;
+ rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
if (rc)
goto out_decl;
- /* IPSec connections opt out of SMC-R optimizations */
+ if (pclc->hdr.version > SMC_V1)
+ proposal_version = SMC_V2;
+
+ /* IPSec connections opt out of SMC optimizations */
if (using_ipsec(new_smc)) {
rc = SMC_CLC_DECL_IPSEC;
goto out_decl;
}
- /* check for matching IP prefix and subnet length */
- rc = smc_listen_prfx_check(new_smc, pclc);
- if (rc)
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = SMC_CLC_DECL_MEM;
goto out_decl;
+ }
- /* get vlan id from IP device */
- if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
- rc = SMC_CLC_DECL_GETVLANERR;
+ /* initial version checking */
+ rc = smc_listen_v2_check(new_smc, pclc, ini);
+ if (rc)
goto out_decl;
- }
mutex_lock(&smc_server_lgr_pending);
smc_close_init(new_smc);
smc_rx_init(new_smc);
smc_tx_init(new_smc);
- /* check if ISM is available */
- if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
- ini.is_smcd = true; /* prepare ISM check */
- rc = smc_find_ism_device(new_smc, &ini);
- if (!rc)
- rc = smc_listen_ism_init(new_smc, pclc, &ini);
- if (!rc)
- ism_supported = true;
- else if (pclc->hdr.path == SMC_TYPE_D)
- goto out_unlock; /* skip RDMA and decline */
- }
-
- /* check if RDMA is available */
- if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
- /* prepare RDMA check */
- ini.is_smcd = false;
- ini.ism_dev = NULL;
- ini.ib_lcl = &pclc->lcl;
- rc = smc_find_rdma_device(new_smc, &ini);
- if (rc) {
- /* no RDMA device found */
- if (pclc->hdr.path == SMC_TYPE_B)
- /* neither ISM nor RDMA device found */
- rc = SMC_CLC_DECL_NOSMCDEV;
- goto out_unlock;
- }
- rc = smc_listen_rdma_init(new_smc, &ini);
- if (rc)
- goto out_unlock;
- rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
- if (rc)
- goto out_unlock;
- }
+ /* determine ISM or RoCE device used for connection */
+ rc = smc_listen_find_device(new_smc, pclc, ini);
+ if (rc)
+ goto out_unlock;
/* send SMC Accept CLC message */
- rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
+ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
+ rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
+ accept_version, ini->negotiated_eid);
if (rc)
goto out_unlock;
/* SMC-D does not need this lock any more */
- if (ism_supported)
+ if (ini->is_smcd)
mutex_unlock(&smc_server_lgr_pending);
/* receive SMC Confirm CLC message */
- rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ memset(buf, 0, sizeof(*buf));
+ cclc = (struct smc_clc_msg_accept_confirm *)buf;
+ rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
SMC_CLC_CONFIRM, CLC_WAIT_TIME);
if (rc) {
- if (!ism_supported)
+ if (!ini->is_smcd)
goto out_unlock;
goto out_decl;
}
/* finish worker */
- if (!ism_supported) {
- rc = smc_listen_rdma_finish(new_smc, &cclc,
- ini.cln_first_contact);
- mutex_unlock(&smc_server_lgr_pending);
+ if (!ini->is_smcd) {
+ rc = smc_listen_rdma_finish(new_smc, cclc,
+ ini->first_contact_local, ini);
if (rc)
- return;
+ goto out_unlock;
+ mutex_unlock(&smc_server_lgr_pending);
}
- smc_conn_save_peer_info(new_smc, &cclc);
+ smc_conn_save_peer_info(new_smc, cclc);
smc_listen_out_connected(new_smc);
- return;
+ SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
+ goto out_free;
out_unlock:
mutex_unlock(&smc_server_lgr_pending);
out_decl:
- smc_listen_decline(new_smc, rc, ini.cln_first_contact);
+ smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
+ proposal_version);
+out_free:
+ kfree(ini);
+ kfree(buf);
}
static void smc_tcp_listen_work(struct work_struct *work)
@@ -1379,11 +2445,14 @@ static void smc_tcp_listen_work(struct work_struct *work)
lock_sock(lsk);
while (lsk->sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
- if (rc)
+ if (rc) /* clcsock accept queue empty or error */
goto out;
if (!new_smc)
continue;
+ if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
+ atomic_inc(&lsmc->queued_smc_hs);
+
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
new_smc->fallback_rsn = lsmc->fallback_rsn;
@@ -1393,13 +2462,31 @@ static void smc_tcp_listen_work(struct work_struct *work)
new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
- if (!schedule_work(&new_smc->smc_listen_work))
+ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
}
out:
release_sock(lsk);
- sock_put(&lsmc->sk); /* sock_hold in smc_listen */
+ sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
+}
+
+static void smc_clcsock_data_ready(struct sock *listen_clcsock)
+{
+ struct smc_sock *lsmc;
+
+ read_lock_bh(&listen_clcsock->sk_callback_lock);
+ lsmc = smc_clcsock_user_data(listen_clcsock);
+ if (!lsmc)
+ goto out;
+ lsmc->clcsk_data_ready(listen_clcsock);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
+ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
+ sock_put(&lsmc->sk);
+ }
+out:
+ read_unlock_bh(&listen_clcsock->sk_callback_lock);
}
static int smc_listen(struct socket *sock, int backlog)
@@ -1413,7 +2500,7 @@ static int smc_listen(struct socket *sock, int backlog)
rc = -EINVAL;
if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
- smc->connect_nonblock)
+ smc->connect_nonblock || sock->state != SS_UNCONNECTED)
goto out;
rc = 0;
@@ -1428,15 +2515,39 @@ static int smc_listen(struct socket *sock, int backlog)
if (!smc->use_fallback)
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+ /* save original sk_data_ready function and establish
+ * smc-specific sk_data_ready function
+ */
+ write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+ smc->clcsock->sk->sk_user_data =
+ (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+ smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
+ smc_clcsock_data_ready, &smc->clcsk_data_ready);
+ write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
+
+ /* save original ops */
+ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
+
+ smc->af_ops = *smc->ori_af_ops;
+ smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
+
+ inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
+
+ if (smc->limit_smc_hs)
+ tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+
rc = kernel_listen(smc->clcsock, backlog);
- if (rc)
+ if (rc) {
+ write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+ smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
+ &smc->clcsk_data_ready);
+ smc->clcsock->sk->sk_user_data = NULL;
+ write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
goto out;
+ }
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
- sock_hold(sk); /* sock_hold in tcp_listen_worker */
- if (!schedule_work(&smc->tcp_listen_work))
- sock_put(sk);
out:
release_sock(sk);
@@ -1542,18 +2653,21 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_FASTOPEN) {
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
- smc_switch_to_fallback(smc);
- smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
+ if (rc)
+ goto out;
} else {
rc = -EINVAL;
goto out;
}
}
- if (smc->use_fallback)
+ if (smc->use_fallback) {
rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
- else
+ } else {
rc = smc_tx_sendmsg(smc, msg, len);
+ SMC_STAT_TX_PAYLOAD(smc, len, rc);
+ }
out:
release_sock(sk);
return rc;
@@ -1588,6 +2702,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
} else {
msg->msg_namelen = 0;
rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+ SMC_STAT_RX_PAYLOAD(smc, rc, rc);
}
out:
@@ -1664,8 +2779,10 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
static int smc_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
+ bool do_shutdown = true;
struct smc_sock *smc;
int rc = -EINVAL;
+ int old_state;
int rc1 = 0;
smc = smc_sk(sk);
@@ -1675,6 +2792,17 @@ static int smc_shutdown(struct socket *sock, int how)
lock_sock(sk);
+ if (sock->state == SS_CONNECTING) {
+ if (sk->sk_state == SMC_ACTIVE)
+ sock->state = SS_CONNECTED;
+ else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
+ sk->sk_state == SMC_PEERCLOSEWAIT2 ||
+ sk->sk_state == SMC_APPCLOSEWAIT1 ||
+ sk->sk_state == SMC_APPCLOSEWAIT2 ||
+ sk->sk_state == SMC_APPFINCLOSEWAIT)
+ sock->state = SS_DISCONNECTING;
+ }
+
rc = -ENOTCONN;
if ((sk->sk_state != SMC_ACTIVE) &&
(sk->sk_state != SMC_PEERCLOSEWAIT1) &&
@@ -1686,13 +2814,20 @@ static int smc_shutdown(struct socket *sock, int how)
if (smc->use_fallback) {
rc = kernel_sock_shutdown(smc->clcsock, how);
sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
- if (sk->sk_shutdown == SHUTDOWN_MASK)
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
sk->sk_state = SMC_CLOSED;
+ sk->sk_socket->state = SS_UNCONNECTED;
+ sock_put(sk);
+ }
goto out;
}
switch (how) {
case SHUT_RDWR: /* shutdown in both directions */
+ old_state = sk->sk_state;
rc = smc_close_active(smc);
+ if (old_state == SMC_ACTIVE &&
+ sk->sk_state == SMC_PEERCLOSEWAIT1)
+ do_shutdown = false;
break;
case SHUT_WR:
rc = smc_close_shutdown_write(smc);
@@ -1702,53 +2837,134 @@ static int smc_shutdown(struct socket *sock, int how)
/* nothing more to do because peer is not involved */
break;
}
- if (smc->clcsock)
+ if (do_shutdown && smc->clcsock)
rc1 = kernel_sock_shutdown(smc->clcsock, how);
/* map sock_shutdown_cmd constants to sk_shutdown value range */
sk->sk_shutdown |= how + 1;
+ if (sk->sk_state == SMC_CLOSED)
+ sock->state = SS_UNCONNECTED;
+ else
+ sock->state = SS_DISCONNECTING;
out:
release_sock(sk);
return rc ? rc : rc1;
}
+static int __smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+ int val, len;
+
+ smc = smc_sk(sock->sk);
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(int, len, sizeof(int));
+
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ val = smc->limit_smc_hs;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int __smc_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int val, rc;
+
+ smc = smc_sk(sk);
+
+ lock_sock(sk);
+ switch (optname) {
+ case SMC_LIMIT_HS:
+ if (optlen < sizeof(int)) {
+ rc = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(int))) {
+ rc = -EFAULT;
+ break;
+ }
+
+ smc->limit_smc_hs = !!val;
+ rc = 0;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ break;
+ }
+ release_sock(sk);
+
+ return rc;
+}
+
static int smc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int val, rc;
+ if (level == SOL_TCP && optname == TCP_ULP)
+ return -EOPNOTSUPP;
+ else if (level == SOL_SMC)
+ return __smc_setsockopt(sock, level, optname, optval, optlen);
+
smc = smc_sk(sk);
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
- rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EBADF;
+ }
+ if (unlikely(!smc->clcsock->ops->setsockopt))
+ rc = -EOPNOTSUPP;
+ else
+ rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
if (smc->clcsock->sk->sk_err) {
sk->sk_err = smc->clcsock->sk->sk_err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
+ mutex_unlock(&smc->clcsock_release_lock);
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
lock_sock(sk);
if (rc || smc->use_fallback)
goto out;
switch (optname) {
- case TCP_ULP:
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
- smc_switch_to_fallback(smc);
- smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
+ rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
} else {
rc = -EINVAL;
}
@@ -1757,18 +2973,22 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ if (val) {
+ SMC_STAT_INC(smc, ndly_cnt);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
+ }
}
break;
case TCP_CORK:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (!val)
- mod_delayed_work(system_wq, &smc->conn.tx_work,
- 0);
+ if (!val) {
+ SMC_STAT_INC(smc, cork_cnt);
+ smc_tx_pending(&smc->conn);
+ cancel_delayed_work(&smc->conn.tx_work);
+ }
}
break;
case TCP_DEFER_ACCEPT:
@@ -1787,11 +3007,26 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct smc_sock *smc;
+ int rc;
+
+ if (level == SOL_SMC)
+ return __smc_getsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sock->sk);
+ mutex_lock(&smc->clcsock_release_lock);
+ if (!smc->clcsock) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EBADF;
+ }
/* socket options apply to the CLC socket */
- return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
- optval, optlen);
+ if (unlikely(!smc->clcsock->ops->getsockopt)) {
+ mutex_unlock(&smc->clcsock_release_lock);
+ return -EOPNOTSUPP;
+ }
+ rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+ mutex_unlock(&smc->clcsock_release_lock);
+ return rc;
}
static int smc_ioctl(struct socket *sock, unsigned int cmd,
@@ -1889,11 +3124,15 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page,
goto out;
}
release_sock(sk);
- if (smc->use_fallback)
+ if (smc->use_fallback) {
rc = kernel_sendpage(smc->clcsock, page, offset,
size, flags);
- else
- rc = sock_no_sendpage(sock, page, offset, size, flags);
+ } else {
+ lock_sock(sk);
+ rc = smc_tx_sendpage(smc, page, offset, size, flags);
+ release_sock(sk);
+ SMC_STAT_INC(smc, sendpage_cnt);
+ }
out:
return rc;
@@ -1942,6 +3181,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
flags = MSG_DONTWAIT;
else
flags = 0;
+ SMC_STAT_INC(smc, splice_cnt);
rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
}
out:
@@ -1973,8 +3213,8 @@ static const struct proto_ops smc_sock_ops = {
.splice_read = smc_splice_read,
};
-static int smc_create(struct net *net, struct socket *sock, int protocol,
- int kern)
+static int __smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern, struct socket *clcsock)
{
int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
struct smc_sock *smc;
@@ -1991,6 +3231,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
rc = -ENOBUFS;
sock->ops = &smc_sock_ops;
+ sock->state = SS_UNCONNECTED;
sk = smc_sock_alloc(net, sock, protocol);
if (!sk)
goto out;
@@ -1999,37 +3240,124 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
smc = smc_sk(sk);
smc->use_fallback = false; /* assume rdma capability first */
smc->fallback_rsn = 0;
- rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
- &smc->clcsock);
- if (rc) {
- sk_common_release(sk);
- goto out;
+
+ /* default behavior from limit_smc_hs in every net namespace */
+ smc->limit_smc_hs = net->smc.limit_smc_hs;
+
+ rc = 0;
+ if (!clcsock) {
+ rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
+ &smc->clcsock);
+ if (rc) {
+ sk_common_release(sk);
+ goto out;
+ }
+ } else {
+ smc->clcsock = clcsock;
}
- smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
- smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
out:
return rc;
}
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ return __smc_create(net, sock, protocol, kern, NULL);
+}
+
static const struct net_proto_family smc_sock_family_ops = {
.family = PF_SMC,
.owner = THIS_MODULE,
.create = smc_create,
};
+static int smc_ulp_init(struct sock *sk)
+{
+ struct socket *tcp = sk->sk_socket;
+ struct net *net = sock_net(sk);
+ struct socket *smcsock;
+ int protocol, ret;
+
+ /* only TCP can be replaced */
+ if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
+ (sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
+ return -ESOCKTNOSUPPORT;
+ /* don't handle wq now */
+ if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
+ return -ENOTCONN;
+
+ if (sk->sk_family == AF_INET)
+ protocol = SMCPROTO_SMC;
+ else
+ protocol = SMCPROTO_SMC6;
+
+ smcsock = sock_alloc();
+ if (!smcsock)
+ return -ENFILE;
+
+ smcsock->type = SOCK_STREAM;
+ __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
+ ret = __smc_create(net, smcsock, protocol, 1, tcp);
+ if (ret) {
+ sock_release(smcsock); /* module_put() which ops won't be NULL */
+ return ret;
+ }
+
+ /* replace tcp socket to smc */
+ smcsock->file = tcp->file;
+ smcsock->file->private_data = smcsock;
+ smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
+ smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
+ tcp->file = NULL;
+
+ return ret;
+}
+
+static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(newsk);
+
+ /* don't inherit ulp ops to child when listen */
+ icsk->icsk_ulp_ops = NULL;
+}
+
+static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
+ .name = "smc",
+ .owner = THIS_MODULE,
+ .init = smc_ulp_init,
+ .clone = smc_ulp_clone,
+};
+
unsigned int smc_net_id;
static __net_init int smc_net_init(struct net *net)
{
+ int rc;
+
+ rc = smc_sysctl_net_init(net);
+ if (rc)
+ return rc;
return smc_pnet_net_init(net);
}
static void __net_exit smc_net_exit(struct net *net)
{
+ smc_sysctl_net_exit(net);
smc_pnet_net_exit(net);
}
+static __net_init int smc_net_stat_init(struct net *net)
+{
+ return smc_stats_init(net);
+}
+
+static void __net_exit smc_net_stat_exit(struct net *net)
+{
+ smc_stats_exit(net);
+}
+
static struct pernet_operations smc_net_ops = {
.init = smc_net_init,
.exit = smc_net_exit,
@@ -2037,6 +3365,11 @@ static struct pernet_operations smc_net_ops = {
.size = sizeof(struct smc_net),
};
+static struct pernet_operations smc_net_stat_ops = {
+ .init = smc_net_stat_init,
+ .exit = smc_net_stat_exit,
+};
+
static int __init smc_init(void)
{
int rc;
@@ -2045,14 +3378,39 @@ static int __init smc_init(void)
if (rc)
return rc;
- rc = smc_pnet_init();
+ rc = register_pernet_subsys(&smc_net_stat_ops);
if (rc)
goto out_pernet_subsys;
+ smc_ism_init();
+ smc_clc_init();
+
+ rc = smc_nl_init();
+ if (rc)
+ goto out_pernet_subsys_stat;
+
+ rc = smc_pnet_init();
+ if (rc)
+ goto out_nl;
+
+ rc = -ENOMEM;
+
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ if (!smc_tcp_ls_wq)
+ goto out_pnet;
+
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ if (!smc_hs_wq)
+ goto out_alloc_tcp_ls_wq;
+
+ smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ if (!smc_close_wq)
+ goto out_alloc_hs_wq;
+
rc = smc_core_init();
if (rc) {
pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_alloc_wqs;
}
rc = smc_llc_init();
@@ -2093,9 +3451,17 @@ static int __init smc_init(void)
goto out_sock;
}
+ rc = tcp_register_ulp(&smc_ulp_ops);
+ if (rc) {
+ pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
+ goto out_ib;
+ }
+
static_branch_enable(&tcp_have_smc);
return 0;
+out_ib:
+ smc_ib_unregister_client();
out_sock:
sock_unregister(PF_SMC);
out_proto6:
@@ -2104,8 +3470,18 @@ out_proto:
proto_unregister(&smc_proto);
out_core:
smc_core_exit();
+out_alloc_wqs:
+ destroy_workqueue(smc_close_wq);
+out_alloc_hs_wq:
+ destroy_workqueue(smc_hs_wq);
+out_alloc_tcp_ls_wq:
+ destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
smc_pnet_exit();
+out_nl:
+ smc_nl_exit();
+out_pernet_subsys_stat:
+ unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
unregister_pernet_subsys(&smc_net_ops);
@@ -2115,12 +3491,19 @@ out_pernet_subsys:
static void __exit smc_exit(void)
{
static_branch_disable(&tcp_have_smc);
+ tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
smc_ib_unregister_client();
+ destroy_workqueue(smc_close_wq);
+ destroy_workqueue(smc_tcp_ls_wq);
+ destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
+ smc_nl_exit();
+ smc_clc_exit();
+ unregister_pernet_subsys(&smc_net_stat_ops);
unregister_pernet_subsys(&smc_net_ops);
rcu_barrier();
}
@@ -2132,3 +3515,5 @@ MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
+MODULE_ALIAS_TCP_ULP("smc");
+MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index be11ba41190f..5ed765ea0c73 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -14,13 +14,23 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
+#include <net/genetlink.h>
#include <net/sock.h>
#include "smc_ib.h"
+#define SMC_V1 1 /* SMC version V1 */
+#define SMC_V2 2 /* SMC version V2 */
+#define SMC_RELEASE 0
+
#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
+#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM
+ * devices
+ */
+#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */
+
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -48,7 +58,20 @@ enum smc_state { /* possible states of an SMC socket */
struct smc_link_group;
struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
- u8 type;
+ union {
+ u8 type;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ struct {
+ u8 llc_version:4,
+ llc_type:4;
+ };
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ struct {
+ u8 llc_type:4,
+ llc_version:4;
+ };
+#endif
+ };
} __aligned(1);
struct smc_cdc_conn_state_flags {
@@ -118,9 +141,16 @@ enum smc_urg_state {
SMC_URG_READ = 3, /* data was already read */
};
+struct smc_mark_woken {
+ bool woken;
+ void *key;
+ wait_queue_entry_t wait_entry;
+};
+
struct smc_connection {
struct rb_node alert_node;
struct smc_link_group *lgr; /* link group of connection */
+ struct smc_link *lnk; /* assigned SMC-R link */
u32 alert_token_local; /* unique conn. id */
u8 peer_rmbe_idx; /* from tcp handshake */
int peer_rmbe_size; /* size of peer rx buffer */
@@ -142,6 +172,9 @@ struct smc_connection {
* .prod cf. TCP snd_nxt
* .cons cf. TCP sends ack
*/
+ union smc_host_cursor local_tx_ctrl_fin;
+ /* prod crsr - confirmed by peer
+ */
union smc_host_cursor tx_curs_prep; /* tx - prepared data
* snd_max..wmem_alloc
*/
@@ -153,7 +186,14 @@ struct smc_connection {
*/
atomic_t sndbuf_space; /* remaining space in sndbuf */
u16 tx_cdc_seq; /* sequence # for CDC send */
+ u16 tx_cdc_seq_fin; /* sequence # - tx completed */
spinlock_t send_lock; /* protect wr_sends */
+ atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
+ * - inc when post wqe,
+ * - dec on polled tx cqe
+ */
+ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
+ atomic_t tx_pushing; /* nr_threads trying tx push */
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
u32 tx_off; /* base offset in peer rmb */
@@ -173,6 +213,10 @@ struct smc_connection {
* data still pending
*/
char urg_rx_byte; /* urgent byte */
+ bool tx_in_release_sock;
+ /* flush pending tx data in
+ * sock release_cb()
+ */
atomic_t bytes_to_rcv; /* arrived data,
* not yet received
*/
@@ -183,17 +227,28 @@ struct smc_connection {
spinlock_t acurs_lock; /* protect cursors */
#endif
struct work_struct close_work; /* peer sent some closing */
+ struct work_struct abort_work; /* abort the connection */
struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
u8 rx_off; /* receive offset:
* 0 for SMC-R, 32 for SMC-D
*/
u64 peer_token; /* SMC-D token of peer */
u8 killed : 1; /* abnormal termination */
+ u8 freed : 1; /* normal termiation */
+ u8 out_of_sync : 1; /* out of sync with peer */
};
struct smc_sock { /* smc sock container */
struct sock sk;
struct socket *clcsock; /* internal tcp socket */
+ void (*clcsk_state_change)(struct sock *sk);
+ /* original stat_change fct. */
+ void (*clcsk_data_ready)(struct sock *sk);
+ /* original data_ready fct. */
+ void (*clcsk_write_space)(struct sock *sk);
+ /* original write_space fct. */
+ void (*clcsk_error_report)(struct sock *sk);
+ /* original error_report fct. */
struct smc_connection conn; /* smc connection */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct connect_work; /* handle non-blocking connect*/
@@ -201,9 +256,14 @@ struct smc_sock { /* smc sock container */
struct work_struct smc_listen_work;/* prepare new accept socket */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
+ bool limit_smc_hs; /* put constraint on handshake */
bool use_fallback; /* fallback to tcp */
int fallback_rsn; /* reason for fallback */
u32 peer_diagnosis; /* decline reason from peer */
+ atomic_t queued_smc_hs; /* queued smc handshakes */
+ struct inet_connection_sock_af_ops af_ops;
+ const struct inet_connection_sock_af_ops *ori_af_ops;
+ /* original af ops */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
@@ -228,10 +288,51 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
return (struct smc_sock *)sk;
}
+static inline void smc_init_saved_callbacks(struct smc_sock *smc)
+{
+ smc->clcsk_state_change = NULL;
+ smc->clcsk_data_ready = NULL;
+ smc->clcsk_write_space = NULL;
+ smc->clcsk_error_report = NULL;
+}
+
+static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
+{
+ return (struct smc_sock *)
+ ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
+}
+
+/* save target_cb in saved_cb, and replace target_cb with new_cb */
+static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *),
+ void (*new_cb)(struct sock *),
+ void (**saved_cb)(struct sock *))
+{
+ /* only save once */
+ if (!*saved_cb)
+ *saved_cb = *target_cb;
+ *target_cb = new_cb;
+}
+
+/* restore target_cb to saved_cb, and reset saved_cb to NULL */
+static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *),
+ void (**saved_cb)(struct sock *))
+{
+ if (!*saved_cb)
+ return;
+ *target_cb = *saved_cb;
+ *saved_cb = NULL;
+}
+
+extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
+extern struct workqueue_struct *smc_close_wq; /* wq for close work */
+
#define SMC_SYSTEMID_LEN 8
extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+#define ntohll(x) be64_to_cpu(x)
+#define htonll(x) cpu_to_be64(x)
+
/* convert an u32 value into network byte order, store it into a 3 byte field */
static inline void hton24(u8 *net, u32 host)
{
@@ -263,7 +364,17 @@ static inline bool using_ipsec(struct smc_sock *smc)
}
#endif
+struct smc_gidlist;
+
struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
void smc_close_non_accepted(struct sock *sk);
+void smc_fill_gid_list(struct smc_link_group *lgr,
+ struct smc_gidlist *gidlist,
+ struct smc_ib_device *known_dev, u8 *known_gid);
+
+/* smc handshake limitation interface for netlink */
+int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 164f1584861b..53f63bfbaf5f 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
struct smc_sock *smc;
int diff;
- if (!conn)
- /* already dismissed */
- return;
-
smc = container_of(conn, struct smc_sock, conn);
bh_lock_sock(&smc->sk);
if (!wc_status) {
@@ -47,25 +43,48 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
+ smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor,
+ conn);
+ conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
}
+
+ if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
+ /* If user owns the sock_lock, mark the connection need sending.
+ * User context will later try to send when it release sock_lock
+ * in smc_release_cb()
+ */
+ if (sock_owned_by_user(&smc->sk))
+ conn->tx_in_release_sock = true;
+ else
+ smc_tx_pending(conn);
+
+ if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
+ wake_up(&conn->cdc_pend_tx_wq);
+ }
+ WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
+
smc_tx_sndbuf_nonfull(smc);
bh_unlock_sock(&smc->sk);
}
int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
struct smc_wr_buf **wr_buf,
struct smc_rdma_wr **wr_rdma_buf,
struct smc_cdc_tx_pend **pend)
{
- struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
int rc;
rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
wr_rdma_buf,
(struct smc_wr_tx_pend_priv **)pend);
- if (conn->killed)
+ if (conn->killed) {
/* abnormal termination */
+ if (!rc)
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)(*pend));
rc = -EPIPE;
+ }
return rc;
}
@@ -91,39 +110,96 @@ int smc_cdc_msg_send(struct smc_connection *conn,
struct smc_wr_buf *wr_buf,
struct smc_cdc_tx_pend *pend)
{
+ struct smc_link *link = conn->lnk;
union smc_host_cursor cfed;
- struct smc_link *link;
int rc;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
-
smc_cdc_add_pending_send(conn, pend);
conn->tx_cdc_seq++;
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
if (!rc) {
smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ } else {
+ conn->tx_cdc_seq--;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ atomic_dec(&conn->cdc_pend_tx_wr);
}
return rc;
}
+/* send a validation msg indicating the move of a conn to an other QP link */
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_host_cdc_msg *local = &conn->local_tx_ctrl;
+ struct smc_link *link = conn->lnk;
+ struct smc_cdc_msg *peer;
+ int rc;
+
+ peer = (struct smc_cdc_msg *)wr_buf;
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */
+ peer->token = htonl(local->token);
+ peer->prod_flags.failover_validation = 1;
+
+ /* We need to set pend->conn here to make sure smc_cdc_tx_handler()
+ * can handle properly
+ */
+ smc_cdc_add_pending_send(conn, pend);
+
+ atomic_inc(&conn->cdc_pend_tx_wr);
+ smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
+
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (unlikely(rc))
+ atomic_dec(&conn->cdc_pend_tx_wr);
+
+ return rc;
+}
+
static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
+ struct smc_link *link;
+ bool again = false;
int rc;
- rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend);
+again:
+ link = conn->lnk;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend);
if (rc)
- return rc;
+ goto put_out;
spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, try again one time*/
+ spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ smc_wr_tx_link_put(link);
+ if (again)
+ return -ENOLINK;
+ again = true;
+ goto again;
+ }
rc = smc_cdc_msg_send(conn, wr_buf, pend);
spin_unlock_bh(&conn->send_lock);
+put_out:
+ smc_wr_tx_link_put(link);
return rc;
}
@@ -131,7 +207,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
int rc;
- if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
+ if (!smc_conn_lgr_valid(conn) ||
+ (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
return -EPIPE;
if (conn->lgr->is_smcd) {
@@ -145,31 +222,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
return rc;
}
-static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
- unsigned long data)
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
{
- struct smc_connection *conn = (struct smc_connection *)data;
- struct smc_cdc_tx_pend *cdc_pend =
- (struct smc_cdc_tx_pend *)tx_pend;
-
- return cdc_pend->conn == conn;
-}
-
-static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
-{
- struct smc_cdc_tx_pend *cdc_pend =
- (struct smc_cdc_tx_pend *)tx_pend;
-
- cdc_pend->conn = NULL;
-}
-
-void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
-{
- struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
-
- smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
- smc_cdc_tx_filter, smc_cdc_tx_dismisser,
- (unsigned long)conn);
+ wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
}
/* Send a SMC-D CDC header.
@@ -239,6 +294,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
sk_send_sigurg(&smc->sk);
}
+static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc,
+ struct smc_link *link)
+{
+ struct smc_connection *conn = &smc->conn;
+ u16 recv_seq = ntohs(cdc->seqno);
+ s16 diff;
+
+ /* check that seqnum was seen before */
+ diff = conn->local_rx_ctrl.seqno - recv_seq;
+ if (diff < 0) { /* diff larger than 0x7fff */
+ /* drop connection */
+ conn->out_of_sync = 1; /* prevent any further receives */
+ spin_lock_bh(&conn->send_lock);
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ conn->lnk = link;
+ spin_unlock_bh(&conn->send_lock);
+ sock_hold(&smc->sk); /* sock_put in abort_work */
+ if (!queue_work(smc_close_wq, &conn->abort_work))
+ sock_put(&smc->sk);
+ }
+}
+
static void smc_cdc_msg_recv_action(struct smc_sock *smc,
struct smc_cdc_msg *cdc)
{
@@ -283,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
if ((diff_cons && smc_tx_prepared_sends(conn)) ||
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
- conn->local_rx_ctrl.prod_flags.urg_data_pending)
- smc_tx_sndbuf_nonempty(conn);
+ conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+ if (!sock_owned_by_user(&smc->sk))
+ smc_tx_pending(conn);
+ else
+ conn->tx_in_release_sock = true;
+ }
if (diff_cons && conn->urg_tx_pend &&
atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
@@ -303,7 +384,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(&smc->sk, SOCK_DONE);
sock_hold(&smc->sk); /* sock_put in close_work */
- if (!schedule_work(&conn->close_work))
+ if (!queue_work(smc_close_wq, &conn->close_work))
sock_put(&smc->sk);
}
}
@@ -324,9 +405,9 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
* Context:
* - tasklet context
*/
-static void smcd_cdc_rx_tsklet(unsigned long data)
+static void smcd_cdc_rx_tsklet(struct tasklet_struct *t)
{
- struct smc_connection *conn = (struct smc_connection *)data;
+ struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet);
struct smcd_cdc_msg *data_cdc;
struct smcd_cdc_msg cdc;
struct smc_sock *smc;
@@ -346,7 +427,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data)
*/
void smcd_cdc_rx_init(struct smc_connection *conn)
{
- tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
+ tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet);
}
/***************************** init, exit, misc ******************************/
@@ -369,16 +450,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
read_lock_bh(&lgr->conns_lock);
conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
read_unlock_bh(&lgr->conns_lock);
- if (!conn)
+ if (!conn || conn->out_of_sync)
return;
smc = container_of(conn, struct smc_sock, conn);
- if (!cdc->prod_flags.failover_validation) {
- if (smc_cdc_before(ntohs(cdc->seqno),
- conn->local_rx_ctrl.seqno))
- /* received seqno is old */
- return;
+ if (cdc->prod_flags.failover_validation) {
+ smc_cdc_msg_validate(smc, cdc, link);
+ return;
}
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+
smc_cdc_msg_recv(smc, cdc);
}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 861dc24c588c..696cc11f2303 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs,
}
}
-/* SMC cursors are 8 bytes long and require atomic reading and writing */
-static inline u64 smc_curs_read(union smc_host_cursor *curs,
- struct smc_connection *conn)
-{
-#ifndef KERNEL_HAS_ATOMIC64
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&conn->acurs_lock, flags);
- ret = curs->acurs;
- spin_unlock_irqrestore(&conn->acurs_lock, flags);
- return ret;
-#else
- return atomic64_read(&curs->acurs);
-#endif
-}
-
/* Copy cursor src into tgt */
static inline void smc_curs_copy(union smc_host_cursor *tgt,
union smc_host_cursor *src,
@@ -304,14 +287,18 @@ struct smc_cdc_tx_pend {
};
int smc_cdc_get_free_slot(struct smc_connection *conn,
+ struct smc_link *link,
struct smc_wr_buf **wr_buf,
struct smc_rdma_wr **wr_rdma_buf,
struct smc_cdc_tx_pend **pend);
-void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
+void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
struct smc_cdc_tx_pend *pend);
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
int smcd_cdc_msg_send(struct smc_connection *conn);
+int smcr_cdc_msg_send_validation(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf);
int smc_cdc_init(void) __init;
void smcd_cdc_rx_init(struct smc_connection *conn);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 86cccc24e52e..1472f31480d8 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -14,6 +14,8 @@
#include <linux/inetdevice.h>
#include <linux/if_ether.h>
#include <linux/sched/signal.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <net/addrconf.h>
#include <net/sock.h>
@@ -24,22 +26,415 @@
#include "smc_clc.h"
#include "smc_ib.h"
#include "smc_ism.h"
+#include "smc_netlink.h"
#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108
+#define SMC_CLC_RECV_BUF_LEN 100
/* eye catcher "SMCR" EBCDIC for CLC messages */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
/* eye catcher "SMCD" EBCDIC for CLC messages */
static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
+static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN];
+
+struct smc_clc_eid_table {
+ rwlock_t lock;
+ struct list_head list;
+ u8 ueid_cnt;
+ u8 seid_enabled;
+};
+
+static struct smc_clc_eid_table smc_clc_eid_table;
+
+struct smc_clc_eid_entry {
+ struct list_head list;
+ u8 eid[SMC_MAX_EID_LEN];
+};
+
+/* The size of a user EID is 32 characters.
+ * Valid characters should be (single-byte character set) A-Z, 0-9, '.' and '-'.
+ * Blanks should only be used to pad to the expected size.
+ * First character must be alphanumeric.
+ */
+static bool smc_clc_ueid_valid(char *ueid)
+{
+ char *end = ueid + SMC_MAX_EID_LEN;
+
+ while (--end >= ueid && isspace(*end))
+ ;
+ if (end < ueid)
+ return false;
+ if (!isalnum(*ueid) || islower(*ueid))
+ return false;
+ while (ueid <= end) {
+ if ((!isalnum(*ueid) || islower(*ueid)) && *ueid != '.' &&
+ *ueid != '-')
+ return false;
+ ueid++;
+ }
+ return true;
+}
+
+static int smc_clc_ueid_add(char *ueid)
+{
+ struct smc_clc_eid_entry *new_ueid, *tmp_ueid;
+ int rc;
+
+ if (!smc_clc_ueid_valid(ueid))
+ return -EINVAL;
+
+ /* add a new ueid entry to the ueid table if there isn't one */
+ new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL);
+ if (!new_ueid)
+ return -ENOMEM;
+ memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN);
+
+ write_lock(&smc_clc_eid_table.lock);
+ if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) {
+ rc = -ERANGE;
+ goto err_out;
+ }
+ list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) {
+ if (!memcmp(tmp_ueid->eid, ueid, SMC_MAX_EID_LEN)) {
+ rc = -EEXIST;
+ goto err_out;
+ }
+ }
+ list_add_tail(&new_ueid->list, &smc_clc_eid_table.list);
+ smc_clc_eid_table.ueid_cnt++;
+ write_unlock(&smc_clc_eid_table.lock);
+ return 0;
+
+err_out:
+ write_unlock(&smc_clc_eid_table.lock);
+ kfree(new_ueid);
+ return rc;
+}
+
+int smc_clc_ueid_count(void)
+{
+ int count;
+
+ read_lock(&smc_clc_eid_table.lock);
+ count = smc_clc_eid_table.ueid_cnt;
+ read_unlock(&smc_clc_eid_table.lock);
+
+ return count;
+}
+
+int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY];
+ char *ueid;
+
+ if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1)
+ return -EINVAL;
+ ueid = (char *)nla_data(nla_ueid);
+
+ return smc_clc_ueid_add(ueid);
+}
+
+/* remove one or all ueid entries from the table */
+static int smc_clc_ueid_remove(char *ueid)
+{
+ struct smc_clc_eid_entry *lst_ueid, *tmp_ueid;
+ int rc = -ENOENT;
+
+ /* remove table entry */
+ write_lock(&smc_clc_eid_table.lock);
+ list_for_each_entry_safe(lst_ueid, tmp_ueid, &smc_clc_eid_table.list,
+ list) {
+ if (!ueid || !memcmp(lst_ueid->eid, ueid, SMC_MAX_EID_LEN)) {
+ list_del(&lst_ueid->list);
+ smc_clc_eid_table.ueid_cnt--;
+ kfree(lst_ueid);
+ rc = 0;
+ }
+ }
+ if (!rc && !smc_clc_eid_table.ueid_cnt) {
+ smc_clc_eid_table.seid_enabled = 1;
+ rc = -EAGAIN; /* indicate success and enabling of seid */
+ }
+ write_unlock(&smc_clc_eid_table.lock);
+ return rc;
+}
+
+int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY];
+ char *ueid;
+
+ if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1)
+ return -EINVAL;
+ ueid = (char *)nla_data(nla_ueid);
+
+ return smc_clc_ueid_remove(ueid);
+}
+
+int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info)
+{
+ smc_clc_ueid_remove(NULL);
+ return 0;
+}
+
+static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq,
+ u32 flags, char *ueid)
+{
+ char ueid_str[SMC_MAX_EID_LEN + 1];
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &smc_gen_nl_family,
+ flags, SMC_NETLINK_DUMP_UEID);
+ if (!hdr)
+ return -ENOMEM;
+ memcpy(ueid_str, ueid, SMC_MAX_EID_LEN);
+ ueid_str[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) {
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+ }
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq,
+ int start_idx)
+{
+ struct smc_clc_eid_entry *lst_ueid;
+ int idx = 0;
+
+ read_lock(&smc_clc_eid_table.lock);
+ list_for_each_entry(lst_ueid, &smc_clc_eid_table.list, list) {
+ if (idx++ < start_idx)
+ continue;
+ if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI,
+ lst_ueid->eid)) {
+ --idx;
+ break;
+ }
+ }
+ read_unlock(&smc_clc_eid_table.lock);
+ return idx;
+}
+
+int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int idx;
+
+ idx = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb_ctx->pos[0]);
+
+ cb_ctx->pos[0] = idx;
+ return skb->len;
+}
+
+int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ char seid_str[SMC_MAX_EID_LEN + 1];
+ u8 seid_enabled;
+ void *hdr;
+ u8 *seid;
+
+ if (cb_ctx->pos[0])
+ return skb->len;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_DUMP_SEID);
+ if (!hdr)
+ return -ENOMEM;
+ if (!smc_ism_is_v2_capable())
+ goto end;
+
+ smc_ism_get_system_eid(&seid);
+ memcpy(seid_str, seid, SMC_MAX_EID_LEN);
+ seid_str[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str))
+ goto err;
+ read_lock(&smc_clc_eid_table.lock);
+ seid_enabled = smc_clc_eid_table.seid_enabled;
+ read_unlock(&smc_clc_eid_table.lock);
+ if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled))
+ goto err;
+end:
+ genlmsg_end(skb, hdr);
+ cb_ctx->pos[0]++;
+ return skb->len;
+err:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info)
+{
+ write_lock(&smc_clc_eid_table.lock);
+ smc_clc_eid_table.seid_enabled = 1;
+ write_unlock(&smc_clc_eid_table.lock);
+ return 0;
+}
+
+int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+
+ write_lock(&smc_clc_eid_table.lock);
+ if (!smc_clc_eid_table.ueid_cnt)
+ rc = -ENOENT;
+ else
+ smc_clc_eid_table.seid_enabled = 0;
+ write_unlock(&smc_clc_eid_table.lock);
+ return rc;
+}
+
+static bool _smc_clc_match_ueid(u8 *peer_ueid)
+{
+ struct smc_clc_eid_entry *tmp_ueid;
+
+ list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) {
+ if (!memcmp(tmp_ueid->eid, peer_ueid, SMC_MAX_EID_LEN))
+ return true;
+ }
+ return false;
+}
+
+bool smc_clc_match_eid(u8 *negotiated_eid,
+ struct smc_clc_v2_extension *smc_v2_ext,
+ u8 *peer_eid, u8 *local_eid)
+{
+ bool match = false;
+ int i;
+
+ negotiated_eid[0] = 0;
+ read_lock(&smc_clc_eid_table.lock);
+ if (peer_eid && local_eid &&
+ smc_clc_eid_table.seid_enabled &&
+ smc_v2_ext->hdr.flag.seid &&
+ !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) {
+ memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN);
+ match = true;
+ goto out;
+ }
+
+ for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) {
+ if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) {
+ memcpy(negotiated_eid, smc_v2_ext->user_eids[i],
+ SMC_MAX_EID_LEN);
+ match = true;
+ goto out;
+ }
+ }
+out:
+ read_unlock(&smc_clc_eid_table.lock);
+ return match;
+}
+
+/* check arriving CLC proposal */
+static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc)
+{
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_hdr *hdr = &pclc->hdr;
+ struct smc_clc_v2_extension *v2_ext;
+
+ v2_ext = smc_get_clc_v2_ext(pclc);
+ pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+ if (hdr->version == SMC_V1) {
+ if (hdr->typev1 == SMC_TYPE_N)
+ return false;
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) + ntohs(pclc->iparea_offset) +
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ } else {
+ if (ntohs(hdr->length) !=
+ sizeof(*pclc) +
+ sizeof(struct smc_clc_msg_smcd) +
+ (hdr->typev1 != SMC_TYPE_N ?
+ sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(struct smc_clc_ipv6_prefix) : 0) +
+ (hdr->typev2 != SMC_TYPE_N ?
+ sizeof(*v2_ext) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) +
+ (smcd_indicated(hdr->typev2) ?
+ sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid) :
+ 0) +
+ sizeof(struct smc_clc_msg_trail))
+ return false;
+ }
+ return true;
+}
+
+/* check arriving CLC accept or confirm */
+static bool
+smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
+{
+ struct smc_clc_msg_hdr *hdr = &clc_v2->hdr;
+
+ if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+ return false;
+ if (hdr->version == SMC_V1) {
+ if ((hdr->typev1 == SMC_TYPE_R &&
+ ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+ (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
+ return false;
+ } else {
+ if (hdr->typev1 == SMC_TYPE_D &&
+ ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 &&
+ (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 +
+ sizeof(struct smc_clc_first_contact_ext)))
+ return false;
+ if (hdr->typev1 == SMC_TYPE_R &&
+ ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2)
+ return false;
+ }
+ return true;
+}
+
+/* check arriving CLC decline */
+static bool
+smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc)
+{
+ struct smc_clc_msg_hdr *hdr = &dclc->hdr;
+
+ if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+ return false;
+ if (hdr->version == SMC_V1) {
+ if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline))
+ return false;
+ } else {
+ if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2))
+ return false;
+ }
+ return true;
+}
+
+static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len)
+{
+ memset(fce, 0, sizeof(*fce));
+ fce->os_type = SMC_CLC_OS_LINUX;
+ fce->release = SMC_RELEASE;
+ memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname));
+ (*len) += sizeof(*fce);
+}
+
/* check if received message has a correct header length and contains valid
* heading and trailing eyecatchers
*/
-static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
+static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
{
- struct smc_clc_msg_proposal_prefix *pclc_prfx;
- struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2;
struct smc_clc_msg_proposal *pclc;
struct smc_clc_msg_decline *dclc;
struct smc_clc_msg_trail *trl;
@@ -49,44 +444,32 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
return false;
switch (clcm->type) {
case SMC_CLC_PROPOSAL:
- if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
- clcm->path != SMC_TYPE_B)
- return false;
pclc = (struct smc_clc_msg_proposal *)clcm;
- pclc_prfx = smc_clc_proposal_get_prefix(pclc);
- if (ntohs(pclc->hdr.length) !=
- sizeof(*pclc) + ntohs(pclc->iparea_offset) +
- sizeof(*pclc_prfx) +
- pclc_prfx->ipv6_prefixes_cnt *
- sizeof(struct smc_clc_ipv6_prefix) +
- sizeof(*trl))
+ if (!smc_clc_msg_prop_valid(pclc))
return false;
trl = (struct smc_clc_msg_trail *)
((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
break;
case SMC_CLC_ACCEPT:
case SMC_CLC_CONFIRM:
- if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D)
- return false;
- clc = (struct smc_clc_msg_accept_confirm *)clcm;
- if ((clcm->path == SMC_TYPE_R &&
- ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
- (clcm->path == SMC_TYPE_D &&
- ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
+ clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm;
+ if (!smc_clc_msg_acc_conf_valid(clc_v2))
return false;
trl = (struct smc_clc_msg_trail *)
- ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl));
+ ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) -
+ sizeof(*trl));
break;
case SMC_CLC_DECLINE:
dclc = (struct smc_clc_msg_decline *)clcm;
- if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+ if (!smc_clc_msg_decl_valid(dclc))
return false;
- trl = &dclc->trl;
+ check_trl = false;
break;
default:
return false;
}
- if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ if (check_trl &&
+ memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
return false;
return true;
@@ -154,7 +537,6 @@ static int smc_clc_prfx_set(struct socket *clcsock,
struct sockaddr_in *addr;
int rc = -ENOENT;
- memset(prop, 0, sizeof(*prop));
if (!dst) {
rc = -ENOTCONN;
goto out;
@@ -164,7 +546,8 @@ static int smc_clc_prfx_set(struct socket *clcsock,
goto out_rel;
}
/* get address to which the internal TCP socket is bound */
- kernel_getsockname(clcsock, (struct sockaddr *)&addrs);
+ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0)
+ goto out_rel;
/* analyze IP specific data of net_device belonging to TCP socket */
addr6 = (struct sockaddr_in6 *)&addrs;
rcu_read_lock();
@@ -276,7 +659,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
struct msghdr msg = {NULL, 0};
int reason_code = 0;
struct kvec vec = {buf, buflen};
- int len, datlen;
+ int len, datlen, recvlen;
+ bool check_trl = true;
int krflags;
/* peek the first few bytes to determine length of data to receive
@@ -320,10 +704,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
- (datlen > buflen) ||
- (clcm->version != SMC_CLC_V1) ||
- (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
- clcm->path != SMC_TYPE_B) ||
+ (clcm->version < SMC_V1) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -333,23 +714,43 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
/* receive the complete CLC message */
memset(&msg, 0, sizeof(struct msghdr));
- iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen);
+ if (datlen > buflen) {
+ check_trl = false;
+ recvlen = buflen;
+ } else {
+ recvlen = datlen;
+ }
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
krflags = MSG_WAITALL;
len = sock_recvmsg(smc->clcsock, &msg, krflags);
- if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
+ if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, check_trl)) {
smc->sk.sk_err = EPROTO;
reason_code = -EPROTO;
goto out;
}
+ datlen -= len;
+ while (datlen) {
+ u8 tmp[SMC_CLC_RECV_BUF_LEN];
+
+ vec.iov_base = &tmp;
+ vec.iov_len = SMC_CLC_RECV_BUF_LEN;
+ /* receive remaining proposal message */
+ recvlen = datlen > SMC_CLC_RECV_BUF_LEN ?
+ SMC_CLC_RECV_BUF_LEN : datlen;
+ iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen);
+ len = sock_recvmsg(smc->clcsock, &msg, krflags);
+ datlen -= len;
+ }
if (clcm->type == SMC_CLC_DECLINE) {
struct smc_clc_msg_decline *dclc;
dclc = (struct smc_clc_msg_decline *)clcm;
reason_code = SMC_CLC_DECL_PEERDECL;
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
- if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
+ if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 &
+ SMC_FIRST_CONTACT_MASK) {
smc->conn.lgr->sync_err = 1;
- smc_lgr_terminate(smc->conn.lgr, true);
+ smc_lgr_terminate_sched(smc->conn.lgr);
}
}
@@ -359,172 +760,369 @@ out:
}
/* send CLC DECLINE message across internal TCP socket */
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
{
- struct smc_clc_msg_decline dclc;
+ struct smc_clc_msg_decline *dclc_v1;
+ struct smc_clc_msg_decline_v2 dclc;
struct msghdr msg;
+ int len, send_len;
struct kvec vec;
- int len;
+ dclc_v1 = (struct smc_clc_msg_decline *)&dclc;
memset(&dclc, 0, sizeof(dclc));
memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
dclc.hdr.type = SMC_CLC_DECLINE;
- dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
- dclc.hdr.version = SMC_CLC_V1;
- dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
- if (smc->conn.lgr && !smc->conn.lgr->is_smcd)
+ dclc.hdr.version = version;
+ dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
+ dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
+ SMC_FIRST_CONTACT_MASK : 0;
+ if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) &&
+ smc_ib_is_valid_local_systemid())
memcpy(dclc.id_for_peer, local_systemid,
sizeof(local_systemid));
dclc.peer_diagnosis = htonl(peer_diag_info);
- memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ if (version == SMC_V1) {
+ memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ send_len = sizeof(*dclc_v1);
+ } else {
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ send_len = sizeof(dclc);
+ }
+ dclc.hdr.length = htons(send_len);
memset(&msg, 0, sizeof(msg));
vec.iov_base = &dclc;
- vec.iov_len = sizeof(struct smc_clc_msg_decline);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
- sizeof(struct smc_clc_msg_decline));
- if (len < 0 || len < sizeof(struct smc_clc_msg_decline))
+ vec.iov_len = send_len;
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len);
+ if (len < 0 || len < send_len)
len = -EPROTO;
return len > 0 ? 0 : len;
}
/* send CLC PROPOSAL message across internal TCP socket */
-int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_init_info *ini)
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
{
- struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
- struct smc_clc_msg_proposal_prefix pclc_prfx;
- struct smc_clc_msg_smcd pclc_smcd;
- struct smc_clc_msg_proposal pclc;
- struct smc_clc_msg_trail trl;
+ struct smc_clc_smcd_v2_extension *smcd_v2_ext;
+ struct smc_clc_msg_proposal_prefix *pclc_prfx;
+ struct smc_clc_msg_proposal *pclc_base;
+ struct smc_clc_smcd_gid_chid *gidchids;
+ struct smc_clc_msg_proposal_area *pclc;
+ struct smc_clc_ipv6_prefix *ipv6_prfx;
+ struct smc_clc_v2_extension *v2_ext;
+ struct smc_clc_msg_smcd *pclc_smcd;
+ struct smc_clc_msg_trail *trl;
int len, i, plen, rc;
int reason_code = 0;
- struct kvec vec[5];
+ struct kvec vec[8];
struct msghdr msg;
+ pclc = kzalloc(sizeof(*pclc), GFP_KERNEL);
+ if (!pclc)
+ return -ENOMEM;
+
+ pclc_base = &pclc->pclc_base;
+ pclc_smcd = &pclc->pclc_smcd;
+ pclc_prfx = &pclc->pclc_prfx;
+ ipv6_prfx = pclc->pclc_prfx_ipv6;
+ v2_ext = &pclc->pclc_v2_ext;
+ smcd_v2_ext = &pclc->pclc_smcd_v2_ext;
+ gidchids = pclc->pclc_gidchids;
+ trl = &pclc->pclc_trl;
+
+ pclc_base->hdr.version = SMC_V2;
+ pclc_base->hdr.typev1 = ini->smc_type_v1;
+ pclc_base->hdr.typev2 = ini->smc_type_v2;
+ plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl);
+
/* retrieve ip prefixes for CLC proposal msg */
- rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
- if (rc)
- return SMC_CLC_DECL_CNFERR; /* configuration error */
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx);
+ if (rc) {
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ kfree(pclc);
+ return SMC_CLC_DECL_CNFERR;
+ }
+ pclc_base->hdr.typev1 = SMC_TYPE_N;
+ } else {
+ pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
+ plen += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
+ }
- /* send SMC Proposal CLC message */
- plen = sizeof(pclc) + sizeof(pclc_prfx) +
- (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
- sizeof(trl);
- memset(&pclc, 0, sizeof(pclc));
- memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
- pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.version = SMC_CLC_V1; /* SMC version */
- pclc.hdr.path = smc_type;
- if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
+ /* build SMC Proposal CLC message */
+ memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ pclc_base->hdr.type = SMC_CLC_PROPOSAL;
+ if (smcr_indicated(ini->smc_type_v1)) {
/* add SMC-R specifics */
- memcpy(pclc.lcl.id_for_peer, local_systemid,
+ memcpy(pclc_base->lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
+ memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE);
+ memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1],
ETH_ALEN);
- pclc.iparea_offset = htons(0);
}
- if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ if (smcd_indicated(ini->smc_type_v1)) {
/* add SMC-D specifics */
- memset(&pclc_smcd, 0, sizeof(pclc_smcd));
- plen += sizeof(pclc_smcd);
- pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
- pclc_smcd.gid = ini->ism_dev->local_gid;
+ if (ini->ism_dev[0]) {
+ pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid);
+ pclc_smcd->ism.chid =
+ htons(smc_ism_get_chid(ini->ism_dev[0]));
+ }
}
- pclc.hdr.length = htons(plen);
+ if (ini->smc_type_v2 == SMC_TYPE_N) {
+ pclc_smcd->v2_ext_offset = 0;
+ } else {
+ struct smc_clc_eid_entry *ueident;
+ u16 v2_ext_offset;
+
+ v2_ext->hdr.flag.release = SMC_RELEASE;
+ v2_ext_offset = sizeof(*pclc_smcd) -
+ offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
+ if (ini->smc_type_v1 != SMC_TYPE_N)
+ v2_ext_offset += sizeof(*pclc_prfx) +
+ pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
+ plen += sizeof(*v2_ext);
- memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ read_lock(&smc_clc_eid_table.lock);
+ v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt;
+ plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN;
+ i = 0;
+ list_for_each_entry(ueident, &smc_clc_eid_table.list, list) {
+ memcpy(v2_ext->user_eids[i++], ueident->eid,
+ sizeof(ueident->eid));
+ }
+ read_unlock(&smc_clc_eid_table.lock);
+ }
+ if (smcd_indicated(ini->smc_type_v2)) {
+ u8 *eid = NULL;
+
+ v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled;
+ v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
+ v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
+ offsetofend(struct smc_clnt_opts_area_hdr,
+ smcd_v2_ext_offset) +
+ v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ smc_ism_get_system_eid(&eid);
+ if (eid && v2_ext->hdr.flag.seid)
+ memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
+ plen += sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ for (i = 1; i <= ini->ism_offered_cnt; i++) {
+ gidchids[i - 1].gid =
+ htonll(ini->ism_dev[i]->local_gid);
+ gidchids[i - 1].chid =
+ htons(smc_ism_get_chid(ini->ism_dev[i]));
+ }
+ plen += ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ if (smcr_indicated(ini->smc_type_v2))
+ memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE);
+
+ pclc_base->hdr.length = htons(plen);
+ memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ /* send SMC Proposal CLC message */
memset(&msg, 0, sizeof(msg));
i = 0;
- vec[i].iov_base = &pclc;
- vec[i++].iov_len = sizeof(pclc);
- if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
- vec[i].iov_base = &pclc_smcd;
- vec[i++].iov_len = sizeof(pclc_smcd);
- }
- vec[i].iov_base = &pclc_prfx;
- vec[i++].iov_len = sizeof(pclc_prfx);
- if (pclc_prfx.ipv6_prefixes_cnt > 0) {
- vec[i].iov_base = &ipv6_prfx[0];
- vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
- sizeof(ipv6_prfx[0]);
+ vec[i].iov_base = pclc_base;
+ vec[i++].iov_len = sizeof(*pclc_base);
+ vec[i].iov_base = pclc_smcd;
+ vec[i++].iov_len = sizeof(*pclc_smcd);
+ if (ini->smc_type_v1 != SMC_TYPE_N) {
+ vec[i].iov_base = pclc_prfx;
+ vec[i++].iov_len = sizeof(*pclc_prfx);
+ if (pclc_prfx->ipv6_prefixes_cnt > 0) {
+ vec[i].iov_base = ipv6_prfx;
+ vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt *
+ sizeof(ipv6_prfx[0]);
+ }
}
- vec[i].iov_base = &trl;
- vec[i++].iov_len = sizeof(trl);
+ if (ini->smc_type_v2 != SMC_TYPE_N) {
+ vec[i].iov_base = v2_ext;
+ vec[i++].iov_len = sizeof(*v2_ext) +
+ (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
+ if (smcd_indicated(ini->smc_type_v2)) {
+ vec[i].iov_base = smcd_v2_ext;
+ vec[i++].iov_len = sizeof(*smcd_v2_ext);
+ if (ini->ism_offered_cnt) {
+ vec[i].iov_base = gidchids;
+ vec[i++].iov_len = ini->ism_offered_cnt *
+ sizeof(struct smc_clc_smcd_gid_chid);
+ }
+ }
+ }
+ vec[i].iov_base = trl;
+ vec[i++].iov_len = sizeof(*trl);
/* due to the few bytes needed for clc-handshake this cannot block */
len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
if (len < 0) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
reason_code = -smc->sk.sk_err;
- } else if (len < (int)sizeof(pclc)) {
+ } else if (len < ntohs(pclc_base->hdr.length)) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
}
+ kfree(pclc);
return reason_code;
}
-/* send CLC CONFIRM message across internal TCP socket */
-int smc_clc_send_confirm(struct smc_sock *smc)
+/* build and send CLC CONFIRM / ACCEPT message */
+static int smc_clc_send_confirm_accept(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm_v2 *clc_v2,
+ int first_contact, u8 version,
+ u8 *eid, struct smc_init_info *ini)
{
struct smc_connection *conn = &smc->conn;
- struct smc_clc_msg_accept_confirm cclc;
- struct smc_link *link;
- int reason_code = 0;
+ struct smc_clc_msg_accept_confirm *clc;
+ struct smc_clc_first_contact_ext fce;
+ struct smc_clc_fce_gid_ext gle;
+ struct smc_clc_msg_trail trl;
+ struct kvec vec[5];
struct msghdr msg;
- struct kvec vec;
- int len;
+ int i, len;
/* send SMC Confirm CLC msg */
- memset(&cclc, 0, sizeof(cclc));
- cclc.hdr.type = SMC_CLC_CONFIRM;
- cclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (smc->conn.lgr->is_smcd) {
+ clc = (struct smc_clc_msg_accept_confirm *)clc_v2;
+ clc->hdr.version = version; /* SMC version */
+ if (first_contact)
+ clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK;
+ if (conn->lgr->is_smcd) {
/* SMC-D specific settings */
- memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_D;
- cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- cclc.gid = conn->lgr->smcd->local_gid;
- cclc.token = conn->rmb_desc->token;
- cclc.dmbe_size = conn->rmbe_size_short;
- cclc.dmbe_idx = 0;
- memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ clc->hdr.typev1 = SMC_TYPE_D;
+ clc->d0.gid = conn->lgr->smcd->local_gid;
+ clc->d0.token = conn->rmb_desc->token;
+ clc->d0.dmbe_size = conn->rmbe_size_short;
+ clc->d0.dmbe_idx = 0;
+ memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ if (version == SMC_V1) {
+ clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ } else {
+ clc_v2->d1.chid =
+ htons(smc_ism_get_chid(conn->lgr->smcd));
+ if (eid && eid[0])
+ memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN);
+ len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
+ if (first_contact)
+ smc_clc_fill_fce(&fce, &len);
+ clc_v2->hdr.length = htons(len);
+ }
+ memcpy(trl.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
} else {
+ struct smc_link *link = conn->lnk;
+
/* SMC-R specific settings */
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
- memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+ memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER,
sizeof(SMC_EYECATCHER));
- cclc.hdr.path = SMC_TYPE_R;
- cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(cclc.lcl.id_for_peer, local_systemid,
+ clc->hdr.typev1 = SMC_TYPE_R;
+ clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(clc->r0.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
ETH_ALEN);
- hton24(cclc.qpn, link->roce_qp->qp_num);
- cclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
- cclc.rmbe_alert_token = htonl(conn->alert_token_local);
- cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
- cclc.rmbe_size = conn->rmbe_size_short;
- cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
- (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(cclc.psn, link->psn_initial);
- memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
+ hton24(clc->r0.qpn, link->roce_qp->qp_num);
+ clc->r0.rmb_rkey =
+ htonl(conn->rmb_desc->mr[link->link_idx]->rkey);
+ clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ clc->r0.rmbe_alert_token = htonl(conn->alert_token_local);
+ switch (clc->hdr.type) {
+ case SMC_CLC_ACCEPT:
+ clc->r0.qp_mtu = link->path_mtu;
+ break;
+ case SMC_CLC_CONFIRM:
+ clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ break;
+ }
+ clc->r0.rmbe_size = conn->rmbe_size_short;
+ clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ?
+ cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) :
+ cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[link->link_idx].sgl));
+ hton24(clc->r0.psn, link->psn_initial);
+ if (version == SMC_V1) {
+ clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ } else {
+ if (eid && eid[0])
+ memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN);
+ len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2;
+ if (first_contact) {
+ smc_clc_fill_fce(&fce, &len);
+ fce.v2_direct = !link->lgr->uses_gateway;
+ memset(&gle, 0, sizeof(gle));
+ if (ini && clc->hdr.type == SMC_CLC_CONFIRM) {
+ gle.gid_cnt = ini->smcrv2.gidlist.len;
+ len += sizeof(gle);
+ len += gle.gid_cnt * sizeof(gle.gid[0]);
+ } else {
+ len += sizeof(gle.reserved);
+ }
+ }
+ clc_v2->hdr.length = htons(len);
+ }
+ memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
}
memset(&msg, 0, sizeof(msg));
- vec.iov_base = &cclc;
- vec.iov_len = ntohs(cclc.hdr.length);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
- ntohs(cclc.hdr.length));
- if (len < ntohs(cclc.hdr.length)) {
+ i = 0;
+ vec[i].iov_base = clc_v2;
+ if (version > SMC_V1)
+ vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+ SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 :
+ SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) -
+ sizeof(trl);
+ else
+ vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+ SMCD_CLC_ACCEPT_CONFIRM_LEN :
+ SMCR_CLC_ACCEPT_CONFIRM_LEN) -
+ sizeof(trl);
+ if (version > SMC_V1 && first_contact) {
+ vec[i].iov_base = &fce;
+ vec[i++].iov_len = sizeof(fce);
+ if (!conn->lgr->is_smcd) {
+ if (clc->hdr.type == SMC_CLC_CONFIRM) {
+ vec[i].iov_base = &gle;
+ vec[i++].iov_len = sizeof(gle);
+ vec[i].iov_base = &ini->smcrv2.gidlist.list;
+ vec[i++].iov_len = gle.gid_cnt *
+ sizeof(gle.gid[0]);
+ } else {
+ vec[i].iov_base = &gle.reserved;
+ vec[i++].iov_len = sizeof(gle.reserved);
+ }
+ }
+ }
+ vec[i].iov_base = &trl;
+ vec[i++].iov_len = sizeof(trl);
+ return kernel_sendmsg(smc->clcsock, &msg, vec, 1,
+ ntohs(clc->hdr.length));
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version, u8 *eid, struct smc_init_info *ini)
+{
+ struct smc_clc_msg_accept_confirm_v2 cclc_v2;
+ int reason_code = 0;
+ int len;
+
+ /* send SMC Confirm CLC msg */
+ memset(&cclc_v2, 0, sizeof(cclc_v2));
+ cclc_v2.hdr.type = SMC_CLC_CONFIRM;
+ len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
+ version, eid, ini);
+ if (len < ntohs(cclc_v2.hdr.length)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
@@ -537,67 +1135,43 @@ int smc_clc_send_confirm(struct smc_sock *smc)
}
/* send CLC ACCEPT message across internal TCP socket */
-int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
+ u8 version, u8 *negotiated_eid)
{
- struct smc_connection *conn = &new_smc->conn;
- struct smc_clc_msg_accept_confirm aclc;
- struct smc_link *link;
- struct msghdr msg;
- struct kvec vec;
+ struct smc_clc_msg_accept_confirm_v2 aclc_v2;
int len;
- memset(&aclc, 0, sizeof(aclc));
- aclc.hdr.type = SMC_CLC_ACCEPT;
- aclc.hdr.version = SMC_CLC_V1; /* SMC version */
- if (srv_first_contact)
- aclc.hdr.flag = 1;
-
- if (new_smc->conn.lgr->is_smcd) {
- /* SMC-D specific settings */
- aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_D;
- aclc.gid = conn->lgr->smcd->local_gid;
- aclc.token = conn->rmb_desc->token;
- aclc.dmbe_size = conn->rmbe_size_short;
- aclc.dmbe_idx = 0;
- memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
- memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
- sizeof(SMCD_EYECATCHER));
- } else {
- /* SMC-R specific settings */
- aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
- memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- aclc.hdr.path = SMC_TYPE_R;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
- memcpy(aclc.lcl.id_for_peer, local_systemid,
- sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
- memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
- ETH_ALEN);
- hton24(aclc.qpn, link->roce_qp->qp_num);
- aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
- aclc.rmbe_alert_token = htonl(conn->alert_token_local);
- aclc.qp_mtu = link->path_mtu;
- aclc.rmbe_size = conn->rmbe_size_short,
- aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
- (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(aclc.psn, link->psn_initial);
- memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
- sizeof(SMC_EYECATCHER));
- }
-
- memset(&msg, 0, sizeof(msg));
- vec.iov_base = &aclc;
- vec.iov_len = ntohs(aclc.hdr.length);
- len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
- ntohs(aclc.hdr.length));
- if (len < ntohs(aclc.hdr.length))
+ memset(&aclc_v2, 0, sizeof(aclc_v2));
+ aclc_v2.hdr.type = SMC_CLC_ACCEPT;
+ len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
+ version, negotiated_eid, NULL);
+ if (len < ntohs(aclc_v2.hdr.length))
len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
return len > 0 ? 0 : len;
}
+
+void smc_clc_get_hostname(u8 **host)
+{
+ *host = &smc_hostname[0];
+}
+
+void __init smc_clc_init(void)
+{
+ struct new_utsname *u;
+
+ memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */
+ u = utsname();
+ memcpy(smc_hostname, u->nodename,
+ min_t(size_t, strlen(u->nodename), sizeof(smc_hostname)));
+
+ INIT_LIST_HEAD(&smc_clc_eid_table.list);
+ rwlock_init(&smc_clc_eid_table.lock);
+ smc_clc_eid_table.ueid_cnt = 0;
+ smc_clc_eid_table.seid_enabled = 1;
+}
+
+void smc_clc_exit(void)
+{
+ smc_clc_ueid_remove(NULL);
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index ca209272e5fa..5fee545c9a10 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -14,17 +14,19 @@
#define _SMC_CLC_H
#include <rdma/ib_verbs.h>
+#include <linux/smc.h>
#include "smc.h"
+#include "smc_netlink.h"
#define SMC_CLC_PROPOSAL 0x01
#define SMC_CLC_ACCEPT 0x02
#define SMC_CLC_CONFIRM 0x03
#define SMC_CLC_DECLINE 0x04
-#define SMC_CLC_V1 0x1 /* SMC version */
#define SMC_TYPE_R 0 /* SMC-R only */
#define SMC_TYPE_D 1 /* SMC-D only */
+#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
@@ -37,19 +39,32 @@
#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
-#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */
+#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */
+#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */
+#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */
+#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */
+#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */
+#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */
#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
+#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */
+#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */
+#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */
+#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */
+#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */
+#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. indirect mismatch*/
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
-#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */
+#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */
+
+#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
@@ -57,13 +72,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */
__be16 length;
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
- flag : 1,
- rsvd : 1,
- path : 2;
+ typev2 : 2,
+ typev1 : 2;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 path : 2,
- rsvd : 1,
- flag : 1,
+ u8 typev1 : 2,
+ typev2 : 2,
version : 4;
#endif
} __packed; /* format defined in RFC7609 */
@@ -78,8 +91,6 @@ struct smc_clc_msg_local { /* header2 of clc messages */
u8 mac[6]; /* mac of ib_device port */
};
-#define SMC_CLC_MAX_V6_PREFIX 8
-
/* Struct would be 4 byte aligned, but it is used in an array that is sent
* to peers and must conform to RFC7609, hence we need to use packed here.
*/
@@ -88,6 +99,44 @@ struct smc_clc_ipv6_prefix {
u8 prefix_len;
} __packed; /* format defined in RFC7609 */
+#if defined(__BIG_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 release : 4,
+ rsvd : 3,
+ seid : 1;
+};
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+struct smc_clc_v2_flag {
+ u8 seid : 1,
+ rsvd : 3,
+ release : 4;
+};
+#endif
+
+struct smc_clnt_opts_area_hdr {
+ u8 eid_cnt; /* number of user defined EIDs */
+ u8 ism_gid_cnt; /* number of ISMv2 GIDs */
+ u8 reserved1;
+ struct smc_clc_v2_flag flag;
+ u8 reserved2[2];
+ __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
+};
+
+struct smc_clc_smcd_gid_chid {
+ __be64 gid; /* ISM GID */
+ __be16 chid; /* ISMv2 CHID */
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+struct smc_clc_v2_extension {
+ struct smc_clnt_opts_area_hdr hdr;
+ u8 roce[16]; /* RoCEv2 GID */
+ u8 reserved[16];
+ u8 user_eids[][SMC_MAX_EID_LEN];
+};
+
struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
__be32 outgoing_subnet; /* subnet mask */
u8 prefix_len; /* number of significant bits in mask */
@@ -96,8 +145,15 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
} __aligned(4);
struct smc_clc_msg_smcd { /* SMC-D GID information */
- u64 gid; /* ISM GID of requestor */
- u8 res[32];
+ struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
+ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
+ u8 reserved[28];
+};
+
+struct smc_clc_smcd_v2_extension {
+ u8 system_eid[SMC_MAX_EID_LEN];
+ u8 reserved[16];
+ struct smc_clc_smcd_gid_chid gidchid[];
};
struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
@@ -106,64 +162,141 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
__be16 iparea_offset; /* offset to IP address information area */
} __aligned(4);
-#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28
-#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \
- sizeof(struct smc_clc_ipv6_prefix))
-#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \
- SMC_CLC_PROPOSAL_MAX_OFFSET + \
- sizeof(struct smc_clc_msg_proposal_prefix) + \
- SMC_CLC_PROPOSAL_MAX_PREFIX + \
- sizeof(struct smc_clc_msg_trail))
+#define SMC_CLC_MAX_V6_PREFIX 8
+#define SMC_CLC_MAX_UEID 8
-struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
- struct smc_clc_msg_hdr hdr;
- union {
- struct { /* SMC-R */
- struct smc_clc_msg_local lcl;
- u8 qpn[3]; /* QP number */
- __be32 rmb_rkey; /* RMB rkey */
- u8 rmbe_idx; /* Index of RMBE in RMB */
- __be32 rmbe_alert_token;/* unique connection id */
+struct smc_clc_msg_proposal_area {
+ struct smc_clc_msg_proposal pclc_base;
+ struct smc_clc_msg_smcd pclc_smcd;
+ struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
+ struct smc_clc_v2_extension pclc_v2_ext;
+ u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN];
+ struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
+ struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS];
+ struct smc_clc_msg_trail pclc_trl;
+};
+
+struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token; /* unique connection id */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+} __packed;
+
+struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 rmbe_size : 4, /* buf size (compressed) */
- qp_mtu : 4; /* QP mtu */
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 qp_mtu : 4,
- rmbe_size : 4;
+ u8 reserved3 : 4,
+ dmbe_size : 4;
#endif
- u8 reserved;
- __be64 rmb_dma_addr; /* RMB virtual address */
- u8 reserved2;
- u8 psn[3]; /* packet sequence number */
- struct smc_clc_msg_trail smcr_trl;
- /* eye catcher "SMCR" EBCDIC */
- } __packed;
- struct { /* SMC-D */
- u64 gid; /* Sender GID */
- u64 token; /* DMB token */
- u8 dmbe_idx; /* DMBE index */
+ u16 reserved4;
+ __be32 linkid; /* Link identifier */
+} __packed;
+
+#define SMC_CLC_OS_ZOS 1
+#define SMC_CLC_OS_LINUX 2
+#define SMC_CLC_OS_AIX 3
+
+struct smc_clc_first_contact_ext {
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 dmbe_size : 4, /* buf size (compressed) */
- reserved3 : 4;
+ u8 v2_direct : 1,
+ reserved : 7;
+ u8 os_type : 4,
+ release : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 reserved3 : 4,
- dmbe_size : 4;
+ u8 reserved : 7,
+ v2_direct : 1;
+ u8 release : 4,
+ os_type : 4;
#endif
- u16 reserved4;
- u32 linkid; /* Link identifier */
+ u8 reserved2[2];
+ u8 hostname[SMC_MAX_HOSTNAME_LEN];
+};
+
+struct smc_clc_fce_gid_ext {
+ u8 reserved[16];
+ u8 gid_cnt;
+ u8 reserved2[3];
+ u8 gid[][SMC_GID_SIZE];
+};
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
u32 reserved5[3];
- struct smc_clc_msg_trail smcd_trl;
- /* eye catcher "SMCD" EBCDIC */
- } __packed;
+ };
};
} __packed; /* format defined in RFC7609 */
+struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ union {
+ struct { /* SMC-R */
+ struct smcr_clc_msg_accept_confirm r0;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved6[8];
+ } r1;
+ struct { /* SMC-D */
+ struct smcd_clc_msg_accept_confirm_common d0;
+ __be16 chid;
+ u8 eid[SMC_MAX_EID_LEN];
+ u8 reserved5[8];
+ } d1;
+ };
+};
+
struct smc_clc_msg_decline { /* clc decline message */
struct smc_clc_msg_hdr hdr;
u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
__be32 peer_diagnosis; /* diagnosis information */
- u8 reserved2[4];
- struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
+} __aligned(4);
+
+#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */
+
+struct smc_clc_msg_decline_v2 { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 os_type : 4,
+ reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 4,
+ os_type : 4;
+#endif
+ u8 reserved2[3];
+ __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
} __aligned(4);
/* determine start of the prefix area within the proposal message */
@@ -174,16 +307,69 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
}
+static inline bool smcr_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B;
+}
+
+static inline bool smcd_indicated(int smc_type)
+{
+ return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
+}
+
+static inline u8 smc_indicated_type(int is_smcd, int is_smcr)
+{
+ if (is_smcd && is_smcr)
+ return SMC_TYPE_B;
+ if (is_smcd)
+ return SMC_TYPE_D;
+ if (is_smcr)
+ return SMC_TYPE_R;
+ return SMC_TYPE_N;
+}
+
/* get SMC-D info from proposal message */
static inline struct smc_clc_msg_smcd *
smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
{
- if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+ if (smcd_indicated(prop->hdr.typev1) &&
+ ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
return NULL;
return (struct smc_clc_msg_smcd *)(prop + 1);
}
+static inline struct smc_clc_v2_extension *
+smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
+{
+ struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);
+
+ if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_v2_extension *)
+ ((u8 *)prop_smcd +
+ offsetof(struct smc_clc_msg_smcd, v2_ext_offset) +
+ sizeof(prop_smcd->v2_ext_offset) +
+ ntohs(prop_smcd->v2_ext_offset));
+}
+
+static inline struct smc_clc_smcd_v2_extension *
+smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
+{
+ if (!prop_v2ext)
+ return NULL;
+ if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset))
+ return NULL;
+
+ return (struct smc_clc_smcd_v2_extension *)
+ ((u8 *)prop_v2ext +
+ offsetof(struct smc_clc_v2_extension, hdr) +
+ offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) +
+ sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) +
+ ntohs(prop_v2ext->hdr.smcd_v2_ext_offset));
+}
+
struct smcd_dev;
struct smc_init_info;
@@ -191,10 +377,25 @@ int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type, unsigned long timeout);
-int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_init_info *ini);
-int smc_clc_send_confirm(struct smc_sock *smc);
-int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
+int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
+ u8 version, u8 *eid, struct smc_init_info *ini);
+int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
+ u8 version, u8 *negotiated_eid);
+void smc_clc_init(void) __init;
+void smc_clc_exit(void);
+void smc_clc_get_hostname(u8 **host);
+bool smc_clc_match_eid(u8 *negotiated_eid,
+ struct smc_clc_v2_extension *smc_v2_ext,
+ u8 *peer_eid, u8 *local_eid);
+int smc_clc_ueid_count(void);
+int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info);
+int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info);
#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 290270c821ca..31db7438857c 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -57,6 +57,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
if (!smc_tx_prepared_sends(&smc->conn))
return;
+ /* Send out corked data remaining in sndbuf */
+ smc_tx_pending(&smc->conn);
+
smc->wait_close_tx_prepared = 1;
add_wait_queue(sk_sleep(sk), &wait);
while (!signal_pending(current) && timeout) {
@@ -116,7 +119,6 @@ static void smc_close_cancel_work(struct smc_sock *smc)
cancel_work_sync(&smc->conn.close_work);
cancel_delayed_work_sync(&smc->conn.tx_work);
lock_sock(sk);
- sk->sk_state = SMC_CLOSED;
}
/* terminate smc socket abnormally - active abort
@@ -134,22 +136,22 @@ void smc_close_active_abort(struct smc_sock *smc)
}
switch (sk->sk_state) {
case SMC_ACTIVE:
- sk->sk_state = SMC_PEERABORTWAIT;
- smc_close_cancel_work(smc);
- sk->sk_state = SMC_CLOSED;
- sock_put(sk); /* passive closing */
- break;
case SMC_APPCLOSEWAIT1:
case SMC_APPCLOSEWAIT2:
+ sk->sk_state = SMC_PEERABORTWAIT;
smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
sk->sk_state = SMC_CLOSED;
- sock_put(sk); /* postponed passive closing */
+ sock_put(sk); /* (postponed) passive closing */
break;
case SMC_PEERCLOSEWAIT1:
case SMC_PEERCLOSEWAIT2:
case SMC_PEERFINCLOSEWAIT:
sk->sk_state = SMC_PEERABORTWAIT;
smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
sk->sk_state = SMC_CLOSED;
smc_conn_free(&smc->conn);
release_clcsock = true;
@@ -159,6 +161,8 @@ void smc_close_active_abort(struct smc_sock *smc)
case SMC_APPFINCLOSEWAIT:
sk->sk_state = SMC_PEERABORTWAIT;
smc_close_cancel_work(smc);
+ if (sk->sk_state != SMC_PEERABORTWAIT)
+ break;
sk->sk_state = SMC_CLOSED;
smc_conn_free(&smc->conn);
release_clcsock = true;
@@ -194,6 +198,7 @@ int smc_close_active(struct smc_sock *smc)
int old_state;
long timeout;
int rc = 0;
+ int rc1 = 0;
timeout = current->flags & PF_EXITING ?
0 : sock_flag(sk, SOCK_LINGER) ?
@@ -209,9 +214,12 @@ again:
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk); /* wake up accept */
if (smc->clcsock && smc->clcsock->sk) {
+ write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
+ smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
+ &smc->clcsk_data_ready);
+ smc->clcsock->sk->sk_user_data = NULL;
+ write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
- /* wake up kernel_accept of smc_tcp_listen_worker */
- smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
}
smc_close_cleanup_listen(sk);
release_sock(sk);
@@ -227,6 +235,15 @@ again:
/* send close request */
rc = smc_close_final(conn);
sk->sk_state = SMC_PEERCLOSEWAIT1;
+
+ /* actively shutdown clcsock before peer close it,
+ * prevent peer from entering TIME_WAIT state.
+ */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc1 = kernel_sock_shutdown(smc->clcsock,
+ SHUT_RDWR);
+ rc = rc ? rc : rc1;
+ }
} else {
/* peer event has changed the state */
goto again;
@@ -353,9 +370,9 @@ static void smc_close_passive_work(struct work_struct *work)
if (rxflags->peer_conn_abort) {
/* peer has not received all data */
smc_close_passive_abort_received(smc);
- release_sock(&smc->sk);
+ release_sock(sk);
cancel_delayed_work_sync(&conn->tx_work);
- lock_sock(&smc->sk);
+ lock_sock(sk);
goto wakeup;
}
@@ -372,7 +389,7 @@ static void smc_close_passive_work(struct work_struct *work)
case SMC_PEERCLOSEWAIT1:
if (rxflags->peer_done_writing)
sk->sk_state = SMC_PEERCLOSEWAIT2;
- /* fall through */
+ fallthrough;
/* to check for closing */
case SMC_PEERCLOSEWAIT2:
if (!smc_cdc_rxed_any_close(conn))
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 5b085efa3bce..c305d8dd23f8 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -15,6 +15,9 @@
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/reboot.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/smc.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
@@ -29,13 +32,15 @@
#include "smc_cdc.h"
#include "smc_close.h"
#include "smc_ism.h"
+#include "smc_netlink.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
-#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
-static struct smc_lgr_list smc_lgr_list = { /* established link groups */
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
.list = LIST_HEAD_INIT(smc_lgr_list.list),
.num = 0,
@@ -46,6 +51,9 @@ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
struct smc_buf_desc *buf_desc);
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
+
+static void smc_link_down_work(struct work_struct *work);
/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
@@ -60,13 +68,23 @@ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
return &smc_lgr_list.list;
}
+static void smc_ibdev_cnt_inc(struct smc_link *lnk)
+{
+ atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]);
+}
+
+static void smc_ibdev_cnt_dec(struct smc_link *lnk)
+{
+ atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]);
+}
+
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
/* client link group creation always follows the server link group
* creation. For client use a somewhat higher removal delay time,
* otherwise there is a risk of out-of-sync link groups.
*/
- if (!lgr->freeing && !lgr->freefast) {
+ if (!lgr->freeing) {
mod_delayed_work(system_wq, &lgr->free_work,
(!lgr->is_smcd && lgr->role == SMC_CLNT) ?
SMC_LGR_FREE_DELAY_CLNT :
@@ -74,15 +92,6 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
}
}
-void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
-{
- if (!lgr->freeing && !lgr->freefast) {
- lgr->freefast = 1;
- mod_delayed_work(system_wq, &lgr->free_work,
- SMC_LGR_FREE_DELAY_FAST);
- }
-}
-
/* Register connection's alert token in our lookup structure.
* To use rbtrees we have to implement our own insert core.
* Requires @conns_lock
@@ -110,16 +119,63 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn)
rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}
+/* assign an SMC-R link to the connection */
+static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first)
+{
+ enum smc_link_state expected = first ? SMC_LNK_ACTIVATING :
+ SMC_LNK_ACTIVE;
+ int i, j;
+
+ /* do link balancing */
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &conn->lgr->lnk[i];
+
+ if (lnk->state != expected || lnk->link_is_asym)
+ continue;
+ if (conn->lgr->role == SMC_CLNT) {
+ conn->lnk = lnk; /* temporary, SMC server assigns link*/
+ break;
+ }
+ if (conn->lgr->conns_num % 2) {
+ for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ struct smc_link *lnk2;
+
+ lnk2 = &conn->lgr->lnk[j];
+ if (lnk2->state == expected &&
+ !lnk2->link_is_asym) {
+ conn->lnk = lnk2;
+ break;
+ }
+ }
+ }
+ if (!conn->lnk)
+ conn->lnk = lnk;
+ break;
+ }
+ if (!conn->lnk)
+ return SMC_CLC_DECL_NOACTLINK;
+ atomic_inc(&conn->lnk->conn_cnt);
+ return 0;
+}
+
/* Register connection in link group by assigning an alert token
* registered in a search tree.
* Requires @conns_lock
* Note that '0' is a reserved value and not assigned.
*/
-static void smc_lgr_register_conn(struct smc_connection *conn)
+static int smc_lgr_register_conn(struct smc_connection *conn, bool first)
{
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
static atomic_t nexttoken = ATOMIC_INIT(0);
+ int rc;
+ if (!conn->lgr->is_smcd) {
+ rc = smcr_lgr_conn_assign_link(conn, first);
+ if (rc) {
+ conn->lgr = NULL;
+ return rc;
+ }
+ }
/* find a new alert_token_local value not yet used by some connection
* in this link group
*/
@@ -131,6 +187,7 @@ static void smc_lgr_register_conn(struct smc_connection *conn)
}
smc_lgr_add_alert_token(conn);
conn->lgr->conns_num++;
+ return 0;
}
/* Unregister connection and reset the alert token of the given connection<
@@ -141,6 +198,8 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn)
struct smc_link_group *lgr = conn->lgr;
rb_erase(&conn->alert_node, &lgr->conns_all);
+ if (conn->lnk)
+ atomic_dec(&conn->lnk->conn_cnt);
lgr->conns_num--;
conn->alert_token_local = 0;
sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
@@ -152,40 +211,451 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
- if (!lgr)
+ if (!smc_conn_lgr_valid(conn))
return;
write_lock_bh(&lgr->conns_lock);
if (conn->alert_token_local) {
__smc_lgr_unregister_conn(conn);
}
write_unlock_bh(&lgr->conns_lock);
- conn->lgr = NULL;
}
-void smc_lgr_cleanup_early(struct smc_connection *conn)
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct smc_link_group *lgr = conn->lgr;
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ char hostname[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_seid[SMC_MAX_EID_LEN + 1];
+ struct nlattr *attrs;
+ u8 *seid = NULL;
+ u8 *host = NULL;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_SYS_INFO);
+ if (!nlh)
+ goto errmsg;
+ if (cb_ctx->pos[0])
+ goto errout;
+ attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable()))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true))
+ goto errattr;
+ smc_clc_get_hostname(&host);
+ if (host) {
+ memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN);
+ hostname[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname))
+ goto errattr;
+ }
+ if (smc_ism_is_v2_capable()) {
+ smc_ism_get_system_eid(&seid);
+ memcpy(smc_seid, seid, SMC_MAX_EID_LEN);
+ smc_seid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid))
+ goto errattr;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ cb_ctx->pos[0] = 1;
+ return skb->len;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return skb->len;
+}
+
+/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */
+static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nlattr *v2_attrs)
+{
+ char smc_host[SMC_MAX_HOSTNAME_LEN + 1];
+ char smc_eid[SMC_MAX_EID_LEN + 1];
+
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release))
+ goto errv2attr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os))
+ goto errv2attr;
+ memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN);
+ smc_host[SMC_MAX_HOSTNAME_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host))
+ goto errv2attr;
+ memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN);
+ smc_eid[SMC_MAX_EID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid))
+ goto errv2attr;
+
+ nla_nest_end(skb, v2_attrs);
+ return 0;
+
+errv2attr:
+ nla_nest_cancel(skb, v2_attrs);
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *v2_attrs;
+
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2);
+ if (!v2_attrs)
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway))
+ goto errv2attr;
+
+ nla_nest_end(skb, v2_attrs);
+ return 0;
+
+errv2attr:
+ nla_nest_cancel(skb, v2_attrs);
+errattr:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_target[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *attrs, *v2_attrs;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, lgr->buf_type))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE,
+ lgr->net->net_cookie, SMC_NLA_LGR_R_PAD))
+ goto errattr;
+ memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN);
+ smc_target[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
+ goto errattr;
+ if (lgr->smc_version > SMC_V1) {
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON);
+ if (!v2_attrs)
+ goto errattr;
+ if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+ goto errattr;
+ if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb))
+ goto errattr;
+ }
+
+ nla_nest_end(skb, attrs);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_lgr_link(struct smc_link_group *lgr,
+ struct smc_link *link,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_ibname[IB_DEVICE_NAME_MAX];
+ u8 smc_gid_target[41];
+ struct nlattr *attrs;
+ u32 link_uid = 0;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LINK_SMCR);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT,
+ atomic_read(&link->conn_cnt)))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx))
+ goto errattr;
+ snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname);
+ if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname))
+ goto errattr;
+ memcpy(&link_uid, link->link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid))
+ goto errattr;
+ memcpy(&link_uid, link->peer_link_uid, sizeof(link_uid));
+ if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid))
+ goto errattr;
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target))
+ goto errattr;
+ memset(smc_gid_target, 0, sizeof(smc_gid_target));
+ smc_gid_be16_convert(smc_gid_target, link->peer_gid);
+ if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_handle_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ void *nlh;
+ int i;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCR);
+ if (!nlh)
+ goto errmsg;
+ if (smc_nl_fill_lgr(lgr, skb, cb))
+ goto errout;
+
+ genlmsg_end(skb, nlh);
+ if (!list_links)
+ goto out;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_usable(&lgr->lnk[i]))
+ continue;
+ if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb))
+ goto errout;
+ }
+out:
+ return 0;
+
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ bool list_links)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_link_group *lgr;
+ int snum = cb_ctx->pos[0];
+ int num = 0;
+
+ spin_lock_bh(&smc_lgr->lock);
+ list_for_each_entry(lgr, &smc_lgr->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_lgr(lgr, skb, cb, list_links))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ spin_unlock_bh(&smc_lgr->lock);
+ cb_ctx->pos[0] = num;
+}
+
+static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *attrs;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_LGR_SMCD);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD);
+ if (!attrs)
+ goto errout;
+
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id)))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid,
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid,
+ SMC_NLA_LGR_D_PAD))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd)))
+ goto errattr;
+ memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet))
+ goto errattr;
+ if (lgr->smc_version > SMC_V1) {
+ struct nlattr *v2_attrs;
+
+ v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON);
+ if (!v2_attrs)
+ goto errattr;
+ if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+ goto errattr;
+ }
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_link_group *lgr;
+ int snum = cb_ctx->pos[1];
+ int rc = 0, num = 0;
+
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry(lgr, &dev->lgr_list, list) {
+ if (!lgr->is_smcd)
+ continue;
+ if (num < snum)
+ goto next;
+ rc = smc_nl_fill_smcd_lgr(lgr, skb, cb);
+ if (rc)
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ spin_unlock_bh(&dev->lgr_lock);
+ cb_ctx->pos[1] = num;
+ return rc;
+}
+
+static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smcd_dev *smcd_dev;
+ int snum = cb_ctx->pos[0];
+ int rc = 0, num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcd_dev, &dev_list->list, list) {
+ if (list_empty(&smcd_dev->lgr_list))
+ continue;
+ if (num < snum)
+ goto next;
+ rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb);
+ if (rc)
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+ return rc;
+}
+
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ bool list_links = false;
+
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links);
+ return skb->len;
+}
+
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ bool list_links = true;
+
+ smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links);
+ return skb->len;
+}
+
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
+void smc_lgr_cleanup_early(struct smc_link_group *lgr)
+{
+ spinlock_t *lgr_lock;
if (!lgr)
return;
- smc_conn_free(conn);
- smc_lgr_forget(lgr);
- smc_lgr_schedule_free_work_fast(lgr);
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ /* do not use this link group for new connections */
+ if (!list_empty(&lgr->list))
+ list_del_init(&lgr->list);
+ spin_unlock_bh(lgr_lock);
+ __smc_lgr_terminate(lgr, true);
}
-/* Send delete link, either as client to request the initiation
- * of the DELETE LINK sequence from server; or as server to
- * initiate the delete processing. See smc_llc_rx_delete_link().
- */
-static int smc_link_send_delete(struct smc_link *lnk, bool orderly)
+static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
{
- if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) {
- smc_llc_link_deleting(lnk);
- return 0;
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (smc_link_sendable(lnk))
+ lnk->state = SMC_LNK_INACTIVE;
}
- return -ENOTCONN;
+ wake_up_all(&lgr->llc_msg_waiter);
+ wake_up_all(&lgr->llc_flow_waiter);
}
static void smc_lgr_free(struct smc_link_group *lgr);
@@ -196,7 +666,6 @@ static void smc_lgr_free_work(struct work_struct *work)
struct smc_link_group,
free_work);
spinlock_t *lgr_lock;
- struct smc_link *lnk;
bool conns;
smc_lgr_list_head(lgr, &lgr_lock);
@@ -213,26 +682,17 @@ static void smc_lgr_free_work(struct work_struct *work)
return;
}
list_del_init(&lgr->list); /* remove from smc_lgr_list */
-
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- if (!lgr->is_smcd && !lgr->terminating) {
- /* try to send del link msg, on error free lgr immediately */
- if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_link_send_delete(lnk, true)) {
- /* reschedule in case we never receive a response */
- smc_lgr_schedule_free_work(lgr);
- spin_unlock_bh(lgr_lock);
- return;
- }
- }
lgr->freeing = 1; /* this instance does the freeing, no new schedule */
spin_unlock_bh(lgr_lock);
cancel_delayed_work(&lgr->free_work);
- if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
- smc_llc_link_inactive(lnk);
+ if (!lgr->is_smcd && !lgr->terminating)
+ smc_llc_send_link_delete_all(lgr, true,
+ SMC_LLC_DEL_PROG_INIT_TERM);
if (lgr->is_smcd && !lgr->terminating)
smc_ism_signal_shutdown(lgr);
+ if (!lgr->is_smcd)
+ smcr_lgr_link_deactivate_all(lgr);
smc_lgr_free(lgr);
}
@@ -241,7 +701,118 @@ static void smc_lgr_terminate_work(struct work_struct *work)
struct smc_link_group *lgr = container_of(work, struct smc_link_group,
terminate_work);
- smc_lgr_terminate(lgr, true);
+ __smc_lgr_terminate(lgr, true);
+}
+
+/* return next unique link id for the lgr */
+static u8 smcr_next_link_id(struct smc_link_group *lgr)
+{
+ u8 link_id;
+ int i;
+
+ while (1) {
+again:
+ link_id = ++lgr->next_link_id;
+ if (!link_id) /* skip zero as link_id */
+ link_id = ++lgr->next_link_id;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (smc_link_usable(&lgr->lnk[i]) &&
+ lgr->lnk[i].link_id == link_id)
+ goto again;
+ }
+ break;
+ }
+ return link_id;
+}
+
+static void smcr_copy_dev_info_to_link(struct smc_link *link)
+{
+ struct smc_ib_device *smcibdev = link->smcibdev;
+
+ snprintf(link->ibname, sizeof(link->ibname), "%s",
+ smcibdev->ibdev->name);
+ link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1];
+}
+
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini)
+{
+ struct smc_ib_device *smcibdev;
+ u8 rndvec[3];
+ int rc;
+
+ if (lgr->smc_version == SMC_V2) {
+ lnk->smcibdev = ini->smcrv2.ib_dev_v2;
+ lnk->ibport = ini->smcrv2.ib_port_v2;
+ } else {
+ lnk->smcibdev = ini->ib_dev;
+ lnk->ibport = ini->ib_port;
+ }
+ get_device(&lnk->smcibdev->ibdev->dev);
+ atomic_inc(&lnk->smcibdev->lnk_cnt);
+ refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */
+ lnk->clearing = 0;
+ lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
+ lnk->link_id = smcr_next_link_id(lgr);
+ lnk->lgr = lgr;
+ smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
+ lnk->link_idx = link_idx;
+ lnk->wr_rx_id_compl = 0;
+ smc_ibdev_cnt_inc(lnk);
+ smcr_copy_dev_info_to_link(lnk);
+ atomic_set(&lnk->conn_cnt, 0);
+ smc_llc_link_set_uid(lnk);
+ INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
+ if (!lnk->smcibdev->initialized) {
+ rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
+ if (rc)
+ goto out;
+ }
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+ (rndvec[2] << 16);
+ rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+ ini->vlan_id, lnk->gid, &lnk->sgid_index,
+ lgr->smc_version == SMC_V2 ?
+ &ini->smcrv2 : NULL);
+ if (rc)
+ goto out;
+ rc = smc_llc_link_init(lnk);
+ if (rc)
+ goto out;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ lnk->state = SMC_LNK_ACTIVATING;
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+clear_llc_lnk:
+ smc_llc_link_clear(lnk, false);
+out:
+ smc_ibdev_cnt_dec(lnk);
+ put_device(&lnk->smcibdev->ibdev->dev);
+ smcibdev = lnk->smcibdev;
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ if (!atomic_dec_return(&smcibdev->lnk_cnt))
+ wake_up(&smcibdev->lnks_deleted);
+ smc_lgr_put(lgr); /* lgr_hold above */
+ return rc;
}
/* create a new SMC link group */
@@ -251,12 +822,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
struct list_head *lgr_list;
struct smc_link *lnk;
spinlock_t *lgr_lock;
- u8 rndvec[3];
+ u8 link_idx;
int rc = 0;
int i;
if (ini->is_smcd && ini->vlan_id) {
- if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) {
+ if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected],
+ ini->vlan_id)) {
rc = SMC_CLC_DECL_ISMVLANERR;
goto out;
}
@@ -267,19 +839,26 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = SMC_CLC_DECL_MEM;
goto ism_put_vlan;
}
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ SMC_LGR_ID_SIZE, &lgr->id);
+ if (!lgr->tx_wq) {
+ rc = -ENOMEM;
+ goto free_lgr;
+ }
lgr->is_smcd = ini->is_smcd;
lgr->sync_err = 0;
lgr->terminating = 0;
- lgr->freefast = 0;
lgr->freeing = 0;
lgr->vlan_id = ini->vlan_id;
- rwlock_init(&lgr->sndbufs_lock);
- rwlock_init(&lgr->rmbs_lock);
+ refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */
+ mutex_init(&lgr->sndbufs_lock);
+ mutex_init(&lgr->rmbs_lock);
rwlock_init(&lgr->conns_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
}
+ lgr->next_link_id = 0;
smc_lgr_list.num += SMC_LGR_NUM_INCR;
memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
@@ -287,77 +866,67 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr->conns_all = RB_ROOT;
if (ini->is_smcd) {
/* SMC-D specific settings */
- get_device(&ini->ism_dev->dev);
- lgr->peer_gid = ini->ism_gid;
- lgr->smcd = ini->ism_dev;
- lgr_list = &ini->ism_dev->lgr_list;
+ get_device(&ini->ism_dev[ini->ism_selected]->dev);
+ lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected];
+ lgr->smcd = ini->ism_dev[ini->ism_selected];
+ lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list;
lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->smc_version = ini->smcd_version;
lgr->peer_shutdown = 0;
- atomic_inc(&ini->ism_dev->lgr_cnt);
+ atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
} else {
/* SMC-R specific settings */
- get_device(&ini->ib_dev->ibdev->dev);
+ struct smc_ib_device *ibdev;
+ int ibport;
+
lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
+ lgr->smc_version = ini->smcr_version;
+ memcpy(lgr->peer_systemid, ini->peer_systemid,
SMC_SYSTEMID_LEN);
+ if (lgr->smc_version == SMC_V2) {
+ ibdev = ini->smcrv2.ib_dev_v2;
+ ibport = ini->smcrv2.ib_port_v2;
+ lgr->saddr = ini->smcrv2.saddr;
+ lgr->uses_gateway = ini->smcrv2.uses_gateway;
+ memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac,
+ ETH_ALEN);
+ } else {
+ ibdev = ini->ib_dev;
+ ibport = ini->ib_port;
+ }
+ memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1],
+ SMC_MAX_PNETID_LEN);
+ rc = smc_wr_alloc_lgr_mem(lgr);
+ if (rc)
+ goto free_wq;
+ smc_llc_lgr_init(lgr, smc);
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- /* initialize link */
- lnk->state = SMC_LNK_ACTIVATING;
- lnk->link_id = SMC_SINGLE_LINK;
- lnk->smcibdev = ini->ib_dev;
- lnk->ibport = ini->ib_port;
+ link_idx = SMC_SINGLE_LINK;
+ lnk = &lgr->lnk[link_idx];
+ rc = smcr_link_init(lgr, lnk, link_idx, ini);
+ if (rc) {
+ smc_wr_free_lgr_mem(lgr);
+ goto free_wq;
+ }
+ lgr->net = smc_ib_net(lnk->smcibdev);
lgr_list = &smc_lgr_list.list;
lgr_lock = &smc_lgr_list.lock;
- lnk->path_mtu =
- ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
- if (!ini->ib_dev->initialized)
- smc_ib_setup_per_ibdev(ini->ib_dev);
- get_random_bytes(rndvec, sizeof(rndvec));
- lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
- (rndvec[2] << 16);
- rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
- ini->vlan_id, lnk->gid,
- &lnk->sgid_index);
- if (rc)
- goto free_lgr;
- rc = smc_llc_link_init(lnk);
- if (rc)
- goto free_lgr;
- rc = smc_wr_alloc_link_mem(lnk);
- if (rc)
- goto clear_llc_lnk;
- rc = smc_ib_create_protection_domain(lnk);
- if (rc)
- goto free_link_mem;
- rc = smc_ib_create_queue_pair(lnk);
- if (rc)
- goto dealloc_pd;
- rc = smc_wr_create_link(lnk);
- if (rc)
- goto destroy_qp;
+ lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type;
atomic_inc(&lgr_cnt);
- atomic_inc(&ini->ib_dev->lnk_cnt);
}
smc->conn.lgr = lgr;
spin_lock_bh(lgr_lock);
- list_add(&lgr->list, lgr_list);
+ list_add_tail(&lgr->list, lgr_list);
spin_unlock_bh(lgr_lock);
return 0;
-destroy_qp:
- smc_ib_destroy_queue_pair(lnk);
-dealloc_pd:
- smc_ib_dealloc_protection_domain(lnk);
-free_link_mem:
- smc_wr_free_link_mem(lnk);
-clear_llc_lnk:
- smc_llc_link_clear(lnk);
+free_wq:
+ destroy_workqueue(lgr->tx_wq);
free_lgr:
kfree(lgr);
ism_put_vlan:
if (ini->is_smcd && ini->vlan_id)
- smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
+ smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id);
out:
if (rc < 0) {
if (rc == -ENOMEM)
@@ -368,27 +937,214 @@ out:
return rc;
}
+static int smc_write_space(struct smc_connection *conn)
+{
+ int buffer_len = conn->peer_rmbe_size;
+ union smc_host_cursor prod;
+ union smc_host_cursor cons;
+ int space;
+
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+ /* determine rx_buf space */
+ space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod);
+ return space;
+}
+
+static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend,
+ struct smc_wr_buf *wr_buf)
+{
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons, fin;
+ int rc = 0;
+ int diff;
+
+ smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn);
+ smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn);
+ /* set prod cursor to old state, enforce tx_rdma_writes() */
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
+
+ if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) {
+ /* cons cursor advanced more than fin, and prod was set
+ * fin above, so now prod is smaller than cons. Fix that.
+ */
+ diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons);
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_sent, diff);
+ smc_curs_add(conn->sndbuf_desc->len,
+ &conn->tx_curs_fin, diff);
+
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ smp_mb__after_atomic();
+
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl.prod, diff);
+ smc_curs_add(conn->peer_rmbe_size,
+ &conn->local_tx_ctrl_fin, diff);
+ }
+ /* recalculate, value is used by tx_rdma_writes() */
+ atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn));
+
+ if (smc->sk.sk_state != SMC_INIT &&
+ smc->sk.sk_state != SMC_CLOSED) {
+ rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf);
+ if (!rc) {
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0);
+ smc->sk.sk_data_ready(&smc->sk);
+ }
+ } else {
+ smc_wr_tx_put_slot(conn->lnk,
+ (struct smc_wr_tx_pend_priv *)pend);
+ }
+ return rc;
+}
+
+void smc_switch_link_and_count(struct smc_connection *conn,
+ struct smc_link *to_lnk)
+{
+ atomic_dec(&conn->lnk->conn_cnt);
+ /* link_hold in smc_conn_create() */
+ smcr_link_put(conn->lnk);
+ conn->lnk = to_lnk;
+ atomic_inc(&conn->lnk->conn_cnt);
+ /* link_put in smc_conn_free() */
+ smcr_link_hold(conn->lnk);
+}
+
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err)
+{
+ struct smc_link *to_lnk = NULL;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_connection *conn;
+ struct smc_wr_buf *wr_buf;
+ struct smc_sock *smc;
+ struct rb_node *node;
+ int i, rc = 0;
+
+ /* link is inactive, wake up tx waiters */
+ smc_wr_wakeup_tx_wait(from_lnk);
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx)
+ continue;
+ if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev &&
+ from_lnk->ibport == lgr->lnk[i].ibport) {
+ continue;
+ }
+ to_lnk = &lgr->lnk[i];
+ break;
+ }
+ if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) {
+ smc_lgr_terminate_sched(lgr);
+ return NULL;
+ }
+again:
+ read_lock_bh(&lgr->conns_lock);
+ for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) {
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ if (conn->lnk != from_lnk)
+ continue;
+ smc = container_of(conn, struct smc_sock, conn);
+ /* conn->lnk not yet set in SMC_INIT state */
+ if (smc->sk.sk_state == SMC_INIT)
+ continue;
+ if (smc->sk.sk_state == SMC_CLOSED ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_PEERCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_APPFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT1 ||
+ smc->sk.sk_state == SMC_APPCLOSEWAIT2 ||
+ smc->sk.sk_state == SMC_PEERFINCLOSEWAIT ||
+ smc->sk.sk_state == SMC_PEERABORTWAIT ||
+ smc->sk.sk_state == SMC_PROCESSABORT) {
+ spin_lock_bh(&conn->send_lock);
+ smc_switch_link_and_count(conn, to_lnk);
+ spin_unlock_bh(&conn->send_lock);
+ continue;
+ }
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ /* pre-fetch buffer outside of send_lock, might sleep */
+ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend);
+ if (rc)
+ goto err_out;
+ /* avoid race with smcr_tx_sndbuf_nonempty() */
+ spin_lock_bh(&conn->send_lock);
+ smc_switch_link_and_count(conn, to_lnk);
+ rc = smc_switch_cursor(smc, pend, wr_buf);
+ spin_unlock_bh(&conn->send_lock);
+ sock_put(&smc->sk);
+ if (rc)
+ goto err_out;
+ goto again;
+ }
+ read_unlock_bh(&lgr->conns_lock);
+ smc_wr_tx_link_put(to_lnk);
+ return to_lnk;
+
+err_out:
+ smcr_link_down_cond_sched(to_lnk);
+ smc_wr_tx_link_put(to_lnk);
+ return NULL;
+}
+
+static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link_group *lgr)
+{
+ struct mutex *lock; /* lock buffer list */
+ int rc;
+
+ if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) {
+ /* unregister rmb with peer */
+ rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
+ if (!rc) {
+ /* protect against smc_llc_cli_rkey_exchange() */
+ mutex_lock(&lgr->llc_conf_mutex);
+ smc_llc_do_delete_rkey(lgr, buf_desc);
+ buf_desc->is_conf_rkey = false;
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+ }
+ }
+
+ if (buf_desc->is_reg_err) {
+ /* buf registration failed, reuse not possible */
+ lock = is_rmb ? &lgr->rmbs_lock :
+ &lgr->sndbufs_lock;
+ mutex_lock(lock);
+ list_del(&buf_desc->list);
+ mutex_unlock(lock);
+
+ smc_buf_free(lgr, is_rmb, buf_desc);
+ } else {
+ buf_desc->used = 0;
+ memset(buf_desc->cpu_addr, 0, buf_desc->len);
+ }
+}
+
static void smc_buf_unuse(struct smc_connection *conn,
struct smc_link_group *lgr)
{
- if (conn->sndbuf_desc)
- conn->sndbuf_desc->used = 0;
+ if (conn->sndbuf_desc) {
+ if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) {
+ smcr_buf_unuse(conn->sndbuf_desc, false, lgr);
+ } else {
+ conn->sndbuf_desc->used = 0;
+ memset(conn->sndbuf_desc->cpu_addr, 0,
+ conn->sndbuf_desc->len);
+ }
+ }
if (conn->rmb_desc) {
- if (!conn->rmb_desc->regerr) {
- if (!lgr->is_smcd && !list_empty(&lgr->list)) {
- /* unregister rmb with peer */
- smc_llc_do_delete_rkey(
- &lgr->lnk[SMC_SINGLE_LINK],
- conn->rmb_desc);
- }
- conn->rmb_desc->used = 0;
+ if (!lgr->is_smcd) {
+ smcr_buf_unuse(conn->rmb_desc, true, lgr);
} else {
- /* buf registration failed, reuse not possible */
- write_lock_bh(&lgr->rmbs_lock);
- list_del(&conn->rmb_desc->list);
- write_unlock_bh(&lgr->rmbs_lock);
-
- smc_buf_free(lgr, true, conn->rmb_desc);
+ conn->rmb_desc->used = 0;
+ memset(conn->rmb_desc->cpu_addr, 0,
+ conn->rmb_desc->len +
+ sizeof(struct smcd_cdc_msg));
}
}
}
@@ -398,55 +1154,152 @@ void smc_conn_free(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
- if (!lgr)
+ if (!lgr || conn->freed)
+ /* Connection has never been registered in a
+ * link group, or has already been freed.
+ */
return;
+
+ conn->freed = 1;
+ if (!smc_conn_lgr_valid(conn))
+ /* Connection has already unregistered from
+ * link group.
+ */
+ goto lgr_put;
+
if (lgr->is_smcd) {
if (!list_empty(&lgr->list))
smc_ism_unset_conn(conn);
tasklet_kill(&conn->rx_tsklet);
} else {
- smc_cdc_tx_dismiss_slots(conn);
+ smc_cdc_wait_pend_tx_wr(conn);
+ if (current_work() != &conn->abort_work)
+ cancel_work_sync(&conn->abort_work);
}
if (!list_empty(&lgr->list)) {
- smc_lgr_unregister_conn(conn);
smc_buf_unuse(conn, lgr); /* allow buffer reuse */
+ smc_lgr_unregister_conn(conn);
}
if (!lgr->conns_num)
smc_lgr_schedule_free_work(lgr);
+lgr_put:
+ if (!lgr->is_smcd)
+ smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */
+ smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */
}
-static void smc_link_clear(struct smc_link *lnk)
+/* unregister a link from a buf_desc */
+static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link *lnk)
{
+ if (is_rmb || buf_desc->is_vm)
+ buf_desc->is_reg_mr[lnk->link_idx] = false;
+ if (!buf_desc->is_map_ib[lnk->link_idx])
+ return;
+
+ if ((is_rmb || buf_desc->is_vm) &&
+ buf_desc->mr[lnk->link_idx]) {
+ smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]);
+ buf_desc->mr[lnk->link_idx] = NULL;
+ }
+ if (is_rmb)
+ smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE);
+ else
+ smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE);
+
+ sg_free_table(&buf_desc->sgt[lnk->link_idx]);
+ buf_desc->is_map_ib[lnk->link_idx] = false;
+}
+
+/* unmap all buffers of lgr for a deleted link */
+static void smcr_buf_unmap_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ mutex_lock(&lgr->rmbs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
+ smcr_buf_unmap_link(buf_desc, true, lnk);
+ mutex_unlock(&lgr->rmbs_lock);
+ mutex_lock(&lgr->sndbufs_lock);
+ list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
+ list)
+ smcr_buf_unmap_link(buf_desc, false, lnk);
+ mutex_unlock(&lgr->sndbufs_lock);
+ }
+}
+
+static void smcr_rtoken_clear_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ lgr->rtokens[i][lnk->link_idx].rkey = 0;
+ lgr->rtokens[i][lnk->link_idx].dma_addr = 0;
+ }
+}
+
+static void __smcr_link_clear(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_ib_device *smcibdev;
+
+ smc_wr_free_link_mem(lnk);
+ smc_ibdev_cnt_dec(lnk);
+ put_device(&lnk->smcibdev->ibdev->dev);
+ smcibdev = lnk->smcibdev;
+ memset(lnk, 0, sizeof(struct smc_link));
+ lnk->state = SMC_LNK_UNUSED;
+ if (!atomic_dec_return(&smcibdev->lnk_cnt))
+ wake_up(&smcibdev->lnks_deleted);
+ smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */
+}
+
+/* must be called under lgr->llc_conf_mutex lock */
+void smcr_link_clear(struct smc_link *lnk, bool log)
+{
+ if (!lnk->lgr || lnk->clearing ||
+ lnk->state == SMC_LNK_UNUSED)
+ return;
+ lnk->clearing = 1;
lnk->peer_qpn = 0;
- smc_llc_link_clear(lnk);
- smc_ib_modify_qp_reset(lnk);
+ smc_llc_link_clear(lnk, log);
+ smcr_buf_unmap_lgr(lnk);
+ smcr_rtoken_clear_link(lnk);
+ smc_ib_modify_qp_error(lnk);
smc_wr_free_link(lnk);
smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk);
- smc_wr_free_link_mem(lnk);
- if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt))
- wake_up(&lnk->smcibdev->lnks_deleted);
+ smcr_link_put(lnk); /* theoretically last link_put */
+}
+
+void smcr_link_hold(struct smc_link *lnk)
+{
+ refcount_inc(&lnk->refcnt);
+}
+
+void smcr_link_put(struct smc_link *lnk)
+{
+ if (refcount_dec_and_test(&lnk->refcnt))
+ __smcr_link_clear(lnk);
}
static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
struct smc_buf_desc *buf_desc)
{
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ int i;
- if (is_rmb) {
- if (buf_desc->mr_rx[SMC_SINGLE_LINK])
- smc_ib_put_memory_region(
- buf_desc->mr_rx[SMC_SINGLE_LINK]);
- smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
- DMA_FROM_DEVICE);
- } else {
- smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc,
- DMA_TO_DEVICE);
- }
- sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
- if (buf_desc->pages)
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]);
+
+ if (!buf_desc->is_vm && buf_desc->pages)
__free_pages(buf_desc->pages, buf_desc->order);
+ else if (buf_desc->is_vm && buf_desc->cpu_addr)
+ vfree(buf_desc->cpu_addr);
kfree(buf_desc);
}
@@ -499,51 +1352,53 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
__smc_lgr_free_bufs(lgr, true);
}
-/* remove a link group */
-static void smc_lgr_free(struct smc_link_group *lgr)
+/* won't be freed until no one accesses to lgr anymore */
+static void __smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
if (lgr->is_smcd) {
- if (!lgr->terminating) {
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
- }
if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
wake_up(&lgr->smcd->lgrs_deleted);
} else {
- smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
- put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
+ smc_wr_free_lgr_mem(lgr);
if (!atomic_dec_return(&lgr_cnt))
wake_up(&lgrs_deleted);
}
kfree(lgr);
}
-void smc_lgr_forget(struct smc_link_group *lgr)
+/* remove a link group */
+static void smc_lgr_free(struct smc_link_group *lgr)
{
- struct list_head *lgr_list;
- spinlock_t *lgr_lock;
+ int i;
- lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
- spin_lock_bh(lgr_lock);
- /* do not use this link group for new connections */
- if (!list_empty(lgr_list))
- list_del_init(lgr_list);
- spin_unlock_bh(lgr_lock);
+ if (!lgr->is_smcd) {
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state != SMC_LNK_UNUSED)
+ smcr_link_clear(&lgr->lnk[i], false);
+ }
+ mutex_unlock(&lgr->llc_conf_mutex);
+ smc_llc_lgr_clear(lgr);
+ }
+
+ destroy_workqueue(lgr->tx_wq);
+ if (lgr->is_smcd) {
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
+ }
+ smc_lgr_put(lgr); /* theoretically last lgr_put */
}
-static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
+void smc_lgr_hold(struct smc_link_group *lgr)
{
- int i;
-
- for (i = 0; i < SMC_RMBE_SIZES; i++) {
- struct smc_buf_desc *buf_desc;
+ refcount_inc(&lgr->refcnt);
+}
- list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
- buf_desc->len += sizeof(struct smcd_cdc_msg);
- smc_ism_unregister_dmb(lgr->smcd, buf_desc);
- }
- }
+void smc_lgr_put(struct smc_link_group *lgr)
+{
+ if (refcount_dec_and_test(&lgr->refcnt))
+ __smc_lgr_free(lgr);
}
static void smc_sk_wake_ups(struct smc_sock *smc)
@@ -572,7 +1427,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
else
tasklet_unlock_wait(&conn->rx_tsklet);
} else {
- smc_cdc_tx_dismiss_slots(conn);
+ smc_cdc_wait_pend_tx_wr(conn);
}
smc_lgr_unregister_conn(conn);
smc_close_active_abort(smc);
@@ -582,21 +1437,20 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr)
{
if (lgr->is_smcd) {
smc_ism_signal_shutdown(lgr);
- smcd_unregister_all_dmbs(lgr);
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(&lgr->smcd->dev);
} else {
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ u32 rsn = lgr->llc_termination_rsn;
- wake_up(&lnk->wr_reg_wait);
- if (lnk->state != SMC_LNK_INACTIVE) {
- smc_link_send_delete(lnk, false);
- smc_llc_link_inactive(lnk);
- }
+ if (!rsn)
+ rsn = SMC_LLC_DEL_PROG_INIT_TERM;
+ smc_llc_send_link_delete_all(lgr, false, rsn);
+ smcr_lgr_link_deactivate_all(lgr);
}
}
-/* terminate link group */
+/* terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ */
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
struct smc_connection *conn;
@@ -605,11 +1459,9 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
if (lgr->terminating)
return; /* lgr already terminating */
- if (!soft)
- cancel_delayed_work_sync(&lgr->free_work);
+ /* cancel free_work sync, will terminate when lgr->freeing is set */
+ cancel_delayed_work_sync(&lgr->free_work);
lgr->terminating = 1;
- if (!lgr->is_smcd)
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
/* kill remaining link group connections */
read_lock_bh(&lgr->conns_lock);
@@ -628,54 +1480,24 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
}
read_unlock_bh(&lgr->conns_lock);
smc_lgr_cleanup(lgr);
- if (soft)
- smc_lgr_schedule_free_work_fast(lgr);
- else
- smc_lgr_free(lgr);
+ smc_lgr_free(lgr);
}
-/* unlink and terminate link group
- * @soft: true if link group shutdown can take its time
- * false if immediate link group shutdown is required
- */
-void smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
+/* unlink link group and schedule termination */
+void smc_lgr_terminate_sched(struct smc_link_group *lgr)
{
spinlock_t *lgr_lock;
smc_lgr_list_head(lgr, &lgr_lock);
spin_lock_bh(lgr_lock);
- if (lgr->terminating) {
+ if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
spin_unlock_bh(lgr_lock);
return; /* lgr already terminating */
}
- if (!soft)
- lgr->freeing = 1;
list_del_init(&lgr->list);
+ lgr->freeing = 1;
spin_unlock_bh(lgr_lock);
- __smc_lgr_terminate(lgr, soft);
-}
-
-/* Called when IB port is terminated */
-void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
-{
- struct smc_link_group *lgr, *l;
- LIST_HEAD(lgr_free_list);
-
- spin_lock_bh(&smc_lgr_list.lock);
- list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
- if (!lgr->is_smcd &&
- lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
- lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) {
- list_move(&lgr->list, &lgr_free_list);
- lgr->freeing = 1;
- }
- }
- spin_unlock_bh(&smc_lgr_list.lock);
-
- list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
- list_del_init(&lgr->list);
- __smc_lgr_terminate(lgr, false);
- }
+ schedule_work(&lgr->terminate_work);
}
/* Called when peer lgr shutdown (regularly or abnormally) is received */
@@ -692,6 +1514,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
if (peer_gid) /* peer triggered termination */
lgr->peer_shutdown = 1;
list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
}
}
spin_unlock_bh(&dev->lgr_lock);
@@ -732,6 +1555,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
{
struct smc_link_group *lgr, *lg;
LIST_HEAD(lgr_free_list);
+ int i;
spin_lock_bh(&smc_lgr_list.lock);
if (!smcibdev) {
@@ -740,9 +1564,9 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
lgr->freeing = 1;
} else {
list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
- if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) {
- list_move(&lgr->list, &lgr_free_list);
- lgr->freeing = 1;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].smcibdev == smcibdev)
+ smcr_link_down_cond_sched(&lgr->lnk[i]);
}
}
}
@@ -750,6 +1574,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
list_del_init(&lgr->list);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM);
__smc_lgr_terminate(lgr, false);
}
@@ -763,14 +1588,200 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
}
}
-/* Determine vlan of internal TCP socket.
- * @vlan_id: address to store the determined vlan id into
+/* set new lgr type and clear all asymmetric link tagging */
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type)
+{
+ char *lgr_type = "";
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (smc_link_usable(&lgr->lnk[i]))
+ lgr->lnk[i].link_is_asym = false;
+ if (lgr->type == new_type)
+ return;
+ lgr->type = new_type;
+
+ switch (lgr->type) {
+ case SMC_LGR_NONE:
+ lgr_type = "NONE";
+ break;
+ case SMC_LGR_SINGLE:
+ lgr_type = "SINGLE";
+ break;
+ case SMC_LGR_SYMMETRIC:
+ lgr_type = "SYMMETRIC";
+ break;
+ case SMC_LGR_ASYMMETRIC_PEER:
+ lgr_type = "ASYMMETRIC_PEER";
+ break;
+ case SMC_LGR_ASYMMETRIC_LOCAL:
+ lgr_type = "ASYMMETRIC_LOCAL";
+ break;
+ }
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: "
+ "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id,
+ lgr->net->net_cookie, lgr_type, lgr->pnet_id);
+}
+
+/* set new lgr type and tag a link as asymmetric */
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx)
+{
+ smcr_lgr_set_type(lgr, new_type);
+ lgr->lnk[asym_lnk_idx].link_is_asym = true;
+}
+
+/* abort connection, abort_work scheduled from tasklet context */
+static void smc_conn_abort_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ abort_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_conn_kill(conn, true);
+ release_sock(&smc->sk);
+ sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */
+}
+
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *n;
+
+ list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+ struct smc_link *link;
+
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN) ||
+ lgr->type == SMC_LGR_SYMMETRIC ||
+ lgr->type == SMC_LGR_ASYMMETRIC_PEER ||
+ !rdma_dev_access_netns(smcibdev->ibdev, lgr->net))
+ continue;
+
+ /* trigger local add link processing */
+ link = smc_llc_usable_link(lgr);
+ if (link)
+ smc_llc_add_link_local(link);
+ }
+}
+
+/* link is down - switch connections to alternate link,
+ * must be called under lgr->llc_conf_mutex lock
*/
+static void smcr_link_down(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_link *to_lnk;
+ int del_link_id;
+
+ if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
+ return;
+
+ to_lnk = smc_switch_conns(lgr, lnk, true);
+ if (!to_lnk) { /* no backup link available */
+ smcr_link_clear(lnk, true);
+ return;
+ }
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ del_link_id = lnk->link_id;
+
+ if (lgr->role == SMC_SERV) {
+ /* trigger local delete link processing */
+ smc_llc_srv_delete_link_local(to_lnk, del_link_id);
+ } else {
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* another llc task is ongoing */
+ mutex_unlock(&lgr->llc_conf_mutex);
+ wait_event_timeout(lgr->llc_flow_waiter,
+ (list_empty(&lgr->list) ||
+ lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
+ SMC_LLC_WAIT_TIME);
+ mutex_lock(&lgr->llc_conf_mutex);
+ }
+ if (!list_empty(&lgr->list)) {
+ smc_llc_send_delete_link(to_lnk, del_link_id,
+ SMC_LLC_REQ, true,
+ SMC_LLC_DEL_LOST_PATH);
+ smcr_link_clear(lnk, true);
+ }
+ wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */
+ }
+}
+
+/* must be called under lgr->llc_conf_mutex lock */
+void smcr_link_down_cond(struct smc_link *lnk)
+{
+ if (smc_link_downing(&lnk->state)) {
+ trace_smcr_link_down(lnk, __builtin_return_address(0));
+ smcr_link_down(lnk);
+ }
+}
+
+/* will get the lgr->llc_conf_mutex lock */
+void smcr_link_down_cond_sched(struct smc_link *lnk)
+{
+ if (smc_link_downing(&lnk->state)) {
+ trace_smcr_link_down(lnk, __builtin_return_address(0));
+ schedule_work(&lnk->link_down_wrk);
+ }
+}
+
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr, *n;
+ int i;
+
+ list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) {
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN))
+ continue; /* lgr is not affected */
+ if (list_empty(&lgr->list))
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (smc_link_usable(lnk) &&
+ lnk->smcibdev == smcibdev && lnk->ibport == ibport)
+ smcr_link_down_cond_sched(lnk);
+ }
+ }
+}
+
+static void smc_link_down_work(struct work_struct *work)
+{
+ struct smc_link *link = container_of(work, struct smc_link,
+ link_down_wrk);
+ struct smc_link_group *lgr = link->lgr;
+
+ if (list_empty(&lgr->list))
+ return;
+ wake_up_all(&lgr->llc_msg_waiter);
+ mutex_lock(&lgr->llc_conf_mutex);
+ smcr_link_down(link);
+ mutex_unlock(&lgr->llc_conf_mutex);
+}
+
+static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
+ struct netdev_nested_priv *priv)
+{
+ unsigned short *vlan_id = (unsigned short *)priv->data;
+
+ if (is_vlan_dev(lower_dev)) {
+ *vlan_id = vlan_dev_vlan_id(lower_dev);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Determine vlan of internal TCP socket. */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct netdev_nested_priv priv;
struct net_device *ndev;
- int i, nest_lvl, rc = 0;
+ int rc = 0;
ini->vlan_id = 0;
if (!dst) {
@@ -788,20 +1799,9 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
goto out_rel;
}
+ priv.data = (void *)&ini->vlan_id;
rtnl_lock();
- nest_lvl = ndev->lower_level;
- for (i = 0; i < nest_lvl; i++) {
- struct list_head *lower = &ndev->adj_list.lower;
-
- if (list_empty(lower))
- break;
- lower = lower->next;
- ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
- if (is_vlan_dev(ndev)) {
- ini->vlan_id = vlan_dev_vlan_id(ndev);
- break;
- }
- }
+ netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv);
rtnl_unlock();
out_rel:
@@ -810,19 +1810,35 @@ out:
return rc;
}
-static bool smcr_lgr_match(struct smc_link_group *lgr,
- struct smc_clc_msg_local *lcl,
- enum smc_lgr_role role, u32 clcqpn)
+static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version,
+ u8 peer_systemid[],
+ u8 peer_gid[],
+ u8 peer_mac_v1[],
+ enum smc_lgr_role role, u32 clcqpn,
+ struct net *net)
{
- return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
- SMC_SYSTEMID_LEN) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
- SMC_GID_SIZE) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
- sizeof(lcl->mac)) &&
- lgr->role == role &&
- (lgr->role == SMC_SERV ||
- lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn);
+ struct smc_link *lnk;
+ int i;
+
+ if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) ||
+ lgr->role != role)
+ return false;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ lnk = &lgr->lnk[i];
+
+ if (!smc_link_active(lnk))
+ continue;
+ /* use verbs API to check netns, instead of lgr->net */
+ if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net))
+ return false;
+ if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) &&
+ !memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) &&
+ (smcr_version == SMC_V2 ||
+ !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN)))
+ return true;
+ }
+ return false;
}
static bool smcd_lgr_match(struct smc_link_group *lgr,
@@ -835,17 +1851,20 @@ static bool smcd_lgr_match(struct smc_link_group *lgr,
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_connection *conn = &smc->conn;
+ struct net *net = sock_net(&smc->sk);
struct list_head *lgr_list;
struct smc_link_group *lgr;
enum smc_lgr_role role;
spinlock_t *lgr_lock;
int rc = 0;
- lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
- lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
- ini->cln_first_contact = SMC_FIRST_CONTACT;
+ lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
+ &smc_lgr_list.list;
+ lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock :
+ &smc_lgr_list.lock;
+ ini->first_contact_local = 1;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
- if (role == SMC_CLNT && ini->srv_first_contact)
+ if (role == SMC_CLNT && ini->first_contact_peer)
/* create new link group as well */
goto create;
@@ -854,27 +1873,35 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
list_for_each_entry(lgr, lgr_list, list) {
write_lock_bh(&lgr->conns_lock);
if ((ini->is_smcd ?
- smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
- smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
+ smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
+ ini->ism_peer_gid[ini->ism_selected]) :
+ smcr_lgr_match(lgr, ini->smcr_version,
+ ini->peer_systemid,
+ ini->peer_gid, ini->peer_mac, role,
+ ini->ib_clcqpn, net)) &&
!lgr->sync_err &&
- lgr->vlan_id == ini->vlan_id &&
- (role == SMC_CLNT ||
- lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
+ (ini->smcd_version == SMC_V2 ||
+ lgr->vlan_id == ini->vlan_id) &&
+ (role == SMC_CLNT || ini->is_smcd ||
+ (lgr->conns_num < SMC_RMBS_PER_LGR_MAX &&
+ !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) {
/* link group found */
- ini->cln_first_contact = SMC_REUSE_CONTACT;
+ ini->first_contact_local = 0;
conn->lgr = lgr;
- smc_lgr_register_conn(conn); /* add smc conn to lgr */
- if (delayed_work_pending(&lgr->free_work))
- cancel_delayed_work(&lgr->free_work);
+ rc = smc_lgr_register_conn(conn, false);
write_unlock_bh(&lgr->conns_lock);
+ if (!rc && delayed_work_pending(&lgr->free_work))
+ cancel_delayed_work(&lgr->free_work);
break;
}
write_unlock_bh(&lgr->conns_lock);
}
spin_unlock_bh(lgr_lock);
+ if (rc)
+ return rc;
- if (role == SMC_CLNT && !ini->srv_first_contact &&
- ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (role == SMC_CLNT && !ini->first_contact_peer &&
+ ini->first_contact_local) {
/* Server reuses a link group, but Client wants to start
* a new one
* send out_of_sync decline, reason synchr. error
@@ -883,21 +1910,33 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
}
create:
- if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
+ if (ini->first_contact_local) {
rc = smc_lgr_create(smc, ini);
if (rc)
goto out;
lgr = conn->lgr;
write_lock_bh(&lgr->conns_lock);
- smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ rc = smc_lgr_register_conn(conn, true);
write_unlock_bh(&lgr->conns_lock);
+ if (rc) {
+ smc_lgr_cleanup_early(lgr);
+ goto out;
+ }
}
+ smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */
+ if (!conn->lgr->is_smcd)
+ smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */
+ conn->freed = 0;
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
conn->urg_state = SMC_URG_READ;
+ init_waitqueue_head(&conn->cdc_pend_tx_wq);
+ INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
if (ini->is_smcd) {
conn->rx_off = sizeof(struct smcd_cdc_msg);
smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+ } else {
+ conn->rx_off = 0;
}
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&conn->acurs_lock);
@@ -907,21 +1946,30 @@ out:
return rc;
}
-/* convert the RMB size into the compressed notation - minimum 16K.
+#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */
+
+/* convert the RMB size into the compressed notation (minimum 16K, see
+ * SMCD/R_DMBE_SIZES.
* In contrast to plain ilog2, this rounds towards the next power of 2,
* so the socket application gets at least its desired sndbuf / rcvbuf size.
*/
-static u8 smc_compress_bufsize(int size)
+static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb)
{
+ const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE;
u8 compressed;
if (size <= SMC_BUF_MIN_SIZE)
return 0;
- size = (size - 1) >> 14;
- compressed = ilog2(size) + 1;
- if (compressed >= SMC_RMBE_SIZES)
- compressed = SMC_RMBE_SIZES - 1;
+ size = (size - 1) >> 14; /* convert to 16K multiple */
+ compressed = min_t(u8, ilog2(size) + 1,
+ is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES);
+
+ if (!is_smcd && is_rmb)
+ /* RMBs are backed by & limited to max size of scatterlists */
+ compressed = min_t(u8, compressed, ilog2(max_scat >> 14));
+
return compressed;
}
@@ -938,19 +1986,19 @@ int smc_uncompress_bufsize(u8 compressed)
* buffer size; if not available, return NULL
*/
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
- rwlock_t *lock,
+ struct mutex *lock,
struct list_head *buf_list)
{
struct smc_buf_desc *buf_slot;
- read_lock_bh(lock);
+ mutex_lock(lock);
list_for_each_entry(buf_slot, buf_list, list) {
if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
- read_unlock_bh(lock);
+ mutex_unlock(lock);
return buf_slot;
}
}
- read_unlock_bh(lock);
+ mutex_unlock(lock);
return NULL;
}
@@ -960,69 +2008,259 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
*/
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
- return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+ return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
-static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
- bool is_rmb, int bufsize)
+/* map an buf to a link */
+static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
+ struct smc_link *lnk)
{
- struct smc_buf_desc *buf_desc;
- struct smc_link *lnk;
- int rc;
+ int rc, i, nents, offset, buf_size, size, access_flags;
+ struct scatterlist *sg;
+ void *buf;
- /* try to alloc a new buffer */
- buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
- if (!buf_desc)
- return ERR_PTR(-ENOMEM);
+ if (buf_desc->is_map_ib[lnk->link_idx])
+ return 0;
- buf_desc->order = get_order(bufsize);
- buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
- __GFP_NOMEMALLOC | __GFP_COMP |
- __GFP_NORETRY | __GFP_ZERO,
- buf_desc->order);
- if (!buf_desc->pages) {
- kfree(buf_desc);
- return ERR_PTR(-EAGAIN);
- }
- buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
-
- /* build the sg table from the pages */
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
- GFP_KERNEL);
- if (rc) {
- smc_buf_free(lgr, is_rmb, buf_desc);
- return ERR_PTR(rc);
+ if (buf_desc->is_vm) {
+ buf = buf_desc->cpu_addr;
+ buf_size = buf_desc->len;
+ offset = offset_in_page(buf_desc->cpu_addr);
+ nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE;
+ } else {
+ nents = 1;
+ }
+
+ rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ if (buf_desc->is_vm) {
+ /* virtually contiguous buffer */
+ for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) {
+ size = min_t(int, PAGE_SIZE - offset, buf_size);
+ sg_set_page(sg, vmalloc_to_page(buf), size, offset);
+ buf += size / sizeof(*buf);
+ buf_size -= size;
+ offset = 0;
+ }
+ } else {
+ /* physically contiguous buffer */
+ sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl,
+ buf_desc->cpu_addr, buf_desc->len);
}
- sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
- buf_desc->cpu_addr, bufsize);
/* map sg table to DMA address */
- rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc,
+ rc = smc_ib_buf_map_sg(lnk, buf_desc,
is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
/* SMC protocol depends on mapping to one DMA address only */
- if (rc != 1) {
- smc_buf_free(lgr, is_rmb, buf_desc);
- return ERR_PTR(-EAGAIN);
+ if (rc != nents) {
+ rc = -EAGAIN;
+ goto free_table;
}
- /* create a new memory region for the RMB */
- if (is_rmb) {
- rc = smc_ib_get_memory_region(lnk->roce_pd,
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE,
- buf_desc);
- if (rc) {
- smc_buf_free(lgr, is_rmb, buf_desc);
- return ERR_PTR(rc);
+ buf_desc->is_dma_need_sync |=
+ smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx;
+
+ if (is_rmb || buf_desc->is_vm) {
+ /* create a new memory region for the RMB or vzalloced sndbuf */
+ access_flags = is_rmb ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_LOCAL_WRITE;
+
+ rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags,
+ buf_desc, lnk->link_idx);
+ if (rc)
+ goto buf_unmap;
+ smc_ib_sync_sg_for_device(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+ buf_desc->is_map_ib[lnk->link_idx] = true;
+ return 0;
+
+buf_unmap:
+ smc_ib_buf_unmap_sg(lnk, buf_desc,
+ is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+free_table:
+ sg_free_table(&buf_desc->sgt[lnk->link_idx]);
+ return rc;
+}
+
+/* register a new buf on IB device, rmb or vzalloced sndbuf
+ * must be called under lgr->llc_conf_mutex lock
+ */
+int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc)
+{
+ if (list_empty(&link->lgr->list))
+ return -ENOLINK;
+ if (!buf_desc->is_reg_mr[link->link_idx]) {
+ /* register memory region for new buf */
+ if (buf_desc->is_vm)
+ buf_desc->mr[link->link_idx]->iova =
+ (uintptr_t)buf_desc->cpu_addr;
+ if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) {
+ buf_desc->is_reg_err = true;
+ return -EFAULT;
}
+ buf_desc->is_reg_mr[link->link_idx] = true;
}
+ return 0;
+}
- buf_desc->len = bufsize;
+static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
+ struct list_head *lst, bool is_rmb)
+{
+ struct smc_buf_desc *buf_desc, *bf;
+ int rc = 0;
+
+ mutex_lock(lock);
+ list_for_each_entry_safe(buf_desc, bf, lst, list) {
+ if (!buf_desc->used)
+ continue;
+ rc = smcr_buf_map_link(buf_desc, is_rmb, lnk);
+ if (rc)
+ goto out;
+ }
+out:
+ mutex_unlock(lock);
+ return rc;
+}
+
+/* map all used buffers of lgr for a new link */
+int smcr_buf_map_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ int i, rc = 0;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock,
+ &lgr->rmbs[i], true);
+ if (rc)
+ return rc;
+ rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock,
+ &lgr->sndbufs[i], false);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+/* register all used buffers of lgr for a new link,
+ * must be called under lgr->llc_conf_mutex lock
+ */
+int smcr_buf_reg_lgr(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr = lnk->lgr;
+ struct smc_buf_desc *buf_desc, *bf;
+ int i, rc = 0;
+
+ /* reg all RMBs for a new link */
+ mutex_lock(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
+ if (!buf_desc->used)
+ continue;
+ rc = smcr_link_reg_buf(lnk, buf_desc);
+ if (rc) {
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+ }
+ }
+ }
+ mutex_unlock(&lgr->rmbs_lock);
+
+ if (lgr->buf_type == SMCR_PHYS_CONT_BUFS)
+ return rc;
+
+ /* reg all vzalloced sndbufs for a new link */
+ mutex_lock(&lgr->sndbufs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) {
+ if (!buf_desc->used || !buf_desc->is_vm)
+ continue;
+ rc = smcr_link_reg_buf(lnk, buf_desc);
+ if (rc) {
+ mutex_unlock(&lgr->sndbufs_lock);
+ return rc;
+ }
+ }
+ }
+ mutex_unlock(&lgr->sndbufs_lock);
+ return rc;
+}
+
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+ bool is_rmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+
+ /* try to alloc a new buffer */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+
+ switch (lgr->buf_type) {
+ case SMCR_PHYS_CONT_BUFS:
+ case SMCR_MIXED_BUFS:
+ buf_desc->order = get_order(bufsize);
+ buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | __GFP_COMP |
+ __GFP_NORETRY | __GFP_ZERO,
+ buf_desc->order);
+ if (buf_desc->pages) {
+ buf_desc->cpu_addr =
+ (void *)page_address(buf_desc->pages);
+ buf_desc->len = bufsize;
+ buf_desc->is_vm = false;
+ break;
+ }
+ if (lgr->buf_type == SMCR_PHYS_CONT_BUFS)
+ goto out;
+ fallthrough; // try virtually continguous buf
+ case SMCR_VIRT_CONT_BUFS:
+ buf_desc->order = get_order(bufsize);
+ buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order);
+ if (!buf_desc->cpu_addr)
+ goto out;
+ buf_desc->pages = NULL;
+ buf_desc->len = bufsize;
+ buf_desc->is_vm = true;
+ break;
+ }
return buf_desc;
+
+out:
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
}
-#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+/* map buf_desc on all usable links,
+ * unused buffers stay mapped as long as the link is up
+ */
+static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
+ struct smc_buf_desc *buf_desc, bool is_rmb)
+{
+ int i, rc = 0, cnt = 0;
+
+ /* protect against parallel link reconfiguration */
+ mutex_lock(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ struct smc_link *lnk = &lgr->lnk[i];
+
+ if (!smc_link_usable(lnk))
+ continue;
+ if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ cnt++;
+ }
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ if (!rc && !cnt)
+ rc = -EINVAL;
+ return rc;
+}
static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
bool is_dmb, int bufsize)
@@ -1030,9 +2268,6 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
struct smc_buf_desc *buf_desc;
int rc;
- if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
- return ERR_PTR(-EAGAIN);
-
/* try to alloc a new DMB */
buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
if (!buf_desc)
@@ -1041,7 +2276,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
if (rc) {
kfree(buf_desc);
- return ERR_PTR(-EAGAIN);
+ if (rc == -ENOMEM)
+ return ERR_PTR(-EAGAIN);
+ if (rc == -ENOSPC)
+ return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-EIO);
}
buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
/* CDC header stored in buf. So, pretend it was smaller */
@@ -1066,19 +2305,19 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
int bufsize, bufsize_short;
+ bool is_dgraded = false;
+ struct mutex *lock; /* lock buffer list */
int sk_buf_size;
- rwlock_t *lock;
if (is_rmb)
/* use socket recv buffer size (w/o overhead) as start value */
- sk_buf_size = smc->sk.sk_rcvbuf / 2;
+ sk_buf_size = smc->sk.sk_rcvbuf;
else
/* use socket send buffer size (w/o overhead) as start value */
- sk_buf_size = smc->sk.sk_sndbuf / 2;
+ sk_buf_size = smc->sk.sk_sndbuf;
- for (bufsize_short = smc_compress_bufsize(sk_buf_size);
+ for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb);
bufsize_short >= 0; bufsize_short--) {
-
if (is_rmb) {
lock = &lgr->rmbs_lock;
buf_list = &lgr->rmbs[bufsize_short];
@@ -1087,13 +2326,13 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
buf_list = &lgr->sndbufs[bufsize_short];
}
bufsize = smc_uncompress_bufsize(bufsize_short);
- if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC)
- continue;
/* check for reusable slot in the link group */
buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
if (buf_desc) {
- memset(buf_desc->cpu_addr, 0, bufsize);
+ buf_desc->is_dma_need_sync = 0;
+ SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
+ SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb);
break; /* found reusable slot */
}
@@ -1104,23 +2343,37 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
if (PTR_ERR(buf_desc) == -ENOMEM)
break;
- if (IS_ERR(buf_desc))
+ if (IS_ERR(buf_desc)) {
+ if (!is_dgraded) {
+ is_dgraded = true;
+ SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb);
+ }
continue;
+ }
+ SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
+ SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
buf_desc->used = 1;
- write_lock_bh(lock);
+ mutex_lock(lock);
list_add(&buf_desc->list, buf_list);
- write_unlock_bh(lock);
+ mutex_unlock(lock);
break; /* found */
}
if (IS_ERR(buf_desc))
- return -ENOMEM;
+ return PTR_ERR(buf_desc);
+
+ if (!is_smcd) {
+ if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) {
+ smcr_buf_unuse(buf_desc, is_rmb, lgr);
+ return -ENOMEM;
+ }
+ }
if (is_rmb) {
conn->rmb_desc = buf_desc;
conn->rmbe_size_short = bufsize_short;
- smc->sk.sk_rcvbuf = bufsize * 2;
+ smc->sk.sk_rcvbuf = bufsize;
atomic_set(&conn->bytes_to_rcv, 0);
conn->rmbe_update_limit =
smc_rmb_wnd_update_limit(buf_desc->len);
@@ -1128,50 +2381,36 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
} else {
conn->sndbuf_desc = buf_desc;
- smc->sk.sk_sndbuf = bufsize * 2;
+ smc->sk.sk_sndbuf = bufsize;
atomic_set(&conn->sndbuf_space, bufsize);
}
return 0;
}
-void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
-{
- struct smc_link_group *lgr = conn->lgr;
-
- if (!conn->lgr || conn->lgr->is_smcd)
- return;
- smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- conn->sndbuf_desc, DMA_TO_DEVICE);
-}
-
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
- struct smc_link_group *lgr = conn->lgr;
-
- if (!conn->lgr || conn->lgr->is_smcd)
+ if (!conn->sndbuf_desc->is_dma_need_sync)
return;
- smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- conn->sndbuf_desc, DMA_TO_DEVICE);
+ if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd ||
+ !smc_link_active(conn->lnk))
+ return;
+ smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE);
}
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
- struct smc_link_group *lgr = conn->lgr;
+ int i;
- if (!conn->lgr || conn->lgr->is_smcd)
+ if (!conn->rmb_desc->is_dma_need_sync)
return;
- smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- conn->rmb_desc, DMA_FROM_DEVICE);
-}
-
-void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
-{
- struct smc_link_group *lgr = conn->lgr;
-
- if (!conn->lgr || conn->lgr->is_smcd)
+ if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd)
return;
- smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- conn->rmb_desc, DMA_FROM_DEVICE);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&conn->lgr->lnk[i]))
+ continue;
+ smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc,
+ DMA_FROM_DEVICE);
+ }
}
/* create the send and receive buffer for an SMC socket;
@@ -1190,8 +2429,13 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
return rc;
/* create rmb */
rc = __smc_buf_create(smc, is_smcd, true);
- if (rc)
+ if (rc) {
+ mutex_lock(&smc->conn.lgr->sndbufs_lock);
+ list_del(&smc->conn.sndbuf_desc->list);
+ mutex_unlock(&smc->conn.lgr->sndbufs_lock);
smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
+ smc->conn.sndbuf_desc = NULL;
+ }
return rc;
}
@@ -1206,16 +2450,64 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
return -ENOSPC;
}
+static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx,
+ u32 rkey)
+{
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if (test_bit(i, lgr->rtokens_used_mask) &&
+ lgr->rtokens[i][lnk_idx].rkey == rkey)
+ return i;
+ }
+ return -ENOENT;
+}
+
+/* set rtoken for a new link to an existing rmb */
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+ __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey)
+{
+ int rtok_idx;
+
+ rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known));
+ if (rtok_idx == -ENOENT)
+ return;
+ lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey);
+ lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr);
+}
+
+/* set rtoken for a new link whose link_id is given */
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+ __be64 nw_vaddr, __be32 nw_rkey)
+{
+ u64 dma_addr = be64_to_cpu(nw_vaddr);
+ u32 rkey = ntohl(nw_rkey);
+ bool found = false;
+ int link_idx;
+
+ for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) {
+ if (lgr->lnk[link_idx].link_id == link_id) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ return;
+ lgr->rtokens[rtok_idx][link_idx].rkey = rkey;
+ lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr;
+}
+
/* add a new rtoken from peer */
-int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey)
{
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
u64 dma_addr = be64_to_cpu(nw_vaddr);
u32 rkey = ntohl(nw_rkey);
int i;
for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
- if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
- (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
+ if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
+ lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr &&
test_bit(i, lgr->rtokens_used_mask)) {
/* already in list */
return i;
@@ -1224,23 +2516,25 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
i = smc_rmb_reserve_rtoken_idx(lgr);
if (i < 0)
return i;
- lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
- lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
+ lgr->rtokens[i][lnk->link_idx].rkey = rkey;
+ lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr;
return i;
}
-/* delete an rtoken */
-int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
+/* delete an rtoken from all links */
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey)
{
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
u32 rkey = ntohl(nw_rkey);
- int i;
+ int i, j;
for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
- if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
+ if (lgr->rtokens[i][lnk->link_idx].rkey == rkey &&
test_bit(i, lgr->rtokens_used_mask)) {
- lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
- lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;
-
+ for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ lgr->rtokens[i][j].rkey = 0;
+ lgr->rtokens[i][j].dma_addr = 0;
+ }
clear_bit(i, lgr->rtokens_used_mask);
return 0;
}
@@ -1250,10 +2544,11 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
/* save rkey and dma_addr received from peer during clc handshake */
int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_link *lnk,
struct smc_clc_msg_accept_confirm *clc)
{
- conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
- clc->rmb_rkey);
+ conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr,
+ clc->r0.rmb_rkey);
if (conn->rtoken_idx < 0)
return conn->rtoken_idx;
return 0;
@@ -1264,20 +2559,20 @@ static void smc_core_going_away(void)
struct smc_ib_device *smcibdev;
struct smcd_dev *smcd;
- spin_lock(&smc_ib_devices.lock);
+ mutex_lock(&smc_ib_devices.mutex);
list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
int i;
for (i = 0; i < SMC_MAX_PORTS; i++)
set_bit(i, smcibdev->ports_going_away);
}
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
- spin_lock(&smcd_dev_list.lock);
+ mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd, &smcd_dev_list.list, list) {
smcd->going_away = 1;
}
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
}
/* Clean up all SMC link groups */
@@ -1289,10 +2584,10 @@ static void smc_lgrs_shutdown(void)
smc_smcr_terminate_all(NULL);
- spin_lock(&smcd_dev_list.lock);
+ mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd, &smcd_dev_list.list, list)
smc_smcd_terminate_all(smcd);
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
}
static int smc_core_reboot_event(struct notifier_block *this,
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 234ae25f0025..285f9bd8e232 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -13,7 +13,10 @@
#define _SMC_CORE_H
#include <linux/atomic.h>
+#include <linux/smc.h>
+#include <linux/pci.h>
#include <rdma/ib_verbs.h>
+#include <net/genetlink.h>
#include "smc.h"
#include "smc_ib.h"
@@ -32,18 +35,23 @@ enum smc_lgr_role { /* possible roles of a link group */
};
enum smc_link_state { /* possible states of a link */
+ SMC_LNK_UNUSED, /* link is unused */
SMC_LNK_INACTIVE, /* link is inactive */
SMC_LNK_ACTIVATING, /* link is being activated */
SMC_LNK_ACTIVE, /* link is active */
- SMC_LNK_DELETING, /* link is being deleted */
};
#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */
struct smc_wr_buf {
u8 raw[SMC_WR_BUF_SIZE];
};
+struct smc_wr_v2_buf {
+ u8 raw[SMC_WR_BUF_V2_SIZE];
+};
+
#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
enum smc_wr_reg_state {
@@ -70,6 +78,8 @@ struct smc_rdma_wr { /* work requests per message
struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
};
+#define SMC_LGR_ID_SIZE 4
+
struct smc_link {
struct smc_ib_device *smcibdev; /* ib-device */
u8 ibport; /* port - values 1 | 2 */
@@ -85,24 +95,34 @@ struct smc_link {
struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ struct completion *wr_tx_compl; /* WR send CQE completion */
/* above four vectors have wr_tx_cnt elements and use the same index */
+ struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */
+ struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/
+ struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */
dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/
atomic_long_t wr_tx_id; /* seq # of last sent WR */
unsigned long *wr_tx_mask; /* bit mask of used indexes */
u32 wr_tx_cnt; /* number of WR send buffers */
wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+ atomic_t wr_tx_refcnt; /* tx refs to link */
struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
/* above three vectors have wr_rx_cnt elements and use the same index */
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
u64 wr_rx_id; /* seq # of last recv WR */
+ u64 wr_rx_id_compl; /* seq # of last completed WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
+ wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */
struct ib_reg_wr wr_reg; /* WR register memory region */
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
+ atomic_t wr_reg_refcnt; /* reg refs to link */
enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
@@ -115,34 +135,30 @@ struct smc_link {
u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
u8 link_id; /* unique # within link group */
+ u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
+ u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
+ u8 link_idx; /* index in lgr link array */
+ u8 link_is_asym; /* is link asymmetric? */
+ u8 clearing : 1; /* link is being cleared */
+ refcount_t refcnt; /* link reference count */
+ struct smc_link_group *lgr; /* parent link group */
+ struct work_struct link_down_wrk; /* wrk to bring link down */
+ char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */
+ int ndev_ifidx; /* network device ifindex */
enum smc_link_state state; /* state of link */
- struct workqueue_struct *llc_wq; /* single thread work queue */
- struct completion llc_confirm; /* wait for rx of conf link */
- struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
- int llc_confirm_rc; /* rc from confirm link msg */
- int llc_confirm_resp_rc; /* rc from conf_resp msg */
- struct completion llc_add; /* wait for rx of add link */
- struct completion llc_add_resp; /* wait for rx of add link rsp*/
struct delayed_work llc_testlink_wrk; /* testlink worker */
struct completion llc_testlink_resp; /* wait for rx of testlink */
int llc_testlink_time; /* testlink interval */
- struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */
- int llc_confirm_rkey_rc; /* rc from cnf rkey msg */
- struct completion llc_delete_rkey; /* wait 4 rx of del rkey */
- int llc_delete_rkey_rc; /* rc from del rkey msg */
- struct mutex llc_delete_rkey_mutex; /* serialize usage */
+ atomic_t conn_cnt; /* connections on this link */
};
/* For now we just allow one parallel link per link group. The SMC protocol
* allows more (up to 8).
*/
-#define SMC_LINKS_PER_LGR_MAX 1
+#define SMC_LINKS_PER_LGR_MAX 3
#define SMC_SINGLE_LINK 0
-#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
-#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
-
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
@@ -150,25 +166,37 @@ struct smc_buf_desc {
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
- u8 wr_reg : 1; /* mem region registered */
- u8 regerr : 1; /* err during registration */
union {
struct { /* SMC-R */
- struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
- /* virtual buffer */
- struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
- /* for rmb only: memory region
- * incl. rkey provided to peer
- */
- u32 order; /* allocation order */
+ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
+ /* virtual buffer */
+ struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX];
+ /* memory region: for rmb and
+ * vzalloced sndbuf
+ * incl. rkey provided to peer
+ * and lkey provided to local
+ */
+ u32 order; /* allocation order */
+
+ u8 is_conf_rkey;
+ /* confirm_rkey done */
+ u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX];
+ /* mem region registered */
+ u8 is_map_ib[SMC_LINKS_PER_LGR_MAX];
+ /* mem region mapped to lnk */
+ u8 is_dma_need_sync;
+ u8 is_reg_err;
+ /* buffer registration err */
+ u8 is_vm;
+ /* virtually contiguous */
};
struct { /* SMC-D */
- unsigned short sba_idx;
- /* SBA index number */
- u64 token;
- /* DMB token number */
- dma_addr_t dma_addr;
- /* DMA address */
+ unsigned short sba_idx;
+ /* SBA index number */
+ u64 token;
+ /* DMB token number */
+ dma_addr_t dma_addr;
+ /* DMA address */
};
};
};
@@ -178,7 +206,6 @@ struct smc_rtoken { /* address/key of remote RMB */
u32 rkey;
};
-#define SMC_LGR_ID_SIZE 4
#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
/* theoretically, the RFC states that largest size would be 512K,
@@ -188,6 +215,35 @@ struct smc_rtoken { /* address/key of remote RMB */
struct smcd_dev;
+enum smc_lgr_type { /* redundancy state of lgr */
+ SMC_LGR_NONE, /* no active links, lgr to be deleted */
+ SMC_LGR_SINGLE, /* 1 active RNIC on each peer */
+ SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */
+ SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */
+ SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */
+};
+
+enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */
+ SMCR_PHYS_CONT_BUFS = 0,
+ SMCR_VIRT_CONT_BUFS = 1,
+ SMCR_MIXED_BUFS = 2,
+};
+
+enum smc_llc_flowtype {
+ SMC_LLC_FLOW_NONE = 0,
+ SMC_LLC_FLOW_ADD_LINK = 2,
+ SMC_LLC_FLOW_DEL_LINK = 4,
+ SMC_LLC_FLOW_REQ_ADD_LINK = 5,
+ SMC_LLC_FLOW_RKEY = 6,
+};
+
+struct smc_llc_qentry;
+
+struct smc_llc_flow {
+ enum smc_llc_flowtype type;
+ struct smc_llc_qentry *qentry;
+};
+
struct smc_link_group {
struct list_head list;
struct rb_root conns_all; /* connection tree */
@@ -196,25 +252,35 @@ struct smc_link_group {
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
- rwlock_t sndbufs_lock; /* protects tx buffers */
+ struct mutex sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
- rwlock_t rmbs_lock; /* protects rx buffers */
+ struct mutex rmbs_lock; /* protects rx buffers */
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
struct work_struct terminate_work; /* abnormal lgr termination */
+ struct workqueue_struct *tx_wq; /* wq for conn. tx workers */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
- u8 freefast : 1; /* free worker scheduled fast */
u8 freeing : 1; /* lgr is being freed */
+ refcount_t refcnt; /* lgr reference count */
bool is_smcd; /* SMC-R or SMC-D */
+ u8 smc_version;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
+ u8 peer_os; /* peer operating system */
+ u8 peer_smc_release;
+ u8 peer_hostname[SMC_MAX_HOSTNAME_LEN];
union {
struct { /* SMC-R */
enum smc_lgr_role role;
/* client or server */
struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
/* smc link */
+ struct smc_wr_v2_buf *wr_rx_buf_v2;
+ /* WR v2 recv payload buffer */
+ struct smc_wr_v2_buf *wr_tx_buf_v2;
+ /* WR v2 send payload buffer */
char peer_systemid[SMC_SYSTEMID_LEN];
/* unique system_id of peer */
struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
@@ -222,6 +288,43 @@ struct smc_link_group {
/* remote addr/key pairs */
DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
/* used rtoken elements */
+ u8 next_link_id;
+ enum smc_lgr_type type;
+ enum smcr_buf_type buf_type;
+ /* redundancy state */
+ u8 pnet_id[SMC_MAX_PNETID_LEN + 1];
+ /* pnet id of this lgr */
+ struct list_head llc_event_q;
+ /* queue for llc events */
+ spinlock_t llc_event_q_lock;
+ /* protects llc_event_q */
+ struct mutex llc_conf_mutex;
+ /* protects lgr reconfig. */
+ struct work_struct llc_add_link_work;
+ struct work_struct llc_del_link_work;
+ struct work_struct llc_event_work;
+ /* llc event worker */
+ wait_queue_head_t llc_flow_waiter;
+ /* w4 next llc event */
+ wait_queue_head_t llc_msg_waiter;
+ /* w4 next llc msg */
+ struct smc_llc_flow llc_flow_lcl;
+ /* llc local control field */
+ struct smc_llc_flow llc_flow_rmt;
+ /* llc remote control field */
+ struct smc_llc_qentry *delayed_event;
+ /* arrived when flow active */
+ spinlock_t llc_flow_lock;
+ /* protects llc flow */
+ int llc_testlink_time;
+ /* link keep alive time */
+ u32 llc_termination_rsn;
+ /* rsn code for termination */
+ u8 nexthop_mac[ETH_ALEN];
+ u8 uses_gateway;
+ __be32 saddr;
+ /* net namespace */
+ struct net *net;
};
struct { /* SMC-D */
u64 peer_gid;
@@ -236,20 +339,58 @@ struct smc_link_group {
struct smc_clc_msg_local;
+#define GID_LIST_SIZE 2
+
+struct smc_gidlist {
+ u8 len;
+ u8 list[GID_LIST_SIZE][SMC_GID_SIZE];
+};
+
+struct smc_init_info_smcrv2 {
+ /* Input fields */
+ __be32 saddr;
+ struct sock *clc_sk;
+ __be32 daddr;
+
+ /* Output fields when saddr is set */
+ struct smc_ib_device *ib_dev_v2;
+ u8 ib_port_v2;
+ u8 ib_gid_v2[SMC_GID_SIZE];
+
+ /* Additional output fields when clc_sk and daddr is set as well */
+ u8 uses_gateway;
+ u8 nexthop_mac[ETH_ALEN];
+
+ struct smc_gidlist gidlist;
+};
+
struct smc_init_info {
u8 is_smcd;
+ u8 smc_type_v1;
+ u8 smc_type_v2;
+ u8 first_contact_peer;
+ u8 first_contact_local;
unsigned short vlan_id;
- int srv_first_contact;
- int cln_first_contact;
+ u32 rc;
+ u8 negotiated_eid[SMC_MAX_EID_LEN];
/* SMC-R */
- struct smc_clc_msg_local *ib_lcl;
+ u8 smcr_version;
+ u8 check_smcrv2;
+ u8 peer_gid[SMC_GID_SIZE];
+ u8 peer_mac[ETH_ALEN];
+ u8 peer_systemid[SMC_SYSTEMID_LEN];
struct smc_ib_device *ib_dev;
u8 ib_gid[SMC_GID_SIZE];
u8 ib_port;
u32 ib_clcqpn;
+ struct smc_init_info_smcrv2 smcrv2;
/* SMC-D */
- u64 ism_gid;
- struct smcd_dev *ism_dev;
+ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
+ struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1];
+ u16 ism_chid[SMC_MAX_ISM_DEVS + 1];
+ u8 ism_offered_cnt; /* # of ISM devices offered */
+ u8 ism_selected; /* index of selected ISM dev*/
+ u8 smcd_version;
};
/* Find the connection associated with the given alert token in the link group.
@@ -285,34 +426,109 @@ static inline struct smc_connection *smc_lgr_find_conn(
return res;
}
-static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr)
+static inline bool smc_conn_lgr_valid(struct smc_connection *conn)
{
- if (!lgr->terminating && !lgr->freeing)
- schedule_work(&lgr->terminate_work);
+ return conn->lgr && conn->alert_token_local;
+}
+
+/*
+ * Returns true if the specified link is usable.
+ *
+ * usable means the link is ready to receive RDMA messages, map memory
+ * on the link, etc. This doesn't ensure we are able to send RDMA messages
+ * on this link, if sending RDMA messages is needed, use smc_link_sendable()
+ */
+static inline bool smc_link_usable(struct smc_link *lnk)
+{
+ if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE)
+ return false;
+ return true;
+}
+
+/*
+ * Returns true if the specified link is ready to receive AND send RDMA
+ * messages.
+ *
+ * For the client side in first contact, the underlying QP may still in
+ * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable()
+ * is not strong enough. For those places that need to send any CDC or LLC
+ * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead
+ */
+static inline bool smc_link_sendable(struct smc_link *lnk)
+{
+ return smc_link_usable(lnk) &&
+ lnk->qp_attr.cur_qp_state == IB_QPS_RTS;
+}
+
+static inline bool smc_link_active(struct smc_link *lnk)
+{
+ return lnk->state == SMC_LNK_ACTIVE;
+}
+
+static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+ sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ be16_to_cpu(((__be16 *)gid_raw)[0]),
+ be16_to_cpu(((__be16 *)gid_raw)[1]),
+ be16_to_cpu(((__be16 *)gid_raw)[2]),
+ be16_to_cpu(((__be16 *)gid_raw)[3]),
+ be16_to_cpu(((__be16 *)gid_raw)[4]),
+ be16_to_cpu(((__be16 *)gid_raw)[5]),
+ be16_to_cpu(((__be16 *)gid_raw)[6]),
+ be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+struct smc_pci_dev {
+ __u32 pci_fid;
+ __u16 pci_pchid;
+ __u16 pci_vendor;
+ __u16 pci_device;
+ __u8 pci_id[SMC_PCI_ID_STR_LEN];
+};
+
+static inline void smc_set_pci_values(struct pci_dev *pci_dev,
+ struct smc_pci_dev *smc_dev)
+{
+ smc_dev->pci_vendor = pci_dev->vendor;
+ smc_dev->pci_device = pci_dev->device;
+ snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s",
+ pci_name(pci_dev));
+#if IS_ENABLED(CONFIG_S390)
+ { /* Set s390 specific PCI information */
+ struct zpci_dev *zdev;
+
+ zdev = to_zpci(pci_dev);
+ smc_dev->pci_fid = zdev->fid;
+ smc_dev->pci_pchid = zdev->pchid;
+ }
+#endif
}
struct smc_sock;
struct smc_clc_msg_accept_confirm;
-struct smc_clc_msg_local;
-void smc_lgr_forget(struct smc_link_group *lgr);
-void smc_lgr_cleanup_early(struct smc_connection *conn);
-void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
-void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
+void smc_lgr_cleanup_early(struct smc_link_group *lgr);
+void smc_lgr_terminate_sched(struct smc_link_group *lgr);
+void smc_lgr_hold(struct smc_link_group *lgr);
+void smc_lgr_put(struct smc_link_group *lgr);
+void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
+void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
unsigned short vlan);
void smc_smcd_terminate_all(struct smcd_dev *dev);
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
-int smc_rmb_rtoken_handling(struct smc_connection *conn,
+int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc);
-int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey);
-int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey);
-void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
+int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
+int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
+void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
+ __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
+void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
+ __be64 nw_vaddr, __be32 nw_rkey);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
-void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
void smc_conn_free(struct smc_connection *conn);
@@ -321,8 +537,30 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
int smc_core_init(void);
void smc_core_exit(void);
+int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
+ u8 link_idx, struct smc_init_info *ini);
+void smcr_link_clear(struct smc_link *lnk, bool log);
+void smcr_link_hold(struct smc_link *lnk);
+void smcr_link_put(struct smc_link *lnk);
+void smc_switch_link_and_count(struct smc_connection *conn,
+ struct smc_link *to_lnk);
+int smcr_buf_map_lgr(struct smc_link *lnk);
+int smcr_buf_reg_lgr(struct smc_link *lnk);
+void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
+void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
+ enum smc_lgr_type new_type, int asym_lnk_idx);
+int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc);
+struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
+ struct smc_link *from_lnk, bool is_dev_err);
+void smcr_link_down_cond(struct smc_link *lnk);
+void smcr_link_down_cond_sched(struct smc_link *lnk);
+int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
+int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
+int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
+
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
{
- return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ return link->lgr;
}
#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index e1f64f4ba236..80ea7d954ece 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -22,17 +22,13 @@
#include "smc.h"
#include "smc_core.h"
-static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+struct smc_diag_dump_ctx {
+ int pos[2];
+};
+
+static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb)
{
- sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
- be16_to_cpu(((__be16 *)gid_raw)[0]),
- be16_to_cpu(((__be16 *)gid_raw)[1]),
- be16_to_cpu(((__be16 *)gid_raw)[2]),
- be16_to_cpu(((__be16 *)gid_raw)[3]),
- be16_to_cpu(((__be16 *)gid_raw)[4]),
- be16_to_cpu(((__be16 *)gid_raw)[5]),
- be16_to_cpu(((__be16 *)gid_raw)[6]),
- be16_to_cpu(((__be16 *)gid_raw)[7]));
+ return (struct smc_diag_dump_ctx *)cb->ctx;
}
static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
@@ -93,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
r->diag_state = sk->sk_state;
if (smc->use_fallback)
r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
- else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
+ else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd)
r->diag_mode = SMC_DIAG_MODE_SMCD;
else
r->diag_mode = SMC_DIAG_MODE_SMCR;
@@ -146,37 +142,39 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
goto errout;
}
- if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
+ if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd &&
(req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
+ struct smc_link *link = smc->conn.lnk;
+
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
- .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
- .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+ .lnk[0].ibport = link->ibport,
+ .lnk[0].link_id = link->link_id,
};
memcpy(linfo.lnk[0].ibname,
smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
- sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
- smc_gid_be16_convert(linfo.lnk[0].gid,
- smc->conn.lgr->lnk[0].gid);
- smc_gid_be16_convert(linfo.lnk[0].peer_gid,
- smc->conn.lgr->lnk[0].peer_gid);
+ sizeof(link->smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid, link->gid);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid);
if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
goto errout;
}
- if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
+ if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd &&
(req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
struct smc_connection *conn = &smc->conn;
- struct smcd_diag_dmbinfo dinfo = {
- .linkid = *((u32 *)conn->lgr->id),
- .peer_gid = conn->lgr->peer_gid,
- .my_gid = conn->lgr->smcd->local_gid,
- .token = conn->rmb_desc->token,
- .peer_token = conn->peer_token
- };
+ struct smcd_diag_dmbinfo dinfo;
+
+ memset(&dinfo, 0, sizeof(dinfo));
+
+ dinfo.linkid = *((u32 *)conn->lgr->id);
+ dinfo.peer_gid = conn->lgr->peer_gid;
+ dinfo.my_gid = conn->lgr->smcd->local_gid;
+ dinfo.token = conn->rmb_desc->token;
+ dinfo.peer_token = conn->peer_token;
if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
goto errout;
@@ -191,13 +189,15 @@ errout:
}
static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, int p_type)
{
+ struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb);
struct net *net = sock_net(skb->sk);
+ int snum = cb_ctx->pos[p_type];
struct nlattr *bc = NULL;
struct hlist_head *head;
+ int rc = 0, num = 0;
struct sock *sk;
- int rc = 0;
read_lock(&prot->h.smc_hash->lock);
head = &prot->h.smc_hash->ht;
@@ -207,13 +207,18 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
sk_for_each(sk, head) {
if (!net_eq(sock_net(sk), net))
continue;
+ if (num < snum)
+ goto next;
rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
- if (rc)
- break;
+ if (rc < 0)
+ goto out;
+next:
+ num++;
}
out:
read_unlock(&prot->h.smc_hash->lock);
+ cb_ctx->pos[p_type] = num;
return rc;
}
@@ -221,10 +226,10 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int rc = 0;
- rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+ rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC);
if (!rc)
- rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
- return rc;
+ smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6);
+ return skb->len;
}
static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
@@ -263,3 +268,4 @@ module_init(smc_diag_init);
module_exit(smc_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
+MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 05b825b3cfa4..854772dd52fd 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -12,10 +12,14 @@
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
@@ -24,6 +28,7 @@
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
+#include "smc_netlink.h"
#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */
@@ -33,15 +38,11 @@
#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
- .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+ .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
.list = LIST_HEAD_INIT(smc_ib_devices.list),
};
-#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
-
-u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
- * identifier
- */
+u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
@@ -64,16 +65,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
struct ib_qp_attr qp_attr;
+ u8 hop_lim = 1;
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.qp_state = IB_QPS_RTR;
qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
- rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
+ if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+ hop_lim = IPV6_DEFAULT_HOPLIMIT;
+ rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
- memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
- sizeof(lnk->peer_mac));
+ if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+ memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
+ sizeof(lnk->lgr->nexthop_mac));
+ else
+ memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
qp_attr.dest_qp_num = lnk->peer_qpn;
qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
@@ -103,12 +111,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
IB_QP_MAX_QP_RD_ATOMIC);
}
-int smc_ib_modify_qp_reset(struct smc_link *lnk)
+int smc_ib_modify_qp_error(struct smc_link *lnk)
{
struct ib_qp_attr qp_attr;
memset(&qp_attr, 0, sizeof(qp_attr));
- qp_attr.qp_state = IB_QPS_RESET;
+ qp_attr.qp_state = IB_QPS_ERR;
return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}
@@ -168,6 +176,15 @@ static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
{
memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
sizeof(smcibdev->mac[ibport - 1]));
+}
+
+bool smc_ib_is_valid_local_systemid(void)
+{
+ return !is_zero_ether_addr(&local_systemid[2]);
+}
+
+static void smc_ib_init_local_systemid(void)
+{
get_random_bytes(&local_systemid[0], 2);
}
@@ -176,9 +193,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+ u8 nexthop_mac[], u8 *uses_gateway)
+{
+ struct neighbour *neigh = NULL;
+ struct rtable *rt = NULL;
+ struct flowi4 fl4 = {
+ .saddr = saddr,
+ .daddr = daddr
+ };
+
+ if (daddr == cpu_to_be32(INADDR_NONE))
+ goto out;
+ rt = ip_route_output_flow(&init_net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out;
+ if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
+ goto out;
+ neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
+ if (neigh) {
+ memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
+ *uses_gateway = rt->rt_uses_gateway;
+ return 0;
+ }
+out:
+ return -ENOENT;
+}
+
+static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
+ const struct ib_gid_attr *attr,
+ u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2)
+{
+ if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ return 0;
+ }
+ if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+ smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
+ struct in_device *in_dev = __in_dev_get_rcu(ndev);
+ const struct in_ifaddr *ifa;
+ bool subnet_match = false;
+
+ if (!in_dev)
+ goto out;
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (!inet_ifa_match(smcrv2->saddr, ifa))
+ continue;
+ subnet_match = true;
+ break;
+ }
+ if (!subnet_match)
+ goto out;
+ if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
+ smcrv2->daddr,
+ smcrv2->nexthop_mac,
+ &smcrv2->uses_gateway))
+ goto out;
+
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ return 0;
+ }
+out:
+ return -ENODEV;
+}
+
/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
- unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2)
{
const struct ib_gid_attr *attr;
const struct net_device *ndev;
@@ -192,17 +281,15 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
rcu_read_lock();
ndev = rdma_read_gid_attr_ndev_rcu(attr);
if (!IS_ERR(ndev) &&
- ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
- (vlan_id && is_vlan_dev(attr->ndev) &&
- vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
- attr->gid_type == IB_GID_TYPE_ROCE) {
- rcu_read_unlock();
- if (gid)
- memcpy(gid, &attr->gid, SMC_GID_SIZE);
- if (sgid_index)
- *sgid_index = attr->index;
- rdma_put_gid_attr(attr);
- return 0;
+ ((!vlan_id && !is_vlan_dev(ndev)) ||
+ (vlan_id && is_vlan_dev(ndev) &&
+ vlan_dev_vlan_id(ndev) == vlan_id))) {
+ if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
+ sgid_index, smcrv2)) {
+ rcu_read_unlock();
+ rdma_put_gid_attr(attr);
+ return 0;
+ }
}
rcu_read_unlock();
rdma_put_gid_attr(attr);
@@ -210,6 +297,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
return -ENODEV;
}
+/* check if gid is still defined on smcibdev */
+static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
+ struct smc_ib_device *smcibdev, u8 ibport)
+{
+ const struct ib_gid_attr *attr;
+ bool rc = false;
+ int i;
+
+ for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+ if (IS_ERR(attr))
+ continue;
+
+ rcu_read_lock();
+ if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
+ (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+ !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
+ & IPV6_ADDR_LINKLOCAL)))
+ if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
+ rc = true;
+ rcu_read_unlock();
+ rdma_put_gid_attr(attr);
+ }
+ return rc;
+}
+
+/* check all links if the gid is still defined on smcibdev */
+static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct smc_link_group *lgr;
+ int i;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+ SMC_MAX_PNETID_LEN))
+ continue; /* lgr is not affected */
+ if (list_empty(&lgr->list))
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+ lgr->lnk[i].smcibdev != smcibdev)
+ continue;
+ if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
+ lgr->smc_version == SMC_V2,
+ smcibdev, ibport))
+ smcr_port_err(smcibdev, ibport);
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+}
+
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
int rc;
@@ -224,8 +363,7 @@ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
rc = smc_ib_fill_mac(smcibdev, ibport);
if (rc)
goto out;
- if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
- sizeof(local_systemid)) &&
+ if (!smc_ib_is_valid_local_systemid() &&
smc_ib_port_active(smcibdev, ibport))
/* create unique system identifier */
smc_ib_define_local_systemid(smcibdev, ibport);
@@ -245,9 +383,11 @@ static void smc_ib_port_event_work(struct work_struct *work)
clear_bit(port_idx, &smcibdev->port_event_mask);
if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
set_bit(port_idx, smcibdev->ports_going_away);
- smc_port_terminate(smcibdev, port_idx + 1);
+ smcr_port_err(smcibdev, port_idx + 1);
} else {
clear_bit(port_idx, smcibdev->ports_going_away);
+ smcr_port_add(smcibdev, port_idx + 1);
+ smc_ib_gid_check(smcibdev, port_idx + 1);
}
}
}
@@ -257,6 +397,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
struct ib_event *ibevent)
{
struct smc_ib_device *smcibdev;
+ bool schedule = false;
u8 port_idx;
smcibdev = container_of(handler, struct smc_ib_device, event_handler);
@@ -266,22 +407,35 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
/* terminate all ports on device */
for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
set_bit(port_idx, &smcibdev->port_event_mask);
- set_bit(port_idx, smcibdev->ports_going_away);
+ if (!test_and_set_bit(port_idx,
+ smcibdev->ports_going_away))
+ schedule = true;
}
- schedule_work(&smcibdev->port_event_work);
+ if (schedule)
+ schedule_work(&smcibdev->port_event_work);
break;
- case IB_EVENT_PORT_ERR:
case IB_EVENT_PORT_ACTIVE:
- case IB_EVENT_GID_CHANGE:
port_idx = ibevent->element.port_num - 1;
- if (port_idx < SMC_MAX_PORTS) {
- set_bit(port_idx, &smcibdev->port_event_mask);
- if (ibevent->event == IB_EVENT_PORT_ERR)
- set_bit(port_idx, smcibdev->ports_going_away);
- else if (ibevent->event == IB_EVENT_PORT_ACTIVE)
- clear_bit(port_idx, smcibdev->ports_going_away);
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
schedule_work(&smcibdev->port_event_work);
- }
+ break;
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ case IB_EVENT_GID_CHANGE:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
break;
default:
break;
@@ -306,6 +460,171 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
return rc;
}
+static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
+ struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr;
+ bool rc = false;
+ int i;
+
+ spin_lock_bh(&smc_lgr->lock);
+ list_for_each_entry(lgr, &smc_lgr->list, list) {
+ if (lgr->is_smcd)
+ continue;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+ lgr->lnk[i].smcibdev != smcibdev)
+ continue;
+ if (lgr->type == SMC_LGR_SINGLE ||
+ lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
+ rc = true;
+ goto out;
+ }
+ }
+ }
+out:
+ spin_unlock_bh(&smc_lgr->lock);
+ return rc;
+}
+
+static int smc_nl_handle_dev_port(struct sk_buff *skb,
+ struct ib_device *ibdev,
+ struct smc_ib_device *smcibdev,
+ int port)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct nlattr *port_attrs;
+ unsigned char port_state;
+ int lnk_count = 0;
+
+ port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
+ if (!port_attrs)
+ goto errout;
+
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
+ smcibdev->pnetid_by_user[port]))
+ goto errattr;
+ memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
+ goto errattr;
+ if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
+ smcibdev->ndev_ifidx[port]))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
+ goto errattr;
+ port_state = smc_ib_port_active(smcibdev, port + 1);
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
+ goto errattr;
+ lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
+ if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
+ goto errattr;
+ nla_nest_end(skb, port_attrs);
+ return 0;
+errattr:
+ nla_nest_cancel(skb, port_attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
+ struct sk_buff *skb)
+{
+ if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
+ return false;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
+ return false;
+ if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
+ return false;
+ return true;
+}
+
+static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_ibname[IB_DEVICE_NAME_MAX];
+ struct smc_pci_dev smc_pci_dev;
+ struct pci_dev *pci_dev;
+ unsigned char is_crit;
+ struct nlattr *attrs;
+ void *nlh;
+ int i;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_DEV_SMCR);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
+ if (!attrs)
+ goto errout;
+ is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
+ if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
+ goto errattr;
+ if (smcibdev->ibdev->dev.parent) {
+ memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
+ pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
+ smc_set_pci_values(pci_dev, &smc_pci_dev);
+ if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
+ goto errattr;
+ }
+ snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
+ if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
+ goto errattr;
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(smcibdev->ibdev, i))
+ continue;
+ if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
+ smcibdev, i - 1))
+ goto errattr;
+ }
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct smc_ib_device *smcibdev;
+ int snum = cb_ctx->pos[0];
+ int num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcibdev, &dev_list->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+}
+
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
+ return skb->len;
+}
+
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
struct smc_link *lnk = (struct smc_link *)priv;
@@ -316,11 +635,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
case IB_EVENT_QP_FATAL:
case IB_EVENT_QP_ACCESS_ERR:
port_idx = ibevent->element.qp->port - 1;
- if (port_idx < SMC_MAX_PORTS) {
- set_bit(port_idx, &smcibdev->port_event_mask);
- set_bit(port_idx, smcibdev->ports_going_away);
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
schedule_work(&smcibdev->port_event_work);
- }
break;
default:
break;
@@ -337,6 +656,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
+ int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
struct ib_qp_init_attr qp_attr = {
.event_handler = smc_ib_qp_event_handler,
.qp_context = lnk,
@@ -350,7 +670,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.max_send_wr = SMC_WR_BUF_CNT * 3,
.max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE,
- .max_recv_sge = 1,
+ .max_recv_sge = sges_per_buf,
+ .max_inline_data = 0,
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_RC,
@@ -371,15 +692,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr)
ib_dereg_mr(mr);
}
-static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot)
+static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
unsigned int offset = 0;
int sg_num;
/* map the largest prefix of a dma mapped SG list */
- sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK],
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+ sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
+ buf_slot->sgt[link_idx].sgl,
+ buf_slot->sgt[link_idx].orig_nents,
&offset, PAGE_SIZE);
return sg_num;
@@ -387,41 +708,68 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot)
/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct smc_buf_desc *buf_slot)
+ struct smc_buf_desc *buf_slot, u8 link_idx)
{
- if (buf_slot->mr_rx[SMC_SINGLE_LINK])
+ if (buf_slot->mr[link_idx])
return 0; /* already done */
- buf_slot->mr_rx[SMC_SINGLE_LINK] =
+ buf_slot->mr[link_idx] =
ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
- if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) {
+ if (IS_ERR(buf_slot->mr[link_idx])) {
int rc;
- rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]);
- buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL;
+ rc = PTR_ERR(buf_slot->mr[link_idx]);
+ buf_slot->mr[link_idx] = NULL;
return rc;
}
- if (smc_ib_map_mr_sg(buf_slot) != 1)
+ if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
+ buf_slot->sgt[link_idx].orig_nents)
return -EINVAL;
return 0;
}
+bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot)
+{
+ struct scatterlist *sg;
+ unsigned int i;
+ bool ret = false;
+
+ /* for now there is just one DMA address */
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
+ buf_slot->sgt[lnk->link_idx].nents, i) {
+ if (!sg_dma_len(sg))
+ break;
+ if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
+ sg_dma_address(sg))) {
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
/* synchronize buffer usage for cpu access */
-void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
struct scatterlist *sg;
unsigned int i;
+ if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
+ return;
+
/* for now there is just one DMA address */
- for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
- buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
+ buf_slot->sgt[lnk->link_idx].nents, i) {
if (!sg_dma_len(sg))
break;
- ib_dma_sync_single_for_cpu(smcibdev->ibdev,
+ ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
sg_dma_address(sg),
sg_dma_len(sg),
data_direction);
@@ -429,19 +777,22 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
}
/* synchronize buffer usage for device access */
-void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
struct scatterlist *sg;
unsigned int i;
+ if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
+ return;
+
/* for now there is just one DMA address */
- for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg,
- buf_slot->sgt[SMC_SINGLE_LINK].nents, i) {
+ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
+ buf_slot->sgt[lnk->link_idx].nents, i) {
if (!sg_dma_len(sg))
break;
- ib_dma_sync_single_for_device(smcibdev->ibdev,
+ ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
sg_dma_address(sg),
sg_dma_len(sg),
data_direction);
@@ -449,15 +800,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
}
/* Map a new TX or RX buffer SG-table to DMA */
-int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
+int smc_ib_buf_map_sg(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
int mapped_nents;
- mapped_nents = ib_dma_map_sg(smcibdev->ibdev,
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+ mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
+ buf_slot->sgt[lnk->link_idx].sgl,
+ buf_slot->sgt[lnk->link_idx].orig_nents,
data_direction);
if (!mapped_nents)
return -ENOMEM;
@@ -465,18 +816,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
return mapped_nents;
}
-void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction)
{
- if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address)
+ if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
return; /* already unmapped */
- ib_dma_unmap_sg(smcibdev->ibdev,
- buf_slot->sgt[SMC_SINGLE_LINK].sgl,
- buf_slot->sgt[SMC_SINGLE_LINK].orig_nents,
+ ib_dma_unmap_sg(lnk->smcibdev->ibdev,
+ buf_slot->sgt[lnk->link_idx].sgl,
+ buf_slot->sgt[lnk->link_idx].orig_nents,
data_direction);
- buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
+ buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
@@ -486,6 +837,10 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
int cqe_size_order, smc_order;
long rc;
+ mutex_lock(&smcibdev->mutex);
+ rc = 0;
+ if (smcibdev->initialized)
+ goto out;
/* the calculated number of cq entries fits to mlx5 cq allocation */
cqe_size_order = cache_line_size() == 128 ? 7 : 6;
smc_order = MAX_ORDER - cqe_size_order - 1;
@@ -497,7 +852,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
if (IS_ERR(smcibdev->roce_cq_send)) {
smcibdev->roce_cq_send = NULL;
- return rc;
+ goto out;
}
smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
smc_wr_rx_cq_handler, NULL,
@@ -509,46 +864,94 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
}
smc_wr_add_dev(smcibdev);
smcibdev->initialized = 1;
- return rc;
+ goto out;
err:
ib_destroy_cq(smcibdev->roce_cq_send);
+out:
+ mutex_unlock(&smcibdev->mutex);
return rc;
}
static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
+ mutex_lock(&smcibdev->mutex);
if (!smcibdev->initialized)
- return;
+ goto out;
smcibdev->initialized = 0;
ib_destroy_cq(smcibdev->roce_cq_recv);
ib_destroy_cq(smcibdev->roce_cq_send);
smc_wr_remove_dev(smcibdev);
+out:
+ mutex_unlock(&smcibdev->mutex);
}
static struct ib_client smc_ib_client;
+static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
+{
+ struct ib_device *ibdev = smcibdev->ibdev;
+ struct net_device *ndev;
+
+ if (!ibdev->ops.get_netdev)
+ return;
+ ndev = ibdev->ops.get_netdev(ibdev, port + 1);
+ if (ndev) {
+ smcibdev->ndev_ifidx[port] = ndev->ifindex;
+ dev_put(ndev);
+ }
+}
+
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
+{
+ struct smc_ib_device *smcibdev;
+ struct ib_device *libdev;
+ struct net_device *lndev;
+ u8 port_cnt;
+ int i;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+ port_cnt = smcibdev->ibdev->phys_port_cnt;
+ for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
+ libdev = smcibdev->ibdev;
+ if (!libdev->ops.get_netdev)
+ continue;
+ lndev = libdev->ops.get_netdev(libdev, i + 1);
+ dev_put(lndev);
+ if (lndev != ndev)
+ continue;
+ if (event == NETDEV_REGISTER)
+ smcibdev->ndev_ifidx[i] = ndev->ifindex;
+ if (event == NETDEV_UNREGISTER)
+ smcibdev->ndev_ifidx[i] = 0;
+ }
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
/* callback function for ib_register_client() */
-static void smc_ib_add_dev(struct ib_device *ibdev)
+static int smc_ib_add_dev(struct ib_device *ibdev)
{
struct smc_ib_device *smcibdev;
u8 port_cnt;
int i;
if (ibdev->node_type != RDMA_NODE_IB_CA)
- return;
+ return -EOPNOTSUPP;
smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
if (!smcibdev)
- return;
+ return -ENOMEM;
smcibdev->ibdev = ibdev;
INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
atomic_set(&smcibdev->lnk_cnt, 0);
init_waitqueue_head(&smcibdev->lnks_deleted);
- spin_lock(&smc_ib_devices.lock);
+ mutex_init(&smcibdev->mutex);
+ mutex_lock(&smc_ib_devices.mutex);
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
smc_ib_global_event_handler);
@@ -556,29 +959,39 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
/* trigger reading of the port attributes */
port_cnt = smcibdev->ibdev->phys_port_cnt;
+ pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
+ smcibdev->ibdev->name, port_cnt);
for (i = 0;
i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
i++) {
set_bit(i, &smcibdev->port_event_mask);
/* determine pnetids of the port */
- smc_pnetid_by_dev_port(ibdev->dev.parent, i,
- smcibdev->pnetid[i]);
+ if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
+ smcibdev->pnetid[i]))
+ smc_pnetid_by_table_ib(smcibdev, i + 1);
+ smc_copy_netdev_ifindex(smcibdev, i);
+ pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
+ "%.16s%s\n",
+ smcibdev->ibdev->name, i + 1,
+ smcibdev->pnetid[i],
+ smcibdev->pnetid_by_user[i] ?
+ " (user defined)" :
+ "");
}
schedule_work(&smcibdev->port_event_work);
+ return 0;
}
/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
- struct smc_ib_device *smcibdev;
+ struct smc_ib_device *smcibdev = client_data;
- smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
- if (!smcibdev || smcibdev->ibdev != ibdev)
- return;
- ib_set_client_data(ibdev, &smc_ib_client, NULL);
- spin_lock(&smc_ib_devices.lock);
+ mutex_lock(&smc_ib_devices.mutex);
list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
+ pr_warn_ratelimited("smc: removing ib device %s\n",
+ smcibdev->ibdev->name);
smc_smcr_terminate_all(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
ib_unregister_event_handler(&smcibdev->event_handler);
@@ -594,6 +1007,7 @@ static struct ib_client smc_ib_client = {
int __init smc_ib_register_client(void)
{
+ smc_ib_init_local_systemid();
return ib_register_client(&smc_ib_client);
}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 255db87547d3..034295676e88 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/if_ether.h>
+#include <linux/mutex.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <net/smc.h>
@@ -25,10 +26,11 @@
struct smc_ib_devices { /* list of smc ib devices definition */
struct list_head list;
- spinlock_t lock; /* protects list of smc ib devices */
+ struct mutex mutex; /* protects list of smc ib devices */
};
extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */
struct smc_ib_device { /* ib-device infos for smc */
struct list_head list;
@@ -51,18 +53,41 @@ struct smc_ib_device { /* ib-device infos for smc */
DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
atomic_t lnk_cnt; /* number of links on ibdev */
wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
+ struct mutex mutex; /* protect dev setup+cleanup */
+ atomic_t lnk_cnt_by_port[SMC_MAX_PORTS];
+ /* number of links per port */
+ int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */
};
+static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE])
+{
+ struct in6_addr *addr6 = (struct in6_addr *)gid;
+
+ if (ipv6_addr_v4mapped(addr6) ||
+ !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2]))
+ return addr6->s6_addr32[3];
+ return cpu_to_be32(INADDR_NONE);
+}
+
+static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev)
+{
+ if (smcibdev && smcibdev->ibdev)
+ return read_pnet(&smcibdev->ibdev->coredev.rdma_net);
+ return NULL;
+}
+
+struct smc_init_info_smcrv2;
struct smc_buf_desc;
struct smc_link;
+void smc_ib_ndev_change(struct net_device *ndev, unsigned long event);
int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void);
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
+int smc_ib_buf_map_sg(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
-void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
+void smc_ib_buf_unmap_sg(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
@@ -72,16 +97,24 @@ int smc_ib_create_queue_pair(struct smc_link *lnk);
int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk);
int smc_ib_modify_qp_reset(struct smc_link *lnk);
+int smc_ib_modify_qp_error(struct smc_link *lnk);
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct smc_buf_desc *buf_slot);
+ struct smc_buf_desc *buf_slot, u8 link_idx);
void smc_ib_put_memory_region(struct ib_mr *mr);
-void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
+bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
+ struct smc_buf_desc *buf_slot);
+void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
-void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
+void smc_ib_sync_sg_for_device(struct smc_link *lnk,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
- unsigned short vlan_id, u8 gid[], u8 *sgid_index);
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+ struct smc_init_info_smcrv2 *smcrv2);
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+ u8 nexthop_mac[], u8 *uses_gateway);
+bool smc_ib_is_valid_local_systemid(void);
+int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 5c4727d5066e..911fe08bc54b 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -6,7 +6,9 @@
* Copyright IBM Corp. 2018
*/
+#include <linux/if_vlan.h>
#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/page.h>
@@ -14,28 +16,40 @@
#include "smc_core.h"
#include "smc_ism.h"
#include "smc_pnet.h"
+#include "smc_netlink.h"
struct smcd_dev_list smcd_dev_list = {
.list = LIST_HEAD_INIT(smcd_dev_list.list),
- .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock)
+ .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex)
};
-/* Test if an ISM communication is possible. */
+static bool smc_ism_v2_capable;
+static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN];
+
+/* Test if an ISM communication is possible - same CPC */
int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
{
return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
vlan_id);
}
-int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
- void *data, size_t len)
+void smc_ism_get_system_eid(u8 **eid)
{
- int rc;
+ if (!smc_ism_v2_capable)
+ *eid = NULL;
+ else
+ *eid = smc_ism_v2_system_eid;
+}
- rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal,
- pos->offset, data, len);
+u16 smc_ism_get_chid(struct smcd_dev *smcd)
+{
+ return smcd->ops->get_chid(smcd);
+}
- return rc < 0 ? rc : 0;
+/* HW supports ISM V2 and thus System EID is defined */
+bool smc_ism_is_v2_capable(void)
+{
+ return smc_ism_v2_capable;
}
/* Set a connection using this DMBE. */
@@ -188,6 +202,97 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
return rc;
}
+static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1];
+ struct smc_pci_dev smc_pci_dev;
+ struct nlattr *port_attrs;
+ struct nlattr *attrs;
+ int use_cnt = 0;
+ void *nlh;
+
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_DEV_SMCD);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD);
+ if (!attrs)
+ goto errout;
+ use_cnt = atomic_read(&smcd->lgr_cnt);
+ if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt))
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0))
+ goto errattr;
+ memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
+ smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev);
+ if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device))
+ goto errattr;
+ if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id))
+ goto errattr;
+
+ port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT);
+ if (!port_attrs)
+ goto errattr;
+ if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user))
+ goto errportattr;
+ memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN);
+ smc_pnet[SMC_MAX_PNETID_LEN] = 0;
+ if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
+ goto errportattr;
+
+ nla_nest_end(skb, port_attrs);
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return 0;
+
+errportattr:
+ nla_nest_cancel(skb, port_attrs);
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ nlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ int snum = cb_ctx->pos[0];
+ struct smcd_dev *smcd;
+ int num = 0;
+
+ mutex_lock(&dev_list->mutex);
+ list_for_each_entry(smcd, &dev_list->list, list) {
+ if (num < snum)
+ goto next;
+ if (smc_nl_handle_smcd_dev(smcd, skb, cb))
+ goto errout;
+next:
+ num++;
+ }
+errout:
+ mutex_unlock(&dev_list->mutex);
+ cb_ctx->pos[0] = num;
+}
+
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb);
+ return skb->len;
+}
+
struct smc_ism_event_work {
struct work_struct work;
struct smcd_dev *smcd;
@@ -291,47 +396,77 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
return NULL;
}
+ smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
+ WQ_MEM_RECLAIM, name);
+ if (!smcd->event_wq) {
+ kfree(smcd->conn);
+ kfree(smcd);
+ return NULL;
+ }
+
smcd->dev.parent = parent;
smcd->dev.release = smcd_release;
device_initialize(&smcd->dev);
dev_set_name(&smcd->dev, name);
smcd->ops = ops;
- smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
+ if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid))
+ smc_pnetid_by_table_smcd(smcd);
spin_lock_init(&smcd->lock);
spin_lock_init(&smcd->lgr_lock);
INIT_LIST_HEAD(&smcd->vlan);
INIT_LIST_HEAD(&smcd->lgr_list);
init_waitqueue_head(&smcd->lgrs_deleted);
- smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
- WQ_MEM_RECLAIM, name);
- if (!smcd->event_wq) {
- kfree(smcd->conn);
- kfree(smcd);
- return NULL;
- }
return smcd;
}
EXPORT_SYMBOL_GPL(smcd_alloc_dev);
int smcd_register_dev(struct smcd_dev *smcd)
{
- spin_lock(&smcd_dev_list.lock);
- list_add_tail(&smcd->list, &smcd_dev_list.list);
- spin_unlock(&smcd_dev_list.lock);
+ int rc;
+
+ mutex_lock(&smcd_dev_list.mutex);
+ if (list_empty(&smcd_dev_list.list)) {
+ u8 *system_eid = NULL;
+
+ system_eid = smcd->ops->get_system_eid();
+ if (system_eid[24] != '0' || system_eid[28] != '0') {
+ smc_ism_v2_capable = true;
+ memcpy(smc_ism_v2_system_eid, system_eid,
+ SMC_MAX_EID_LEN);
+ }
+ }
+ /* sort list: devices without pnetid before devices with pnetid */
+ if (smcd->pnetid[0])
+ list_add_tail(&smcd->list, &smcd_dev_list.list);
+ else
+ list_add(&smcd->list, &smcd_dev_list.list);
+ mutex_unlock(&smcd_dev_list.mutex);
+
+ pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
+ dev_name(&smcd->dev), smcd->pnetid,
+ smcd->pnetid_by_user ? " (user defined)" : "");
+
+ rc = device_add(&smcd->dev);
+ if (rc) {
+ mutex_lock(&smcd_dev_list.mutex);
+ list_del(&smcd->list);
+ mutex_unlock(&smcd_dev_list.mutex);
+ }
- return device_add(&smcd->dev);
+ return rc;
}
EXPORT_SYMBOL_GPL(smcd_register_dev);
void smcd_unregister_dev(struct smcd_dev *smcd)
{
- spin_lock(&smcd_dev_list.lock);
+ pr_warn_ratelimited("smc: removing smcd device %s\n",
+ dev_name(&smcd->dev));
+ mutex_lock(&smcd_dev_list.mutex);
list_del_init(&smcd->list);
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
smcd->going_away = 1;
smc_smcd_terminate_all(smcd);
- flush_workqueue(smcd->event_wq);
destroy_workqueue(smcd->event_wq);
device_del(&smcd->dev);
@@ -373,13 +508,13 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
EXPORT_SYMBOL_GPL(smcd_handle_event);
/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
- * Parameters are smcd device pointer and DMB number. Find the connection and
- * schedule the tasklet for this connection.
+ * Parameters are smcd device pointer, DMB number, and the DMBE bitmask.
+ * Find the connection and schedule the tasklet for this connection.
*
* Context:
* - Function called in IRQ context from ISM device driver IRQ handler.
*/
-void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
+void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask)
{
struct smc_connection *conn = NULL;
unsigned long flags;
@@ -391,3 +526,9 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
spin_unlock_irqrestore(&smcd->lock, flags);
}
EXPORT_SYMBOL_GPL(smcd_handle_irq);
+
+void __init smc_ism_init(void)
+{
+ smc_ism_v2_capable = false;
+ memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN);
+}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index 4da946cbfa29..d6b2db604fe8 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -10,15 +10,17 @@
#define SMCD_ISM_H
#include <linux/uio.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
#include "smc.h"
struct smcd_dev_list { /* List of SMCD devices */
struct list_head list;
- spinlock_t lock; /* Protects list of devices */
+ struct mutex mutex; /* Protects list of devices */
};
-extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
struct smc_ism_vlanid { /* VLAN id set on ISM device */
struct list_head list;
@@ -26,13 +28,6 @@ struct smc_ism_vlanid { /* VLAN id set on ISM device */
refcount_t refcnt; /* Reference count */
};
-struct smc_ism_position { /* ISM device position to write to */
- u64 token; /* Token of DMB */
- u32 offset; /* Offset into DMBE */
- u8 index; /* Index of DMBE */
- u8 signal; /* Generate interrupt on owner side */
-};
-
struct smcd_dev;
int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
@@ -43,7 +38,21 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
struct smc_buf_desc *dmb_desc);
int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
-int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
- void *data, size_t len);
int smc_ism_signal_shutdown(struct smc_link_group *lgr);
+void smc_ism_get_system_eid(u8 **eid);
+u16 smc_ism_get_chid(struct smcd_dev *dev);
+bool smc_ism_is_v2_capable(void);
+void smc_ism_init(void);
+int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
+
+static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok,
+ unsigned int idx, bool sf, unsigned int offset,
+ void *data, size_t len)
+{
+ int rc;
+
+ rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len);
+ return rc < 0 ? rc : 0;
+}
+
#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index a9f6431dd69a..524649d0ab65 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -17,21 +17,30 @@
#include "smc_core.h"
#include "smc_clc.h"
#include "smc_llc.h"
+#include "smc_pnet.h"
#define SMC_LLC_DATA_LEN 40
struct smc_llc_hdr {
struct smc_wr_rx_hdr common;
- u8 length; /* 44 */
-#if defined(__BIG_ENDIAN_BITFIELD)
- u8 reserved:4,
- add_link_rej_rsn:4;
+ union {
+ struct {
+ u8 length; /* 44 */
+ #if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved:4,
+ add_link_rej_rsn:4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 add_link_rej_rsn:4,
- reserved:4;
+ u8 add_link_rej_rsn:4,
+ reserved:4;
#endif
+ };
+ u16 length_v2; /* 44 - 8192*/
+ };
u8 flags;
-};
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03
@@ -58,11 +67,60 @@ struct smc_llc_msg_add_link { /* type 0x02 */
u8 sender_gid[SMC_GID_SIZE];
u8 sender_qp_num[3];
u8 link_num;
- u8 flags2; /* QP mtu */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ qp_mtu : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ reserved3 : 4;
+#endif
u8 initial_psn[3];
u8 reserved[8];
};
+struct smc_llc_msg_add_link_cont_rt {
+ __be32 rmb_key;
+ __be32 rmb_key_new;
+ __be64 rmb_vaddr_new;
+};
+
+struct smc_llc_msg_add_link_v2_ext {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 v2_direct : 1,
+ reserved : 7;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 7,
+ v2_direct : 1;
+#endif
+ u8 reserved2;
+ u8 client_target_gid[SMC_GID_SIZE];
+ u8 reserved3[8];
+ u16 num_rkeys;
+ struct smc_llc_msg_add_link_cont_rt rt[];
+} __packed; /* format defined in
+ * IBM Shared Memory Communications Version 2
+ * (https://www.ibm.com/support/pages/node/6326337)
+ */
+
+struct smc_llc_msg_req_add_link_v2 {
+ struct smc_llc_hdr hd;
+ u8 reserved[20];
+ u8 gid_cnt;
+ u8 reserved2[3];
+ u8 gid[][SMC_GID_SIZE];
+};
+
+#define SMC_LLC_RKEYS_PER_CONT_MSG 2
+
+struct smc_llc_msg_add_link_cont { /* type 0x03 */
+ struct smc_llc_hdr hd;
+ u8 link_num;
+ u8 num_rkeys;
+ u8 reserved2[2];
+ struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG];
+ u8 reserved[4];
+} __packed; /* format defined in RFC7609 */
+
#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40
#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20
@@ -90,7 +148,8 @@ struct smc_rmb_rtoken {
__be64 rmb_vaddr;
} __packed; /* format defined in RFC7609 */
-#define SMC_LLC_RKEYS_PER_MSG 3
+#define SMC_LLC_RKEYS_PER_MSG 3
+#define SMC_LLC_RKEYS_PER_MSG_V2 255
struct smc_llc_msg_confirm_rkey { /* type 0x06 */
struct smc_llc_hdr hd;
@@ -98,13 +157,8 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */
u8 reserved;
};
-struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */
- struct smc_llc_hdr hd;
- u8 num_rkeys;
- struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
-};
-
#define SMC_LLC_DEL_RKEY_MAX 8
+#define SMC_LLC_FLAG_RKEY_RETRY 0x10
#define SMC_LLC_FLAG_RKEY_NEG 0x20
struct smc_llc_msg_delete_rkey { /* type 0x09 */
@@ -116,13 +170,22 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */
u8 reserved2[4];
};
+struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */
+ struct smc_llc_hdr hd;
+ u8 num_rkeys;
+ u8 num_inval_rkeys;
+ u8 reserved[2];
+ __be32 rkey[];
+};
+
union smc_llc_msg {
struct smc_llc_msg_confirm_link confirm_link;
struct smc_llc_msg_add_link add_link;
+ struct smc_llc_msg_req_add_link_v2 req_add_link;
+ struct smc_llc_msg_add_link_cont add_link_cont;
struct smc_llc_msg_del_link delete_link;
struct smc_llc_msg_confirm_rkey confirm_rkey;
- struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont;
struct smc_llc_msg_delete_rkey delete_rkey;
struct smc_llc_msg_test_link test_link;
@@ -134,6 +197,181 @@ union smc_llc_msg {
#define SMC_LLC_FLAG_RESP 0x80
+struct smc_llc_qentry {
+ struct list_head list;
+ struct smc_link *link;
+ union smc_llc_msg msg;
+};
+
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc);
+
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow)
+{
+ struct smc_llc_qentry *qentry = flow->qentry;
+
+ flow->qentry = NULL;
+ return qentry;
+}
+
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow)
+{
+ struct smc_llc_qentry *qentry;
+
+ if (flow->qentry) {
+ qentry = flow->qentry;
+ flow->qentry = NULL;
+ kfree(qentry);
+ }
+}
+
+static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow,
+ struct smc_llc_qentry *qentry)
+{
+ flow->qentry = qentry;
+}
+
+static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
+ struct smc_llc_qentry *qentry)
+{
+ u8 msg_type = qentry->msg.raw.hdr.common.llc_type;
+
+ if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) &&
+ flow_type != msg_type && !lgr->delayed_event) {
+ lgr->delayed_event = qentry;
+ return;
+ }
+ /* drop parallel or already-in-progress llc requests */
+ if (flow_type != msg_type)
+ pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel "
+ "LLC msg: msg %d flow %d role %d\n",
+ SMC_LGR_ID_SIZE, &lgr->id,
+ lgr->net->net_cookie,
+ qentry->msg.raw.hdr.common.type,
+ flow_type, lgr->role);
+ kfree(qentry);
+}
+
+/* try to start a new llc flow, initiated by an incoming llc msg */
+static bool smc_llc_flow_start(struct smc_llc_flow *flow,
+ struct smc_llc_qentry *qentry)
+{
+ struct smc_link_group *lgr = qentry->link->lgr;
+
+ spin_lock_bh(&lgr->llc_flow_lock);
+ if (flow->type) {
+ /* a flow is already active */
+ smc_llc_flow_parallel(lgr, flow->type, qentry);
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return false;
+ }
+ switch (qentry->msg.raw.hdr.common.llc_type) {
+ case SMC_LLC_ADD_LINK:
+ flow->type = SMC_LLC_FLOW_ADD_LINK;
+ break;
+ case SMC_LLC_DELETE_LINK:
+ flow->type = SMC_LLC_FLOW_DEL_LINK;
+ break;
+ case SMC_LLC_CONFIRM_RKEY:
+ case SMC_LLC_DELETE_RKEY:
+ flow->type = SMC_LLC_FLOW_RKEY;
+ break;
+ default:
+ flow->type = SMC_LLC_FLOW_NONE;
+ }
+ smc_llc_flow_qentry_set(flow, qentry);
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return true;
+}
+
+/* start a new local llc flow, wait till current flow finished */
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+ enum smc_llc_flowtype type)
+{
+ enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE;
+ int rc;
+
+ /* all flows except confirm_rkey and delete_rkey are exclusive,
+ * confirm/delete rkey flows can run concurrently (local and remote)
+ */
+ if (type == SMC_LLC_FLOW_RKEY)
+ allowed_remote = SMC_LLC_FLOW_RKEY;
+again:
+ if (list_empty(&lgr->list))
+ return -ENODEV;
+ spin_lock_bh(&lgr->llc_flow_lock);
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+ (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+ lgr->llc_flow_rmt.type == allowed_remote)) {
+ lgr->llc_flow_lcl.type = type;
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ return 0;
+ }
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) ||
+ (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE &&
+ (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE ||
+ lgr->llc_flow_rmt.type == allowed_remote))),
+ SMC_LLC_WAIT_TIME * 10);
+ if (!rc)
+ return -ETIMEDOUT;
+ goto again;
+}
+
+/* finish the current llc flow */
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow)
+{
+ spin_lock_bh(&lgr->llc_flow_lock);
+ memset(flow, 0, sizeof(*flow));
+ flow->type = SMC_LLC_FLOW_NONE;
+ spin_unlock_bh(&lgr->llc_flow_lock);
+ if (!list_empty(&lgr->list) && lgr->delayed_event &&
+ flow == &lgr->llc_flow_lcl)
+ schedule_work(&lgr->llc_event_work);
+ else
+ wake_up(&lgr->llc_flow_waiter);
+}
+
+/* lnk is optional and used for early wakeup when link goes down, useful in
+ * cases where we wait for a response on the link after we sent a request
+ */
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+ struct smc_link *lnk,
+ int time_out, u8 exp_msg)
+{
+ struct smc_llc_flow *flow = &lgr->llc_flow_lcl;
+ u8 rcv_msg;
+
+ wait_event_timeout(lgr->llc_msg_waiter,
+ (flow->qentry ||
+ (lnk && !smc_link_usable(lnk)) ||
+ list_empty(&lgr->list)),
+ time_out);
+ if (!flow->qentry ||
+ (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) {
+ smc_llc_flow_qentry_del(flow);
+ goto out;
+ }
+ rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type;
+ if (exp_msg && rcv_msg != exp_msg) {
+ if (exp_msg == SMC_LLC_ADD_LINK &&
+ rcv_msg == SMC_LLC_DELETE_LINK) {
+ /* flow_start will delay the unexpected msg */
+ smc_llc_flow_start(&lgr->llc_flow_lcl,
+ smc_llc_flow_qentry_clr(flow));
+ return NULL;
+ }
+ pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: "
+ "msg %d exp %d flow %d role %d flags %x\n",
+ SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie,
+ rcv_msg, exp_msg,
+ flow->type, lgr->role,
+ flow->qentry->msg.raw.hdr.flags);
+ smc_llc_flow_qentry_del(flow);
+ }
+out:
+ return flow->qentry;
+}
+
/********************************** send *************************************/
struct smc_llc_tx_pend {
@@ -182,23 +420,48 @@ static int smc_llc_add_pending_send(struct smc_link *link,
return 0;
}
+static int smc_llc_add_pending_send_v2(struct smc_link *link,
+ struct smc_wr_v2_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend);
+ if (rc < 0)
+ return rc;
+ return 0;
+}
+
+static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr,
+ struct smc_link_group *lgr, size_t len)
+{
+ if (lgr->smc_version == SMC_V2) {
+ hdr->common.llc_version = SMC_V2;
+ hdr->length_v2 = len;
+ } else {
+ hdr->common.llc_version = 0;
+ hdr->length = len;
+ }
+}
+
/* high-level API to send LLC confirm link */
int smc_llc_send_confirm_link(struct smc_link *link,
enum smc_llc_reqresp reqresp)
{
- struct smc_link_group *lgr = smc_get_lgr(link);
struct smc_llc_msg_confirm_link *confllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
- return rc;
+ goto put_out;
confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
memset(confllc, 0, sizeof(*confllc));
- confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
- confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+ confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK;
+ smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc));
confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
if (reqresp == SMC_LLC_RESP)
confllc->hd.flags |= SMC_LLC_FLAG_RESP;
@@ -207,35 +470,61 @@ int smc_llc_send_confirm_link(struct smc_link *link,
memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
confllc->link_num = link->link_id;
- memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
- confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */
+ memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE);
+ confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS;
/* send llc message */
rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
return rc;
}
/* send LLC confirm rkey request */
-static int smc_llc_send_confirm_rkey(struct smc_link *link,
+static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
struct smc_buf_desc *rmb_desc)
{
struct smc_llc_msg_confirm_rkey *rkeyllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
- int rc;
+ struct smc_link *link;
+ int i, rc, rtok_ix;
- rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (!smc_wr_tx_link_hold(send_link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend);
if (rc)
- return rc;
+ goto put_out;
rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
memset(rkeyllc, 0, sizeof(*rkeyllc));
- rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
- rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
+ rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY;
+ smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc));
+
+ rtok_ix = 1;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ link = &send_link->lgr->lnk[i];
+ if (smc_link_active(link) && link != send_link) {
+ rkeyllc->rtoken[rtok_ix].link_id = link->link_id;
+ rkeyllc->rtoken[rtok_ix].rmb_key =
+ htonl(rmb_desc->mr[link->link_idx]->rkey);
+ rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ?
+ cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) :
+ cpu_to_be64((u64)sg_dma_address
+ (rmb_desc->sgt[link->link_idx].sgl));
+ rtok_ix++;
+ }
+ }
+ /* rkey of send_link is in rtoken[0] */
+ rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1;
rkeyllc->rtoken[0].rmb_key =
- htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
- (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ htonl(rmb_desc->mr[send_link->link_idx]->rkey);
+ rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ?
+ cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) :
+ cpu_to_be64((u64)sg_dma_address
+ (rmb_desc->sgt[send_link->link_idx].sgl));
/* send llc message */
- rc = smc_wr_tx_send(link, pend);
+ rc = smc_wr_tx_send(send_link, pend);
+put_out:
+ smc_wr_tx_link_put(send_link);
return rc;
}
@@ -248,90 +537,195 @@ static int smc_llc_send_delete_rkey(struct smc_link *link,
struct smc_wr_buf *wr_buf;
int rc;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
- return rc;
+ goto put_out;
rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
memset(rkeyllc, 0, sizeof(*rkeyllc));
- rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
- rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey);
+ rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY;
+ smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc));
rkeyllc->num_rkeys = 1;
- rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey);
/* send llc message */
rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
return rc;
}
-/* prepare an add link message */
-static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
- struct smc_link *link, u8 mac[], u8 gid[],
- enum smc_llc_reqresp reqresp)
+/* return first buffer from any of the next buf lists */
+static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
+ int *buf_lst)
{
- memset(addllc, 0, sizeof(*addllc));
- addllc->hd.common.type = SMC_LLC_ADD_LINK;
- addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
- if (reqresp == SMC_LLC_RESP) {
- addllc->hd.flags |= SMC_LLC_FLAG_RESP;
- /* always reject more links for now */
- addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
- addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+ struct smc_buf_desc *buf_pos;
+
+ while (*buf_lst < SMC_RMBE_SIZES) {
+ buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
+ struct smc_buf_desc, list);
+ if (buf_pos)
+ return buf_pos;
+ (*buf_lst)++;
}
- memcpy(addllc->sender_mac, mac, ETH_ALEN);
- memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+ return NULL;
+}
+
+/* return next rmb from buffer lists */
+static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
+ int *buf_lst,
+ struct smc_buf_desc *buf_pos)
+{
+ struct smc_buf_desc *buf_next;
+
+ if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) {
+ (*buf_lst)++;
+ return _smc_llc_get_next_rmb(lgr, buf_lst);
+ }
+ buf_next = list_next_entry(buf_pos, list);
+ return buf_next;
+}
+
+static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
+ int *buf_lst)
+{
+ *buf_lst = 0;
+ return smc_llc_get_next_rmb(lgr, buf_lst, NULL);
+}
+
+static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext,
+ struct smc_link *link, struct smc_link *link_new)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_buf_desc *buf_pos;
+ int prim_lnk_idx, lnk_idx, i;
+ struct smc_buf_desc *rmb;
+ int len = sizeof(*ext);
+ int buf_lst;
+
+ ext->v2_direct = !lgr->uses_gateway;
+ memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE);
+
+ prim_lnk_idx = link->link_idx;
+ lnk_idx = link_new->link_idx;
+ mutex_lock(&lgr->rmbs_lock);
+ ext->num_rkeys = lgr->conns_num;
+ if (!ext->num_rkeys)
+ goto out;
+ buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+ for (i = 0; i < ext->num_rkeys; i++) {
+ if (!buf_pos)
+ break;
+ rmb = buf_pos;
+ ext->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey);
+ ext->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey);
+ ext->rt[i].rmb_vaddr_new = rmb->is_vm ?
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) :
+ cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
+ buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+ while (buf_pos && !(buf_pos)->used)
+ buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+ }
+ len += i * sizeof(ext->rt[0]);
+out:
+ mutex_unlock(&lgr->rmbs_lock);
+ return len;
}
/* send ADD LINK request or response */
int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+ struct smc_link *link_new,
enum smc_llc_reqresp reqresp)
{
+ struct smc_llc_msg_add_link_v2_ext *ext = NULL;
struct smc_llc_msg_add_link *addllc;
struct smc_wr_tx_pend_priv *pend;
- struct smc_wr_buf *wr_buf;
+ int len = sizeof(*addllc);
int rc;
- rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
- if (rc)
- return rc;
- addllc = (struct smc_llc_msg_add_link *)wr_buf;
- smc_llc_prep_add_link(addllc, link, mac, gid, reqresp);
- /* send llc message */
- rc = smc_wr_tx_send(link, pend);
- return rc;
-}
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ if (link->lgr->smc_version == SMC_V2) {
+ struct smc_wr_v2_buf *wr_buf;
-/* prepare a delete link message */
-static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
- struct smc_link *link,
- enum smc_llc_reqresp reqresp, bool orderly)
-{
- memset(delllc, 0, sizeof(*delllc));
- delllc->hd.common.type = SMC_LLC_DELETE_LINK;
- delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+ rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ addllc = (struct smc_llc_msg_add_link *)wr_buf;
+ ext = (struct smc_llc_msg_add_link_v2_ext *)
+ &wr_buf->raw[sizeof(*addllc)];
+ memset(ext, 0, SMC_WR_TX_SIZE);
+ } else {
+ struct smc_wr_buf *wr_buf;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ addllc = (struct smc_llc_msg_add_link *)wr_buf;
+ }
+
+ memset(addllc, 0, sizeof(*addllc));
+ addllc->hd.common.llc_type = SMC_LLC_ADD_LINK;
if (reqresp == SMC_LLC_RESP)
- delllc->hd.flags |= SMC_LLC_FLAG_RESP;
- /* DEL_LINK_ALL because only 1 link supported */
- delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
- if (orderly)
- delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
- delllc->link_num = link->link_id;
+ addllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(addllc->sender_mac, mac, ETH_ALEN);
+ memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+ if (link_new) {
+ addllc->link_num = link_new->link_id;
+ hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num);
+ hton24(addllc->initial_psn, link_new->psn_initial);
+ if (reqresp == SMC_LLC_REQ)
+ addllc->qp_mtu = link_new->path_mtu;
+ else
+ addllc->qp_mtu = min(link_new->path_mtu,
+ link_new->peer_mtu);
+ }
+ if (ext && link_new)
+ len += smc_llc_fill_ext_v2(ext, link, link_new);
+ smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len);
+ /* send llc message */
+ if (link->lgr->smc_version == SMC_V2)
+ rc = smc_wr_tx_v2_send(link, pend, len);
+ else
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
}
/* send DELETE LINK request or response */
-int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp, bool orderly)
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+ enum smc_llc_reqresp reqresp, bool orderly,
+ u32 reason)
{
struct smc_llc_msg_del_link *delllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
- return rc;
+ goto put_out;
delllc = (struct smc_llc_msg_del_link *)wr_buf;
- smc_llc_prep_delete_link(delllc, link, reqresp, orderly);
+
+ memset(delllc, 0, sizeof(*delllc));
+ delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK;
+ smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc));
+ if (reqresp == SMC_LLC_RESP)
+ delllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ if (orderly)
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ if (link_del_id)
+ delllc->link_num = link_del_id;
+ else
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+ delllc->reason = htonl(reason);
/* send llc message */
rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
return rc;
}
@@ -343,251 +737,1347 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
struct smc_wr_buf *wr_buf;
int rc;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
- return rc;
+ goto put_out;
testllc = (struct smc_llc_msg_test_link *)wr_buf;
memset(testllc, 0, sizeof(*testllc));
- testllc->hd.common.type = SMC_LLC_TEST_LINK;
- testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
+ testllc->hd.common.llc_type = SMC_LLC_TEST_LINK;
+ smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc));
memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
/* send llc message */
rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
return rc;
}
-struct smc_llc_send_work {
- struct work_struct work;
- struct smc_link *link;
- int llclen;
- union smc_llc_msg llcbuf;
-};
-
-/* worker that sends a prepared message */
-static void smc_llc_send_message_work(struct work_struct *work)
+/* schedule an llc send on link, may wait for buffers */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf)
{
- struct smc_llc_send_work *llcwrk = container_of(work,
- struct smc_llc_send_work, work);
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
int rc;
- if (llcwrk->link->state == SMC_LNK_INACTIVE)
- goto out;
- rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend);
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
if (rc)
- goto out;
- memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen);
- smc_wr_tx_send(llcwrk->link, pend);
-out:
- kfree(llcwrk);
+ goto put_out;
+ memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
}
-/* copy llcbuf and schedule an llc send on link */
-static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+/* schedule an llc send on link, may wait for buffers,
+ * and wait for send completion notification.
+ * @return 0 on success
+ */
+static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf)
{
- struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
- if (!wrk)
- return -ENOMEM;
- INIT_WORK(&wrk->work, smc_llc_send_message_work);
- wrk->link = link;
- wrk->llclen = llclen;
- memcpy(&wrk->llcbuf, llcbuf, llclen);
- queue_work(link->llc_wq, &wrk->work);
- return 0;
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
+ rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
}
/********************************* receive ***********************************/
-static void smc_llc_rx_confirm_link(struct smc_link *link,
- struct smc_llc_msg_confirm_link *llc)
+static int smc_llc_alloc_alt_link(struct smc_link_group *lgr,
+ enum smc_lgr_type lgr_new_t)
{
- struct smc_link_group *lgr = smc_get_lgr(link);
- int conf_rc;
+ int i;
+
+ if (lgr->type == SMC_LGR_SYMMETRIC ||
+ (lgr->type != SMC_LGR_SINGLE &&
+ (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)))
+ return -EMLINK;
+
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) {
+ for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--)
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED)
+ return i;
+ } else {
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (lgr->lnk[i].state == SMC_LNK_UNUSED)
+ return i;
+ }
+ return -EMLINK;
+}
- /* RMBE eyecatchers are not supported */
- if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)
- conf_rc = 0;
- else
- conf_rc = ENOTSUPP;
+/* send one add_link_continue msg */
+static int smc_llc_add_link_cont(struct smc_link *link,
+ struct smc_link *link_new, u8 *num_rkeys_todo,
+ int *buf_lst, struct smc_buf_desc **buf_pos)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ int prim_lnk_idx, lnk_idx, i, rc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ struct smc_buf_desc *rmb;
+ u8 n;
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- if (lgr->role == SMC_SERV &&
- link->state == SMC_LNK_ACTIVATING) {
- link->llc_confirm_resp_rc = conf_rc;
- complete(&link->llc_confirm_resp);
+ if (!smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf;
+ memset(addc_llc, 0, sizeof(*addc_llc));
+
+ prim_lnk_idx = link->link_idx;
+ lnk_idx = link_new->link_idx;
+ addc_llc->link_num = link_new->link_id;
+ addc_llc->num_rkeys = *num_rkeys_todo;
+ n = *num_rkeys_todo;
+ for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) {
+ if (!*buf_pos) {
+ addc_llc->num_rkeys = addc_llc->num_rkeys -
+ *num_rkeys_todo;
+ *num_rkeys_todo = 0;
+ break;
}
- } else {
- if (lgr->role == SMC_CLNT &&
- link->state == SMC_LNK_ACTIVATING) {
- link->llc_confirm_rc = conf_rc;
- link->link_id = llc->link_num;
- complete(&link->llc_confirm);
+ rmb = *buf_pos;
+
+ addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey);
+ addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey);
+ addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ?
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) :
+ cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
+
+ (*num_rkeys_todo)--;
+ *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
+ while (*buf_pos && !(*buf_pos)->used)
+ *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
+ }
+ addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT;
+ addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
+ if (lgr->role == SMC_CLNT)
+ addc_llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ rc = smc_wr_tx_send(link, pend);
+put_out:
+ smc_wr_tx_link_put(link);
+ return rc;
+}
+
+static int smc_llc_cli_rkey_exchange(struct smc_link *link,
+ struct smc_link *link_new)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ u8 max, num_rkeys_send, num_rkeys_recv;
+ struct smc_llc_qentry *qentry;
+ struct smc_buf_desc *buf_pos;
+ int buf_lst;
+ int rc = 0;
+ int i;
+
+ mutex_lock(&lgr->rmbs_lock);
+ num_rkeys_send = lgr->conns_num;
+ buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+ do {
+ qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK_CONT);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ break;
}
+ addc_llc = &qentry->msg.add_link_cont;
+ num_rkeys_recv = addc_llc->num_rkeys;
+ max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+ for (i = 0; i < max; i++) {
+ smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+ addc_llc->rt[i].rmb_key,
+ addc_llc->rt[i].rmb_vaddr_new,
+ addc_llc->rt[i].rmb_key_new);
+ num_rkeys_recv--;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
+ &buf_lst, &buf_pos);
+ if (rc)
+ break;
+ } while (num_rkeys_send || num_rkeys_recv);
+
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+}
+
+/* prepare and send an add link reject response */
+static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry)
+{
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
+ qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+ smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr,
+ sizeof(qentry->msg));
+ return smc_llc_send_message(qentry->link, &qentry->msg);
+}
+
+static int smc_llc_cli_conf_link(struct smc_link *link,
+ struct smc_init_info *ini,
+ struct smc_link *link_new,
+ enum smc_lgr_type lgr_new_t)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc = 0;
+
+ /* receive CONFIRM LINK request over RoCE fabric */
+ qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0);
+ if (!qentry) {
+ rc = smc_llc_send_delete_link(link, link_new->link_id,
+ SMC_LLC_REQ, false,
+ SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
+ /* received DELETE_LINK instead */
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return -ENOLINK;
+ }
+ smc_llc_save_peer_uid(qentry);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+ rc = smc_ib_modify_qp_rts(link_new);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ smc_wr_remember_qp_attr(link_new);
+
+ rc = smcr_buf_reg_lgr(link_new);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP);
+ if (rc) {
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ return -ENOLINK;
+ }
+ smc_llc_link_active(link_new);
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
+ smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+ else
+ smcr_lgr_set_type(lgr, lgr_new_t);
+ return 0;
+}
+
+static void smc_llc_save_add_link_rkeys(struct smc_link *link,
+ struct smc_link *link_new)
+{
+ struct smc_llc_msg_add_link_v2_ext *ext;
+ struct smc_link_group *lgr = link->lgr;
+ int max, i;
+
+ ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 +
+ SMC_WR_TX_SIZE);
+ max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
+ mutex_lock(&lgr->rmbs_lock);
+ for (i = 0; i < max; i++) {
+ smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+ ext->rt[i].rmb_key,
+ ext->rt[i].rmb_vaddr_new,
+ ext->rt[i].rmb_key_new);
}
+ mutex_unlock(&lgr->rmbs_lock);
}
-static void smc_llc_rx_add_link(struct smc_link *link,
- struct smc_llc_msg_add_link *llc)
+static void smc_llc_save_add_link_info(struct smc_link *link,
+ struct smc_llc_msg_add_link *add_llc)
{
+ link->peer_qpn = ntoh24(add_llc->sender_qp_num);
+ memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN);
+ link->peer_psn = ntoh24(add_llc->initial_psn);
+ link->peer_mtu = add_llc->qp_mtu;
+}
+
+/* as an SMC client, process an add link request */
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
+{
+ struct smc_llc_msg_add_link *llc = &qentry->msg.add_link;
+ enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_init_info *ini = NULL;
+ struct smc_link *lnk_new = NULL;
+ int lnk_idx, rc = 0;
+
+ if (!llc->qp_mtu)
+ goto out_reject;
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- if (link->state == SMC_LNK_ACTIVATING)
- complete(&link->llc_add_resp);
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = -ENOMEM;
+ goto out_reject;
+ }
+
+ ini->vlan_id = lgr->vlan_id;
+ if (lgr->smc_version == SMC_V2) {
+ ini->check_smcrv2 = true;
+ ini->smcrv2.saddr = lgr->saddr;
+ ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid);
+ }
+ smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+ if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+ (lgr->smc_version == SMC_V2 ||
+ !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) {
+ if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2)
+ goto out_reject;
+ lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
+ }
+ if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini->smcrv2.ib_dev_v2 = link->smcibdev;
+ ini->smcrv2.ib_port_v2 = link->ibport;
+ } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini->ib_dev = link->smcibdev;
+ ini->ib_port = link->ibport;
+ }
+ lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
+ if (lnk_idx < 0)
+ goto out_reject;
+ lnk_new = &lgr->lnk[lnk_idx];
+ rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini);
+ if (rc)
+ goto out_reject;
+ smc_llc_save_add_link_info(lnk_new, llc);
+ lnk_new->link_id = llc->link_num; /* SMC server assigns link id */
+ smc_llc_link_set_uid(lnk_new);
+
+ rc = smc_ib_ready_link(lnk_new);
+ if (rc)
+ goto out_clear_lnk;
+
+ rc = smcr_buf_map_lgr(lnk_new);
+ if (rc)
+ goto out_clear_lnk;
+
+ rc = smc_llc_send_add_link(link,
+ lnk_new->smcibdev->mac[lnk_new->ibport - 1],
+ lnk_new->gid, lnk_new, SMC_LLC_RESP);
+ if (rc)
+ goto out_clear_lnk;
+ if (lgr->smc_version == SMC_V2) {
+ smc_llc_save_add_link_rkeys(link, lnk_new);
} else {
- if (link->state == SMC_LNK_ACTIVATING) {
- complete(&link->llc_add);
- return;
+ rc = smc_llc_cli_rkey_exchange(link, lnk_new);
+ if (rc) {
+ rc = 0;
+ goto out_clear_lnk;
}
+ }
+ rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t);
+ if (!rc)
+ goto out;
+out_clear_lnk:
+ lnk_new->state = SMC_LNK_INACTIVE;
+ smcr_link_clear(lnk_new, false);
+out_reject:
+ smc_llc_cli_add_link_reject(qentry);
+out:
+ kfree(ini);
+ kfree(qentry);
+ return rc;
+}
- if (lgr->role == SMC_SERV) {
- smc_llc_prep_add_link(llc, link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_REQ);
+static void smc_llc_send_request_add_link(struct smc_link *link)
+{
+ struct smc_llc_msg_req_add_link_v2 *llc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_v2_buf *wr_buf;
+ struct smc_gidlist gidlist;
+ int rc, len, i;
+
+ if (!smc_wr_tx_link_hold(link))
+ return;
+ if (link->lgr->type == SMC_LGR_SYMMETRIC ||
+ link->lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+ goto put_out;
+
+ smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid);
+ if (gidlist.len <= 1)
+ goto put_out;
+
+ rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend);
+ if (rc)
+ goto put_out;
+ llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf;
+ memset(llc, 0, SMC_WR_TX_SIZE);
+
+ llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK;
+ for (i = 0; i < gidlist.len; i++)
+ memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0]));
+ llc->gid_cnt = gidlist.len;
+ len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0]));
+ smc_llc_init_msg_hdr(&llc->hd, link->lgr, len);
+ rc = smc_wr_tx_v2_send(link, pend, len);
+ if (!rc)
+ /* set REQ_ADD_LINK flow and wait for response from peer */
+ link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK;
+put_out:
+ smc_wr_tx_link_put(link);
+}
+
+/* as an SMC client, invite server to start the add_link processing */
+static void smc_llc_cli_add_link_invite(struct smc_link *link,
+ struct smc_llc_qentry *qentry)
+{
+ struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_init_info *ini = NULL;
+
+ if (lgr->smc_version == SMC_V2) {
+ smc_llc_send_request_add_link(link);
+ goto out;
+ }
+
+ if (lgr->type == SMC_LGR_SYMMETRIC ||
+ lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+ goto out;
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini)
+ goto out;
- } else {
- smc_llc_prep_add_link(llc, link,
- link->smcibdev->mac[link->ibport - 1],
- link->gid, SMC_LLC_RESP);
+ ini->vlan_id = lgr->vlan_id;
+ smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+ if (!ini->ib_dev)
+ goto out;
+
+ smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1],
+ ini->ib_gid, NULL, SMC_LLC_REQ);
+out:
+ kfree(ini);
+ kfree(qentry);
+}
+
+static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++)
+ if (llc->raw.data[i])
+ return false;
+ return true;
+}
+
+static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
+{
+ if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK &&
+ smc_llc_is_empty_llc_message(llc))
+ return true;
+ return false;
+}
+
+static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
+{
+ struct smc_llc_qentry *qentry;
+
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+
+ mutex_lock(&lgr->llc_conf_mutex);
+ if (smc_llc_is_local_add_link(&qentry->msg))
+ smc_llc_cli_add_link_invite(qentry->link, qentry);
+ else
+ smc_llc_cli_add_link(qentry->link, qentry);
+ mutex_unlock(&lgr->llc_conf_mutex);
+}
+
+static int smc_llc_active_link_count(struct smc_link_group *lgr)
+{
+ int i, link_count = 0;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ link_count++;
+ }
+ return link_count;
+}
+
+/* find the asymmetric link when 3 links are established */
+static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr)
+{
+ int asym_idx = -ENOENT;
+ int i, j, k;
+ bool found;
+
+ /* determine asymmetric link */
+ found = false;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) {
+ if (!smc_link_usable(&lgr->lnk[i]) ||
+ !smc_link_usable(&lgr->lnk[j]))
+ continue;
+ if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid,
+ SMC_GID_SIZE)) {
+ found = true; /* asym_lnk is i or j */
+ break;
+ }
+ }
+ if (found)
+ break;
+ }
+ if (!found)
+ goto out; /* no asymmetric link */
+ for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) {
+ if (!smc_link_usable(&lgr->lnk[k]))
+ continue;
+ if (k != i &&
+ !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid,
+ SMC_GID_SIZE)) {
+ asym_idx = i;
+ break;
+ }
+ if (k != j &&
+ !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid,
+ SMC_GID_SIZE)) {
+ asym_idx = j;
+ break;
}
- smc_llc_send_message(link, llc, sizeof(*llc));
}
+out:
+ return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx];
}
-static void smc_llc_rx_delete_link(struct smc_link *link,
- struct smc_llc_msg_del_link *llc)
+static void smc_llc_delete_asym_link(struct smc_link_group *lgr)
{
- struct smc_link_group *lgr = smc_get_lgr(link);
+ struct smc_link *lnk_new = NULL, *lnk_asym;
+ struct smc_llc_qentry *qentry;
+ int rc;
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- if (lgr->role == SMC_SERV)
- smc_lgr_schedule_free_work_fast(lgr);
- } else {
- smc_lgr_forget(lgr);
- smc_llc_link_deleting(link);
- if (lgr->role == SMC_SERV) {
- /* client asks to delete this link, send request */
- smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true);
- } else {
- /* server requests to delete this link, send response */
- smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true);
+ lnk_asym = smc_llc_find_asym_link(lgr);
+ if (!lnk_asym)
+ return; /* no asymmetric link */
+ if (!smc_link_downing(&lnk_asym->state))
+ return;
+ lnk_new = smc_switch_conns(lgr, lnk_asym, false);
+ smc_wr_tx_wait_no_pending_sends(lnk_asym);
+ if (!lnk_new)
+ goto out_free;
+ /* change flow type from ADD_LINK into DEL_LINK */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK;
+ rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ,
+ true, SMC_LLC_DEL_NO_ASYM_NEEDED);
+ if (rc) {
+ smcr_link_down_cond(lnk_new);
+ goto out_free;
+ }
+ qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_LINK);
+ if (!qentry) {
+ smcr_link_down_cond(lnk_new);
+ goto out_free;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+out_free:
+ smcr_link_clear(lnk_asym, true);
+}
+
+static int smc_llc_srv_rkey_exchange(struct smc_link *link,
+ struct smc_link *link_new)
+{
+ struct smc_llc_msg_add_link_cont *addc_llc;
+ struct smc_link_group *lgr = link->lgr;
+ u8 max, num_rkeys_send, num_rkeys_recv;
+ struct smc_llc_qentry *qentry = NULL;
+ struct smc_buf_desc *buf_pos;
+ int buf_lst;
+ int rc = 0;
+ int i;
+
+ mutex_lock(&lgr->rmbs_lock);
+ num_rkeys_send = lgr->conns_num;
+ buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+ do {
+ smc_llc_add_link_cont(link, link_new, &num_rkeys_send,
+ &buf_lst, &buf_pos);
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_ADD_LINK_CONT);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ goto out;
}
- smc_llc_send_message(link, llc, sizeof(*llc));
- smc_lgr_terminate_sched(lgr);
+ addc_llc = &qentry->msg.add_link_cont;
+ num_rkeys_recv = addc_llc->num_rkeys;
+ max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG);
+ for (i = 0; i < max; i++) {
+ smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+ addc_llc->rt[i].rmb_key,
+ addc_llc->rt[i].rmb_vaddr_new,
+ addc_llc->rt[i].rmb_key_new);
+ num_rkeys_recv--;
+ }
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ } while (num_rkeys_send || num_rkeys_recv);
+out:
+ mutex_unlock(&lgr->rmbs_lock);
+ return rc;
+}
+
+static int smc_llc_srv_conf_link(struct smc_link *link,
+ struct smc_link *link_new,
+ enum smc_lgr_type lgr_new_t)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc;
+
+ /* send CONFIRM LINK request over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ);
+ if (rc)
+ return -ENOLINK;
+ /* receive CONFIRM LINK response over the RoCE fabric */
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
+ if (!qentry ||
+ qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
+ /* send DELETE LINK */
+ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
+ false, SMC_LLC_DEL_LOST_PATH);
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return -ENOLINK;
}
+ smc_llc_save_peer_uid(qentry);
+ smc_llc_link_active(link_new);
+ if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL ||
+ lgr_new_t == SMC_LGR_ASYMMETRIC_PEER)
+ smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx);
+ else
+ smcr_lgr_set_type(lgr, lgr_new_t);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return 0;
}
-static void smc_llc_rx_test_link(struct smc_link *link,
- struct smc_llc_msg_test_link *llc)
+static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry)
{
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- if (link->state == SMC_LNK_ACTIVE)
- complete(&link->llc_testlink_resp);
+ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr,
+ sizeof(qentry->msg));
+ memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data));
+ smc_llc_send_message(qentry->link, &qentry->msg);
+}
+
+int smc_llc_srv_add_link(struct smc_link *link,
+ struct smc_llc_qentry *req_qentry)
+{
+ enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_msg_add_link *add_llc;
+ struct smc_llc_qentry *qentry = NULL;
+ bool send_req_add_link_resp = false;
+ struct smc_link *link_new = NULL;
+ struct smc_init_info *ini = NULL;
+ int lnk_idx, rc = 0;
+
+ if (req_qentry &&
+ req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK)
+ send_req_add_link_resp = true;
+
+ ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+ if (!ini) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* ignore client add link recommendation, start new flow */
+ ini->vlan_id = lgr->vlan_id;
+ if (lgr->smc_version == SMC_V2) {
+ ini->check_smcrv2 = true;
+ ini->smcrv2.saddr = lgr->saddr;
+ if (send_req_add_link_resp) {
+ struct smc_llc_msg_req_add_link_v2 *req_add =
+ &req_qentry->msg.req_add_link;
+
+ ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]);
+ }
+ }
+ smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+ if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini->smcrv2.ib_dev_v2 = link->smcibdev;
+ ini->smcrv2.ib_port_v2 = link->ibport;
+ } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+ ini->ib_dev = link->smcibdev;
+ ini->ib_port = link->ibport;
+ }
+ lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
+ if (lnk_idx < 0) {
+ rc = 0;
+ goto out;
+ }
+
+ rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini);
+ if (rc)
+ goto out;
+ link_new = &lgr->lnk[lnk_idx];
+
+ rc = smcr_buf_map_lgr(link_new);
+ if (rc)
+ goto out_err;
+
+ rc = smc_llc_send_add_link(link,
+ link_new->smcibdev->mac[link_new->ibport-1],
+ link_new->gid, link_new, SMC_LLC_REQ);
+ if (rc)
+ goto out_err;
+ send_req_add_link_resp = false;
+ /* receive ADD LINK response over the RoCE fabric */
+ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK);
+ if (!qentry) {
+ rc = -ETIMEDOUT;
+ goto out_err;
+ }
+ add_llc = &qentry->msg.add_link;
+ if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) {
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ rc = -ENOLINK;
+ goto out_err;
+ }
+ if (lgr->type == SMC_LGR_SINGLE &&
+ (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
+ (lgr->smc_version == SMC_V2 ||
+ !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) {
+ lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
+ }
+ smc_llc_save_add_link_info(link_new, add_llc);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+
+ rc = smc_ib_ready_link(link_new);
+ if (rc)
+ goto out_err;
+ rc = smcr_buf_reg_lgr(link_new);
+ if (rc)
+ goto out_err;
+ if (lgr->smc_version == SMC_V2) {
+ smc_llc_save_add_link_rkeys(link, link_new);
} else {
- llc->hd.flags |= SMC_LLC_FLAG_RESP;
- smc_llc_send_message(link, llc, sizeof(*llc));
+ rc = smc_llc_srv_rkey_exchange(link, link_new);
+ if (rc)
+ goto out_err;
+ }
+ rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t);
+ if (rc)
+ goto out_err;
+ kfree(ini);
+ return 0;
+out_err:
+ if (link_new) {
+ link_new->state = SMC_LNK_INACTIVE;
+ smcr_link_clear(link_new, false);
}
+out:
+ kfree(ini);
+ if (send_req_add_link_resp)
+ smc_llc_send_req_add_link_response(req_qentry);
+ return rc;
}
-static void smc_llc_rx_confirm_rkey(struct smc_link *link,
- struct smc_llc_msg_confirm_rkey *llc)
+static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
{
+ struct smc_link *link = lgr->llc_flow_lcl.qentry->link;
+ struct smc_llc_qentry *qentry;
int rc;
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- link->llc_confirm_rkey_rc = llc->hd.flags &
- SMC_LLC_FLAG_RKEY_NEG;
- complete(&link->llc_confirm_rkey);
- } else {
- rc = smc_rtoken_add(smc_get_lgr(link),
- llc->rtoken[0].rmb_vaddr,
- llc->rtoken[0].rmb_key);
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
- /* ignore rtokens for other links, we have only one link */
+ mutex_lock(&lgr->llc_conf_mutex);
+ rc = smc_llc_srv_add_link(link, qentry);
+ if (!rc && lgr->type == SMC_LGR_SYMMETRIC) {
+ /* delete any asymmetric link */
+ smc_llc_delete_asym_link(lgr);
+ }
+ mutex_unlock(&lgr->llc_conf_mutex);
+ kfree(qentry);
+}
- llc->hd.flags |= SMC_LLC_FLAG_RESP;
- if (rc < 0)
- llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
- smc_llc_send_message(link, llc, sizeof(*llc));
+/* enqueue a local add_link req to trigger a new add_link flow */
+void smc_llc_add_link_local(struct smc_link *link)
+{
+ struct smc_llc_msg_add_link add_llc = {};
+
+ add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK;
+ smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc));
+ /* no dev and port needed */
+ smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc);
+}
+
+/* worker to process an add link message */
+static void smc_llc_add_link_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_add_link_work);
+
+ if (list_empty(&lgr->list)) {
+ /* link group is terminating */
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ goto out;
}
+
+ if (lgr->role == SMC_CLNT)
+ smc_llc_process_cli_add_link(lgr);
+ else
+ smc_llc_process_srv_add_link(lgr);
+out:
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK)
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
}
-static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
- struct smc_llc_msg_confirm_rkey_cont *llc)
+/* enqueue a local del_link msg to trigger a new del_link flow,
+ * called only for role SMC_SERV
+ */
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
{
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- /* unused as long as we don't send this type of msg */
- } else {
- /* ignore rtokens for other links, we have only one link */
- llc->hd.flags |= SMC_LLC_FLAG_RESP;
- smc_llc_send_message(link, llc, sizeof(*llc));
+ struct smc_llc_msg_del_link del_llc = {};
+
+ del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+ smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc));
+ del_llc.link_num = del_link_id;
+ del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH);
+ del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc);
+}
+
+static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk_del = NULL, *lnk_asym, *lnk;
+ struct smc_llc_msg_del_link *del_llc;
+ struct smc_llc_qentry *qentry;
+ int active_links;
+ int lnk_idx;
+
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+ lnk = qentry->link;
+ del_llc = &qentry->msg.delete_link;
+
+ if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+ smc_lgr_terminate_sched(lgr);
+ goto out;
+ }
+ mutex_lock(&lgr->llc_conf_mutex);
+ /* delete single link */
+ for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) {
+ if (lgr->lnk[lnk_idx].link_id != del_llc->link_num)
+ continue;
+ lnk_del = &lgr->lnk[lnk_idx];
+ break;
+ }
+ del_llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ if (!lnk_del) {
+ /* link was not found */
+ del_llc->reason = htonl(SMC_LLC_DEL_NOLNK);
+ smc_llc_send_message(lnk, &qentry->msg);
+ goto out_unlock;
+ }
+ lnk_asym = smc_llc_find_asym_link(lgr);
+
+ del_llc->reason = 0;
+ smc_llc_send_message(lnk, &qentry->msg); /* response */
+
+ if (smc_link_downing(&lnk_del->state))
+ smc_switch_conns(lgr, lnk_del, false);
+ smcr_link_clear(lnk_del, true);
+
+ active_links = smc_llc_active_link_count(lgr);
+ if (lnk_del == lnk_asym) {
+ /* expected deletion of asym link, don't change lgr state */
+ } else if (active_links == 1) {
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ } else if (!active_links) {
+ smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+ smc_lgr_terminate_sched(lgr);
}
+out_unlock:
+ mutex_unlock(&lgr->llc_conf_mutex);
+out:
+ kfree(qentry);
}
-static void smc_llc_rx_delete_rkey(struct smc_link *link,
- struct smc_llc_msg_delete_rkey *llc)
+/* try to send a DELETE LINK ALL request on any active link,
+ * waiting for send completion
+ */
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
{
+ struct smc_llc_msg_del_link delllc = {};
+ int i;
+
+ delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+ smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc));
+ if (ord)
+ delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+ delllc.reason = htonl(rsn);
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_sendable(&lgr->lnk[i]))
+ continue;
+ if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc))
+ break;
+ }
+}
+
+static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_del_link *del_llc;
+ struct smc_link *lnk, *lnk_del;
+ struct smc_llc_qentry *qentry;
+ int active_links;
+ int i;
+
+ mutex_lock(&lgr->llc_conf_mutex);
+ qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
+ lnk = qentry->link;
+ del_llc = &qentry->msg.delete_link;
+
+ if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) {
+ /* delete entire lgr */
+ smc_llc_send_link_delete_all(lgr, true, ntohl(
+ qentry->msg.delete_link.reason));
+ smc_lgr_terminate_sched(lgr);
+ goto out;
+ }
+ /* delete single link */
+ lnk_del = NULL;
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (lgr->lnk[i].link_id == del_llc->link_num) {
+ lnk_del = &lgr->lnk[i];
+ break;
+ }
+ }
+ if (!lnk_del)
+ goto out; /* asymmetric link already deleted */
+
+ if (smc_link_downing(&lnk_del->state)) {
+ if (smc_switch_conns(lgr, lnk_del, false))
+ smc_wr_tx_wait_no_pending_sends(lnk_del);
+ }
+ if (!list_empty(&lgr->list)) {
+ /* qentry is either a request from peer (send it back to
+ * initiate the DELETE_LINK processing), or a locally
+ * enqueued DELETE_LINK request (forward it)
+ */
+ if (!smc_llc_send_message(lnk, &qentry->msg)) {
+ struct smc_llc_qentry *qentry2;
+
+ qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_LINK);
+ if (qentry2)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ }
+ }
+ smcr_link_clear(lnk_del, true);
+
+ active_links = smc_llc_active_link_count(lgr);
+ if (active_links == 1) {
+ smcr_lgr_set_type(lgr, SMC_LGR_SINGLE);
+ } else if (!active_links) {
+ smcr_lgr_set_type(lgr, SMC_LGR_NONE);
+ smc_lgr_terminate_sched(lgr);
+ }
+
+ if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) {
+ /* trigger setup of asymm alt link */
+ smc_llc_add_link_local(lnk);
+ }
+out:
+ mutex_unlock(&lgr->llc_conf_mutex);
+ kfree(qentry);
+}
+
+static void smc_llc_delete_link_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_del_link_work);
+
+ if (list_empty(&lgr->list)) {
+ /* link group is terminating */
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ goto out;
+ }
+
+ if (lgr->role == SMC_CLNT)
+ smc_llc_process_cli_delete_link(lgr);
+ else
+ smc_llc_process_srv_delete_link(lgr);
+out:
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+}
+
+/* process a confirm_rkey request from peer, remote flow */
+static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_confirm_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
+ int num_entries;
+ int rk_idx;
+ int i;
+
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.confirm_rkey;
+ link = qentry->link;
+
+ num_entries = llc->rtoken[0].num_rkeys;
+ if (num_entries > SMC_LLC_RKEYS_PER_MSG)
+ goto out_err;
+ /* first rkey entry is for receiving link */
+ rk_idx = smc_rtoken_add(link,
+ llc->rtoken[0].rmb_vaddr,
+ llc->rtoken[0].rmb_key);
+ if (rk_idx < 0)
+ goto out_err;
+
+ for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++)
+ smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id,
+ llc->rtoken[i].rmb_vaddr,
+ llc->rtoken[i].rmb_key);
+ /* max links is 3 so there is no need to support conf_rkey_cont msgs */
+ goto out;
+out_err:
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY;
+out:
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
+}
+
+/* process a delete_rkey request from peer, remote flow */
+static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
+{
+ struct smc_llc_msg_delete_rkey *llc;
+ struct smc_llc_qentry *qentry;
+ struct smc_link *link;
u8 err_mask = 0;
int i, max;
- if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
- link->llc_delete_rkey_rc = llc->hd.flags &
- SMC_LLC_FLAG_RKEY_NEG;
- complete(&link->llc_delete_rkey);
- } else {
- max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
+ qentry = lgr->llc_flow_rmt.qentry;
+ llc = &qentry->msg.delete_rkey;
+ link = qentry->link;
+
+ if (lgr->smc_version == SMC_V2) {
+ struct smc_llc_msg_delete_rkey_v2 *llcv2;
+
+ memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
+ llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
+ llcv2->num_inval_rkeys = 0;
+
+ max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
for (i = 0; i < max; i++) {
- if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i]))
- err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
+ if (smc_rtoken_delete(link, llcv2->rkey[i]))
+ llcv2->num_inval_rkeys++;
}
-
- if (err_mask) {
+ memset(&llc->rkey[0], 0, sizeof(llc->rkey));
+ memset(&llc->reserved2, 0, sizeof(llc->reserved2));
+ smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
+ if (llcv2->num_inval_rkeys) {
llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
- llc->err_mask = err_mask;
+ llc->err_mask = llcv2->num_inval_rkeys;
}
+ goto finish;
+ }
- llc->hd.flags |= SMC_LLC_FLAG_RESP;
- smc_llc_send_message(link, llc, sizeof(*llc));
+ max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
+ for (i = 0; i < max; i++) {
+ if (smc_rtoken_delete(link, llc->rkey[i]))
+ err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
}
+ if (err_mask) {
+ llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+ llc->err_mask = err_mask;
+ }
+finish:
+ llc->hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, &qentry->msg);
+ smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
}
-static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type)
{
- struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
- union smc_llc_msg *llc = buf;
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: "
+ "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id,
+ lgr->net->net_cookie, type);
+ smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL);
+ smc_lgr_terminate_sched(lgr);
+}
- if (wc->byte_len < sizeof(*llc))
- return; /* short message */
- if (llc->raw.hdr.length != sizeof(*llc))
- return; /* invalid message */
- if (link->state == SMC_LNK_INACTIVE)
- return; /* link not active, drop msg */
+/* flush the llc event queue */
+static void smc_llc_event_flush(struct smc_link_group *lgr)
+{
+ struct smc_llc_qentry *qentry, *q;
+
+ spin_lock_bh(&lgr->llc_event_q_lock);
+ list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) {
+ list_del_init(&qentry->list);
+ kfree(qentry);
+ }
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
+{
+ union smc_llc_msg *llc = &qentry->msg;
+ struct smc_link *link = qentry->link;
+ struct smc_link_group *lgr = link->lgr;
+
+ if (!smc_link_usable(link))
+ goto out;
- switch (llc->raw.hdr.common.type) {
+ switch (llc->raw.hdr.common.llc_type) {
case SMC_LLC_TEST_LINK:
- smc_llc_rx_test_link(link, &llc->test_link);
- break;
- case SMC_LLC_CONFIRM_LINK:
- smc_llc_rx_confirm_link(link, &llc->confirm_link);
+ llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP;
+ smc_llc_send_message(link, llc);
break;
case SMC_LLC_ADD_LINK:
- smc_llc_rx_add_link(link, &llc->add_link);
+ if (list_empty(&lgr->list))
+ goto out; /* lgr is terminating */
+ if (lgr->role == SMC_CLNT) {
+ if (smc_llc_is_local_add_link(llc)) {
+ if (lgr->llc_flow_lcl.type ==
+ SMC_LLC_FLOW_ADD_LINK)
+ break; /* add_link in progress */
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl,
+ qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ }
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+ qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ return;
+ }
+ if (lgr->llc_flow_lcl.type ==
+ SMC_LLC_FLOW_REQ_ADD_LINK) {
+ /* server started add_link processing */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+ qentry);
+ schedule_work(&lgr->llc_add_link_work);
+ return;
+ }
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ /* as smc server, handle client suggestion */
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ case SMC_LLC_CONFIRM_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
+ /* a flow is waiting for this message */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ return;
+ }
break;
case SMC_LLC_DELETE_LINK:
- smc_llc_rx_delete_link(link, &llc->delete_link);
- break;
+ if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK &&
+ !lgr->llc_flow_lcl.qentry) {
+ /* DEL LINK REQ during ADD LINK SEQ */
+ smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry);
+ wake_up(&lgr->llc_msg_waiter);
+ } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ schedule_work(&lgr->llc_del_link_work);
+ }
+ return;
case SMC_LLC_CONFIRM_RKEY:
- smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey);
- break;
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_conf_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
case SMC_LLC_CONFIRM_RKEY_CONT:
- smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont);
+ /* not used because max links is 3, and 3 rkeys fit into
+ * one CONFIRM_RKEY message
+ */
break;
case SMC_LLC_DELETE_RKEY:
- smc_llc_rx_delete_rkey(link, &llc->delete_rkey);
+ /* new request from remote, assign to remote flow */
+ if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) {
+ /* process here, does not wait for more llc msgs */
+ smc_llc_rmt_delete_rkey(lgr);
+ smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
+ }
+ return;
+ case SMC_LLC_REQ_ADD_LINK:
+ /* handle response here, smc_llc_flow_stop() cannot be called
+ * in tasklet context
+ */
+ if (lgr->role == SMC_CLNT &&
+ lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK &&
+ (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) {
+ smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl);
+ } else if (lgr->role == SMC_SERV) {
+ if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+ /* as smc server, handle client suggestion */
+ lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+ schedule_work(&lgr->llc_add_link_work);
+ }
+ return;
+ }
break;
+ default:
+ smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type);
+ break;
+ }
+out:
+ kfree(qentry);
+}
+
+/* worker to process llc messages on the event queue */
+static void smc_llc_event_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ llc_event_work);
+ struct smc_llc_qentry *qentry;
+
+ if (!lgr->llc_flow_lcl.type && lgr->delayed_event) {
+ qentry = lgr->delayed_event;
+ lgr->delayed_event = NULL;
+ if (smc_link_usable(qentry->link))
+ smc_llc_event_handler(qentry);
+ else
+ kfree(qentry);
+ }
+
+again:
+ spin_lock_bh(&lgr->llc_event_q_lock);
+ if (!list_empty(&lgr->llc_event_q)) {
+ qentry = list_first_entry(&lgr->llc_event_q,
+ struct smc_llc_qentry, list);
+ list_del_init(&qentry->list);
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+ smc_llc_event_handler(qentry);
+ goto again;
+ }
+ spin_unlock_bh(&lgr->llc_event_q_lock);
+}
+
+/* process llc responses in tasklet context */
+static void smc_llc_rx_response(struct smc_link *link,
+ struct smc_llc_qentry *qentry)
+{
+ enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
+ struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
+ u8 llc_type = qentry->msg.raw.hdr.common.llc_type;
+
+ switch (llc_type) {
+ case SMC_LLC_TEST_LINK:
+ if (smc_link_active(link))
+ complete(&link->llc_testlink_resp);
+ break;
+ case SMC_LLC_ADD_LINK:
+ case SMC_LLC_ADD_LINK_CONT:
+ case SMC_LLC_CONFIRM_LINK:
+ if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_DELETE_LINK:
+ if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY:
+ case SMC_LLC_DELETE_RKEY:
+ if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry)
+ break; /* drop out-of-flow response */
+ goto assign;
+ case SMC_LLC_CONFIRM_RKEY_CONT:
+ /* not used because max links is 3 */
+ break;
+ default:
+ smc_llc_protocol_violation(link->lgr,
+ qentry->msg.raw.hdr.common.type);
+ break;
+ }
+ kfree(qentry);
+ return;
+assign:
+ /* assign responses to the local flow, we requested them */
+ smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry);
+ wake_up(&link->lgr->llc_msg_waiter);
+}
+
+static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
+{
+ struct smc_link_group *lgr = link->lgr;
+ struct smc_llc_qentry *qentry;
+ unsigned long flags;
+
+ qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC);
+ if (!qentry)
+ return;
+ qentry->link = link;
+ INIT_LIST_HEAD(&qentry->list);
+ memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg));
+
+ /* process responses immediately */
+ if ((llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) &&
+ llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) {
+ smc_llc_rx_response(link, qentry);
+ return;
+ }
+
+ /* add requests to event queue */
+ spin_lock_irqsave(&lgr->llc_event_q_lock, flags);
+ list_add_tail(&qentry->list, &lgr->llc_event_q);
+ spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags);
+ queue_work(system_highpri_wq, &lgr->llc_event_work);
+}
+
+/* copy received msg and add it to the event queue */
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ union smc_llc_msg *llc = buf;
+
+ if (wc->byte_len < sizeof(*llc))
+ return; /* short message */
+ if (!llc->raw.hdr.common.llc_version) {
+ if (llc->raw.hdr.length != sizeof(*llc))
+ return; /* invalid message */
+ } else {
+ if (llc->raw.hdr.length_v2 < sizeof(*llc))
+ return; /* invalid message */
}
+
+ smc_llc_enqueue(link, llc);
}
/***************************** worker, utils *********************************/
@@ -601,7 +2091,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
u8 user_data[16] = { 0 };
int rc;
- if (link->state != SMC_LNK_ACTIVE)
+ if (!smc_link_active(link))
return; /* don't reschedule worker */
expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
if (time_is_after_jiffies(expire_time)) {
@@ -613,112 +2103,164 @@ static void smc_llc_testlink_work(struct work_struct *work)
/* receive TEST LINK response over RoCE fabric */
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
+ if (!smc_link_active(link))
+ return; /* link state changed */
if (rc <= 0) {
- smc_lgr_terminate(smc_get_lgr(link), true);
+ smcr_link_down_cond_sched(link);
return;
}
next_interval = link->llc_testlink_time;
out:
- queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
- next_interval);
+ schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
}
-int smc_llc_link_init(struct smc_link *link)
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
{
- struct smc_link_group *lgr = smc_get_lgr(link);
- link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
- *((u32 *)lgr->id),
- link->link_id);
- if (!link->llc_wq)
- return -ENOMEM;
- init_completion(&link->llc_confirm);
- init_completion(&link->llc_confirm_resp);
- init_completion(&link->llc_add);
- init_completion(&link->llc_add_resp);
- init_completion(&link->llc_confirm_rkey);
- init_completion(&link->llc_delete_rkey);
- mutex_init(&link->llc_delete_rkey_mutex);
- init_completion(&link->llc_testlink_resp);
- INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
- return 0;
+ struct net *net = sock_net(smc->clcsock->sk);
+
+ INIT_WORK(&lgr->llc_event_work, smc_llc_event_work);
+ INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work);
+ INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work);
+ INIT_LIST_HEAD(&lgr->llc_event_q);
+ spin_lock_init(&lgr->llc_event_q_lock);
+ spin_lock_init(&lgr->llc_flow_lock);
+ init_waitqueue_head(&lgr->llc_flow_waiter);
+ init_waitqueue_head(&lgr->llc_msg_waiter);
+ mutex_init(&lgr->llc_conf_mutex);
+ lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time);
}
-void smc_llc_link_active(struct smc_link *link, int testlink_time)
+/* called after lgr was removed from lgr_list */
+void smc_llc_lgr_clear(struct smc_link_group *lgr)
{
- link->state = SMC_LNK_ACTIVE;
- if (testlink_time) {
- link->llc_testlink_time = testlink_time * HZ;
- queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
- link->llc_testlink_time);
+ smc_llc_event_flush(lgr);
+ wake_up_all(&lgr->llc_flow_waiter);
+ wake_up_all(&lgr->llc_msg_waiter);
+ cancel_work_sync(&lgr->llc_event_work);
+ cancel_work_sync(&lgr->llc_add_link_work);
+ cancel_work_sync(&lgr->llc_del_link_work);
+ if (lgr->delayed_event) {
+ kfree(lgr->delayed_event);
+ lgr->delayed_event = NULL;
}
}
-void smc_llc_link_deleting(struct smc_link *link)
+int smc_llc_link_init(struct smc_link *link)
{
- link->state = SMC_LNK_DELETING;
- smc_wr_wakeup_tx_wait(link);
+ init_completion(&link->llc_testlink_resp);
+ INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+ return 0;
}
-/* called in tasklet context */
-void smc_llc_link_inactive(struct smc_link *link)
+void smc_llc_link_active(struct smc_link *link)
{
- link->state = SMC_LNK_INACTIVE;
- cancel_delayed_work(&link->llc_testlink_wrk);
- smc_wr_wakeup_reg_wait(link);
- smc_wr_wakeup_tx_wait(link);
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, "
+ "peerid %*phN, ibdev %s, ibport %d\n",
+ SMC_LGR_ID_SIZE, &link->lgr->id,
+ link->lgr->net->net_cookie,
+ SMC_LGR_ID_SIZE, &link->link_uid,
+ SMC_LGR_ID_SIZE, &link->peer_link_uid,
+ link->smcibdev->ibdev->name, link->ibport);
+ link->state = SMC_LNK_ACTIVE;
+ if (link->lgr->llc_testlink_time) {
+ link->llc_testlink_time = link->lgr->llc_testlink_time;
+ schedule_delayed_work(&link->llc_testlink_wrk,
+ link->llc_testlink_time);
+ }
}
/* called in worker context */
-void smc_llc_link_clear(struct smc_link *link)
+void smc_llc_link_clear(struct smc_link *link, bool log)
{
- flush_workqueue(link->llc_wq);
- destroy_workqueue(link->llc_wq);
+ if (log)
+ pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN"
+ ", peerid %*phN, ibdev %s, ibport %d\n",
+ SMC_LGR_ID_SIZE, &link->lgr->id,
+ link->lgr->net->net_cookie,
+ SMC_LGR_ID_SIZE, &link->link_uid,
+ SMC_LGR_ID_SIZE, &link->peer_link_uid,
+ link->smcibdev->ibdev->name, link->ibport);
+ complete(&link->llc_testlink_resp);
+ cancel_delayed_work_sync(&link->llc_testlink_wrk);
}
-/* register a new rtoken at the remote peer */
-int smc_llc_do_confirm_rkey(struct smc_link *link,
+/* register a new rtoken at the remote peer (for all links) */
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
struct smc_buf_desc *rmb_desc)
{
- int rc;
+ struct smc_link_group *lgr = send_link->lgr;
+ struct smc_llc_qentry *qentry = NULL;
+ int rc = 0;
- /* protected by mutex smc_create_lgr_pending */
- reinit_completion(&link->llc_confirm_rkey);
- rc = smc_llc_send_confirm_rkey(link, rmb_desc);
+ rc = smc_llc_send_confirm_rkey(send_link, rmb_desc);
if (rc)
- return rc;
+ goto out;
/* receive CONFIRM RKEY response from server over RoCE fabric */
- rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey,
- SMC_LLC_WAIT_TIME);
- if (rc <= 0 || link->llc_confirm_rkey_rc)
- return -EFAULT;
- return 0;
+ qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_CONFIRM_RKEY);
+ if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
+ rc = -EFAULT;
+out:
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+ return rc;
}
/* unregister an rtoken at the remote peer */
-int smc_llc_do_delete_rkey(struct smc_link *link,
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
struct smc_buf_desc *rmb_desc)
{
+ struct smc_llc_qentry *qentry = NULL;
+ struct smc_link *send_link;
int rc = 0;
- mutex_lock(&link->llc_delete_rkey_mutex);
- if (link->state != SMC_LNK_ACTIVE)
- goto out;
- reinit_completion(&link->llc_delete_rkey);
- rc = smc_llc_send_delete_rkey(link, rmb_desc);
+ send_link = smc_llc_usable_link(lgr);
+ if (!send_link)
+ return -ENOLINK;
+
+ /* protected by llc_flow control */
+ rc = smc_llc_send_delete_rkey(send_link, rmb_desc);
if (rc)
goto out;
/* receive DELETE RKEY response from server over RoCE fabric */
- rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey,
- SMC_LLC_WAIT_TIME);
- if (rc <= 0 || link->llc_delete_rkey_rc)
+ qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME,
+ SMC_LLC_DELETE_RKEY);
+ if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG))
rc = -EFAULT;
- else
- rc = 0;
out:
- mutex_unlock(&link->llc_delete_rkey_mutex);
+ if (qentry)
+ smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
return rc;
}
+void smc_llc_link_set_uid(struct smc_link *link)
+{
+ __be32 link_uid;
+
+ link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id);
+ memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE);
+}
+
+/* save peers link user id, used for debug purposes */
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry)
+{
+ memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid,
+ SMC_LGR_ID_SIZE);
+}
+
+/* evaluate confirm link request or response */
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+ enum smc_llc_reqresp type)
+{
+ if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */
+ qentry->link->link_id = qentry->msg.confirm_link.link_num;
+ smc_llc_link_set_uid(qentry->link);
+ }
+ if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC))
+ return -ENOTSUPP;
+ return 0;
+}
+
/***************************** init, exit, misc ******************************/
static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
@@ -736,6 +2278,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
},
{
.handler = smc_llc_rx_handler,
+ .type = SMC_LLC_ADD_LINK_CONT
+ },
+ {
+ .handler = smc_llc_rx_handler,
.type = SMC_LLC_DELETE_LINK
},
{
@@ -750,6 +2296,35 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
.handler = smc_llc_rx_handler,
.type = SMC_LLC_DELETE_RKEY
},
+ /* V2 types */
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_LINK_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_TEST_LINK_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_ADD_LINK_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_DELETE_LINK_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_REQ_ADD_LINK_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_RKEY_V2
+ },
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_DELETE_RKEY_V2
+ },
{
.handler = NULL,
}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 461c0c3ef76e..7e7a3162c68b 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -19,6 +19,7 @@
#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
#define SMC_LLC_WAIT_TIME (2 * HZ)
+#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ)
enum smc_llc_reqresp {
SMC_LLC_REQ,
@@ -28,29 +29,92 @@ enum smc_llc_reqresp {
enum smc_llc_msg_type {
SMC_LLC_CONFIRM_LINK = 0x01,
SMC_LLC_ADD_LINK = 0x02,
+ SMC_LLC_ADD_LINK_CONT = 0x03,
SMC_LLC_DELETE_LINK = 0x04,
+ SMC_LLC_REQ_ADD_LINK = 0x05,
SMC_LLC_CONFIRM_RKEY = 0x06,
SMC_LLC_TEST_LINK = 0x07,
SMC_LLC_CONFIRM_RKEY_CONT = 0x08,
SMC_LLC_DELETE_RKEY = 0x09,
+ /* V2 types */
+ SMC_LLC_CONFIRM_LINK_V2 = 0x21,
+ SMC_LLC_ADD_LINK_V2 = 0x22,
+ SMC_LLC_DELETE_LINK_V2 = 0x24,
+ SMC_LLC_REQ_ADD_LINK_V2 = 0x25,
+ SMC_LLC_CONFIRM_RKEY_V2 = 0x26,
+ SMC_LLC_TEST_LINK_V2 = 0x27,
+ SMC_LLC_DELETE_RKEY_V2 = 0x29,
};
+#define smc_link_downing(state) \
+ (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE)
+
+/* LLC DELETE LINK Request Reason Codes */
+#define SMC_LLC_DEL_LOST_PATH 0x00010000
+#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000
+#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000
+#define SMC_LLC_DEL_PROT_VIOL 0x00040000
+#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000
+/* LLC DELETE LINK Response Reason Codes */
+#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */
+#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */
+
+/* returns a usable link of the link group, or NULL */
+static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr)
+{
+ int i;
+
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++)
+ if (smc_link_usable(&lgr->lnk[i]))
+ return &lgr->lnk[i];
+ return NULL;
+}
+
+/* set the termination reason code for the link group */
+static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr,
+ u32 rsn)
+{
+ if (!lgr->llc_termination_rsn)
+ lgr->llc_termination_rsn = rsn;
+}
+
/* transmit */
int smc_llc_send_confirm_link(struct smc_link *lnk,
enum smc_llc_reqresp reqresp);
int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
+ struct smc_link *link_new,
enum smc_llc_reqresp reqresp);
-int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp, bool orderly);
+int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
+ enum smc_llc_reqresp reqresp, bool orderly,
+ u32 reason);
+void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id);
+void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc);
+void smc_llc_lgr_clear(struct smc_link_group *lgr);
int smc_llc_link_init(struct smc_link *link);
-void smc_llc_link_active(struct smc_link *link, int testlink_time);
-void smc_llc_link_deleting(struct smc_link *link);
-void smc_llc_link_inactive(struct smc_link *link);
-void smc_llc_link_clear(struct smc_link *link);
-int smc_llc_do_confirm_rkey(struct smc_link *link,
+void smc_llc_link_active(struct smc_link *link);
+void smc_llc_link_clear(struct smc_link *link, bool log);
+int smc_llc_do_confirm_rkey(struct smc_link *send_link,
struct smc_buf_desc *rmb_desc);
-int smc_llc_do_delete_rkey(struct smc_link *link,
+int smc_llc_do_delete_rkey(struct smc_link_group *lgr,
struct smc_buf_desc *rmb_desc);
+int smc_llc_flow_initiate(struct smc_link_group *lgr,
+ enum smc_llc_flowtype type);
+void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow);
+int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry,
+ enum smc_llc_reqresp type);
+void smc_llc_link_set_uid(struct smc_link *link);
+void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry);
+struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
+ struct smc_link *lnk,
+ int time_out, u8 exp_msg);
+struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow);
+void smc_llc_flow_qentry_del(struct smc_llc_flow *flow);
+void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
+ u32 rsn);
+int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
+int smc_llc_srv_add_link(struct smc_link *link,
+ struct smc_llc_qentry *req_qentry);
+void smc_llc_add_link_local(struct smc_link *link);
int smc_llc_init(void) __init;
#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c
new file mode 100644
index 000000000000..621c46c70073
--- /dev/null
+++ b/net/smc/smc_netlink.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to interact with SMC module
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/if.h>
+#include <linux/smc.h>
+
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_ib.h"
+#include "smc_clc.h"
+#include "smc_stats.h"
+#include "smc_netlink.h"
+
+const struct nla_policy
+smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = {
+ [SMC_NLA_EID_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
+ [SMC_NLA_EID_TABLE_ENTRY] = { .type = NLA_STRING,
+ .len = SMC_MAX_EID_LEN,
+ },
+};
+
+#define SMC_CMD_MAX_ATTR 1
+/* SMC_GENL generic netlink operation definition */
+static const struct genl_ops smc_gen_nl_ops[] = {
+ {
+ .cmd = SMC_NETLINK_GET_SYS_INFO,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_sys_info,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LINK_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_link,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_LGR_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_lgr,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCD,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcd_nl_get_device,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_DEV_SMCR,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smcr_nl_get_device,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_STATS,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_stats,
+ },
+ {
+ .cmd = SMC_NETLINK_GET_FBACK_STATS,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_get_fback_stats,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_UEID,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_ueid,
+ },
+ {
+ .cmd = SMC_NETLINK_ADD_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_add_ueid,
+ .policy = smc_gen_ueid_policy,
+ },
+ {
+ .cmd = SMC_NETLINK_REMOVE_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_remove_ueid,
+ .policy = smc_gen_ueid_policy,
+ },
+ {
+ .cmd = SMC_NETLINK_FLUSH_UEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_flush_ueid,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_SEID,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_SEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_SEID,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_seid,
+ },
+ {
+ .cmd = SMC_NETLINK_DUMP_HS_LIMITATION,
+ /* can be retrieved by unprivileged users */
+ .dumpit = smc_nl_dump_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_enable_hs_limitation,
+ },
+ {
+ .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION,
+ .flags = GENL_ADMIN_PERM,
+ .doit = smc_nl_disable_hs_limitation,
+ },
+};
+
+static const struct nla_policy smc_gen_nl_policy[2] = {
+ [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, },
+};
+
+/* SMC_GENL family definition */
+struct genl_family smc_gen_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = SMC_GENL_FAMILY_NAME,
+ .version = SMC_GENL_FAMILY_VERSION,
+ .maxattr = SMC_CMD_MAX_ATTR,
+ .policy = smc_gen_nl_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_gen_nl_ops,
+ .n_ops = ARRAY_SIZE(smc_gen_nl_ops),
+ .resv_start_op = SMC_NETLINK_DISABLE_HS_LIMITATION + 1,
+};
+
+int __init smc_nl_init(void)
+{
+ return genl_register_family(&smc_gen_nl_family);
+}
+
+void smc_nl_exit(void)
+{
+ genl_unregister_family(&smc_gen_nl_family);
+}
diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h
new file mode 100644
index 000000000000..e8c6c3f0e98c
--- /dev/null
+++ b/net/smc/smc_netlink.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC Generic netlink operations
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Guvenc Gulce <guvenc@linux.ibm.com>
+ */
+
+#ifndef _SMC_NETLINK_H
+#define _SMC_NETLINK_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+extern struct genl_family smc_gen_nl_family;
+
+extern const struct nla_policy smc_gen_ueid_policy[];
+
+struct smc_nl_dmp_ctx {
+ int pos[3];
+};
+
+static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c)
+{
+ return (struct smc_nl_dmp_ctx *)c->ctx;
+}
+
+int smc_nl_init(void) __init;
+void smc_nl_exit(void);
+
+#endif
diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h
index e7a8fc4ae02f..0f4f35aa43ad 100644
--- a/net/smc/smc_netns.h
+++ b/net/smc/smc_netns.h
@@ -16,5 +16,6 @@ extern unsigned int smc_net_id;
/* per-network namespace private data */
struct smc_net {
struct smc_pnettable pnettable;
+ struct smc_pnetids_ndev pnetids_ndev;
};
#endif
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 2a5ed47c3e08..25fb2fd186e2 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/list.h>
#include <linux/ctype.h>
+#include <linux/mutex.h>
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -28,11 +29,10 @@
#include "smc_ism.h"
#include "smc_core.h"
-#define SMC_ASCII_BLANK 32
-
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
static struct net_device *pnet_find_base_ndev(struct net_device *ndev);
-static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
[SMC_PNETID_NAME] = {
.type = NLA_NUL_STRING,
.len = SMC_MAX_PNETID_LEN
@@ -50,39 +50,45 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
static struct genl_family smc_pnet_nl_family;
-/**
- * struct smc_user_pnetentry - pnet identifier name entry for/from user
- * @list: List node.
- * @pnet_name: Pnet identifier name
- * @ndev: pointer to network device.
- * @smcibdev: Pointer to IB device.
- * @ib_port: Port of IB device.
- * @smcd_dev: Pointer to smcd device.
- */
-struct smc_user_pnetentry {
- struct list_head list;
- char pnet_name[SMC_MAX_PNETID_LEN + 1];
- struct net_device *ndev;
- struct smc_ib_device *smcibdev;
- u8 ib_port;
- struct smcd_dev *smcd_dev;
+enum smc_pnet_nametype {
+ SMC_PNET_ETH = 1,
+ SMC_PNET_IB = 2,
};
/* pnet entry stored in pnet table */
struct smc_pnetentry {
struct list_head list;
char pnet_name[SMC_MAX_PNETID_LEN + 1];
- struct net_device *ndev;
+ enum smc_pnet_nametype type;
+ union {
+ struct {
+ char eth_name[IFNAMSIZ + 1];
+ struct net_device *ndev;
+ netdevice_tracker dev_tracker;
+ };
+ struct {
+ char ib_name[IB_DEVICE_NAME_MAX + 1];
+ u8 ib_port;
+ };
+ };
};
+/* Check if the pnetid is set */
+bool smc_pnet_is_pnetid_set(u8 *pnetid)
+{
+ if (pnetid[0] == 0 || pnetid[0] == _S)
+ return false;
+ return true;
+}
+
/* Check if two given pnetids match */
static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
{
int i;
for (i = 0; i < SMC_MAX_PNETID_LEN; i++) {
- if ((pnetid1[i] == 0 || pnetid1[i] == SMC_ASCII_BLANK) &&
- (pnetid2[i] == 0 || pnetid2[i] == SMC_ASCII_BLANK))
+ if ((pnetid1[i] == 0 || pnetid1[i] == _S) &&
+ (pnetid2[i] == 0 || pnetid2[i] == _S))
break;
if (pnetid1[i] != pnetid2[i])
return false;
@@ -106,32 +112,46 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- /* remove netdevices */
- write_lock(&pnettable->lock);
+ /* remove table entry */
+ mutex_lock(&pnettable->lock);
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist,
list) {
if (!pnet_name ||
smc_pnet_match(pnetelem->pnet_name, pnet_name)) {
list_del(&pnetelem->list);
- dev_put(pnetelem->ndev);
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) {
+ netdev_put(pnetelem->ndev,
+ &pnetelem->dev_tracker);
+ pr_warn_ratelimited("smc: net device %s "
+ "erased user defined "
+ "pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ }
kfree(pnetelem);
rc = 0;
}
}
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
/* if this is not the initial namespace, stop here */
if (net != &init_net)
return rc;
/* remove ib devices */
- spin_lock(&smc_ib_devices.lock);
+ mutex_lock(&smc_ib_devices.mutex);
list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) {
if (ibdev->pnetid_by_user[ibport] &&
(!pnet_name ||
smc_pnet_match(pnet_name,
ibdev->pnetid[ibport]))) {
+ pr_warn_ratelimited("smc: ib device %s ibport "
+ "%d erased user defined "
+ "pnetid %.16s\n",
+ ibdev->ibdev->name,
+ ibport + 1,
+ ibdev->pnetid[ibport]);
memset(ibdev->pnetid[ibport], 0,
SMC_MAX_PNETID_LEN);
ibdev->pnetid_by_user[ibport] = false;
@@ -139,25 +159,29 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
}
}
}
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
/* remove smcd devices */
- spin_lock(&smcd_dev_list.lock);
+ mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
if (smcd_dev->pnetid_by_user &&
(!pnet_name ||
smc_pnet_match(pnet_name, smcd_dev->pnetid))) {
+ pr_warn_ratelimited("smc: smcd device %s "
+ "erased user defined pnetid "
+ "%.16s\n", dev_name(&smcd_dev->dev),
+ smcd_dev->pnetid);
memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN);
smcd_dev->pnetid_by_user = false;
rc = 0;
}
}
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
return rc;
}
-/* Remove a pnet entry mentioning a given network device from the pnet table.
+/* Add the reference to a given network device to the pnet table.
*/
-static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+static int smc_pnet_add_by_ndev(struct net_device *ndev)
{
struct smc_pnetentry *pnetelem, *tmp_pe;
struct smc_pnettable *pnettable;
@@ -169,94 +193,87 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev)
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- write_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
- if (pnetelem->ndev == ndev) {
- list_del(&pnetelem->list);
- dev_put(pnetelem->ndev);
- kfree(pnetelem);
+ if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev &&
+ !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) {
+ netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC);
+ pnetelem->ndev = ndev;
rc = 0;
+ pr_warn_ratelimited("smc: adding net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
break;
}
}
- write_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
-/* Append a pnetid to the end of the pnet table if not already on this list.
+/* Remove the reference to a given network device from the pnet table.
*/
-static int smc_pnet_enter(struct smc_pnettable *pnettable,
- struct smc_user_pnetentry *new_pnetelem)
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
{
- u8 pnet_null[SMC_MAX_PNETID_LEN] = {0};
- u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
- struct smc_pnetentry *tmp_pnetelem;
- struct smc_pnetentry *pnetelem;
- bool new_smcddev = false;
- struct net_device *ndev;
- bool new_netdev = true;
- bool new_ibdev = false;
-
- if (new_pnetelem->smcibdev) {
- struct smc_ib_device *ib_dev = new_pnetelem->smcibdev;
- int ib_port = new_pnetelem->ib_port;
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ struct smc_pnettable *pnettable;
+ struct net *net = dev_net(ndev);
+ struct smc_net *sn;
+ int rc = -ENOENT;
- spin_lock(&smc_ib_devices.lock);
- if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) {
- memcpy(ib_dev->pnetid[ib_port - 1],
- new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN);
- ib_dev->pnetid_by_user[ib_port - 1] = true;
- new_ibdev = true;
- }
- spin_unlock(&smc_ib_devices.lock);
- }
- if (new_pnetelem->smcd_dev) {
- struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev;
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
- spin_lock(&smcd_dev_list.lock);
- if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) {
- memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name,
- SMC_MAX_PNETID_LEN);
- smcd_dev->pnetid_by_user = true;
- new_smcddev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) {
+ if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) {
+ netdev_put(pnetelem->ndev, &pnetelem->dev_tracker);
+ pnetelem->ndev = NULL;
+ rc = 0;
+ pr_warn_ratelimited("smc: removing net device %s with "
+ "user defined pnetid %.16s\n",
+ pnetelem->eth_name,
+ pnetelem->pnet_name);
+ break;
}
- spin_unlock(&smcd_dev_list.lock);
}
+ mutex_unlock(&pnettable->lock);
+ return rc;
+}
- if (!new_pnetelem->ndev)
- return (new_ibdev || new_smcddev) ? 0 : -EEXIST;
+/* Apply pnetid to ib device when no pnetid is set.
+ */
+static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
+ char *pnet_name)
+{
+ bool applied = false;
- /* check if (base) netdev already has a pnetid. If there is one, we do
- * not want to add a pnet table entry
- */
- ndev = pnet_find_base_ndev(new_pnetelem->ndev);
- if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
- ndev_pnetid))
- return (new_ibdev || new_smcddev) ? 0 : -EEXIST;
+ mutex_lock(&smc_ib_devices.mutex);
+ if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) {
+ memcpy(ib_dev->pnetid[ib_port - 1], pnet_name,
+ SMC_MAX_PNETID_LEN);
+ ib_dev->pnetid_by_user[ib_port - 1] = true;
+ applied = true;
+ }
+ mutex_unlock(&smc_ib_devices.mutex);
+ return applied;
+}
- /* add a new netdev entry to the pnet table if there isn't one */
- tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
- if (!tmp_pnetelem)
- return -ENOMEM;
- memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name,
- SMC_MAX_PNETID_LEN);
- tmp_pnetelem->ndev = new_pnetelem->ndev;
+/* Apply pnetid to smcd device when no pnetid is set.
+ */
+static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
+{
+ bool applied = false;
- write_lock(&pnettable->lock);
- list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
- if (pnetelem->ndev == new_pnetelem->ndev)
- new_netdev = false;
+ mutex_lock(&smcd_dev_list.mutex);
+ if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) {
+ memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
+ smcd_dev->pnetid_by_user = true;
+ applied = true;
}
- if (new_netdev) {
- dev_hold(tmp_pnetelem->ndev);
- list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist);
- write_unlock(&pnettable->lock);
- } else {
- write_unlock(&pnettable->lock);
- kfree(tmp_pnetelem);
- }
-
- return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST;
+ mutex_unlock(&smcd_dev_list.mutex);
+ return applied;
}
/* The limit for pnetid is 16 characters.
@@ -291,18 +308,19 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
{
struct smc_ib_device *ibdev;
- spin_lock(&smc_ib_devices.lock);
+ mutex_lock(&smc_ib_devices.mutex);
list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
if (!strncmp(ibdev->ibdev->name, ib_name,
sizeof(ibdev->ibdev->name)) ||
- !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
- IB_DEVICE_NAME_MAX - 1)) {
+ (ibdev->ibdev->dev.parent &&
+ !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name,
+ IB_DEVICE_NAME_MAX - 1))) {
goto out;
}
}
ibdev = NULL;
out:
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
return ibdev;
}
@@ -311,7 +329,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
{
struct smcd_dev *smcd_dev;
- spin_lock(&smcd_dev_list.lock);
+ mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
if (!strncmp(dev_name(&smcd_dev->dev), smcd_name,
IB_DEVICE_NAME_MAX - 1))
@@ -319,61 +337,190 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
}
smcd_dev = NULL;
out:
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
return smcd_dev;
}
-/* Parse the supplied netlink attributes and fill a pnetentry structure.
- * For ethernet and infiniband device names verify that the devices exist.
+static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
+ char *eth_name, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct net_device *ndev, *base_ndev;
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ bool new_netdev;
+ int rc;
+
+ /* check if (base) netdev already has a pnetid. If there is one, we do
+ * not want to add a pnet table entry
+ */
+ rc = -EEXIST;
+ ndev = dev_get_by_name(net, eth_name); /* dev_hold() */
+ if (ndev) {
+ base_ndev = pnet_find_base_ndev(ndev);
+ if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
+ base_ndev->dev_port, ndev_pnetid))
+ goto out_put;
+ }
+
+ /* add a new netdev entry to the pnet table if there isn't one */
+ rc = -ENOMEM;
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ goto out_put;
+ new_pe->type = SMC_PNET_ETH;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
+ rc = -EEXIST;
+ new_netdev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_ETH &&
+ !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
+ new_netdev = false;
+ break;
+ }
+ }
+ if (new_netdev) {
+ if (ndev) {
+ new_pe->ndev = ndev;
+ netdev_tracker_alloc(ndev, &new_pe->dev_tracker,
+ GFP_ATOMIC);
+ }
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ goto out_put;
+ }
+ if (ndev)
+ pr_warn_ratelimited("smc: net device %s "
+ "applied user defined pnetid %.16s\n",
+ new_pe->eth_name, new_pe->pnet_name);
+ return 0;
+
+out_put:
+ dev_put(ndev);
+ return rc;
+}
+
+static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
+ u8 ib_port, char *pnet_name)
+{
+ struct smc_pnetentry *tmp_pe, *new_pe;
+ struct smc_ib_device *ib_dev;
+ bool smcddev_applied = true;
+ bool ibdev_applied = true;
+ struct smcd_dev *smcd_dev;
+ bool new_ibdev;
+
+ /* try to apply the pnetid to active devices */
+ ib_dev = smc_pnet_find_ib(ib_name);
+ if (ib_dev) {
+ ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name);
+ if (ibdev_applied)
+ pr_warn_ratelimited("smc: ib device %s ibport %d "
+ "applied user defined pnetid "
+ "%.16s\n", ib_dev->ibdev->name,
+ ib_port,
+ ib_dev->pnetid[ib_port - 1]);
+ }
+ smcd_dev = smc_pnet_find_smcd(ib_name);
+ if (smcd_dev) {
+ smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name);
+ if (smcddev_applied)
+ pr_warn_ratelimited("smc: smcd device %s "
+ "applied user defined pnetid "
+ "%.16s\n", dev_name(&smcd_dev->dev),
+ smcd_dev->pnetid);
+ }
+ /* Apply fails when a device has a hardware-defined pnetid set, do not
+ * add a pnet table entry in that case.
+ */
+ if (!ibdev_applied || !smcddev_applied)
+ return -EEXIST;
+
+ /* add a new ib entry to the pnet table if there isn't one */
+ new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
+ if (!new_pe)
+ return -ENOMEM;
+ new_pe->type = SMC_PNET_IB;
+ memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
+ strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
+ new_pe->ib_port = ib_port;
+
+ new_ibdev = true;
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ new_ibdev = false;
+ break;
+ }
+ }
+ if (new_ibdev) {
+ list_add_tail(&new_pe->list, &pnettable->pnetlist);
+ mutex_unlock(&pnettable->lock);
+ } else {
+ mutex_unlock(&pnettable->lock);
+ kfree(new_pe);
+ }
+ return (new_ibdev) ? 0 : -EEXIST;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
*/
-static int smc_pnet_fill_entry(struct net *net,
- struct smc_user_pnetentry *pnetelem,
- struct nlattr *tb[])
+static int smc_pnet_enter(struct net *net, struct nlattr *tb[])
{
- char *string, *ibname;
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
+ struct smc_pnettable *pnettable;
+ bool new_netdev = false;
+ bool new_ibdev = false;
+ struct smc_net *sn;
+ u8 ibport = 1;
+ char *string;
int rc;
- memset(pnetelem, 0, sizeof(*pnetelem));
- INIT_LIST_HEAD(&pnetelem->list);
+ /* get pnettable for namespace */
+ sn = net_generic(net, smc_net_id);
+ pnettable = &sn->pnettable;
rc = -EINVAL;
if (!tb[SMC_PNETID_NAME])
goto error;
string = (char *)nla_data(tb[SMC_PNETID_NAME]);
- if (!smc_pnetid_valid(string, pnetelem->pnet_name))
+ if (!smc_pnetid_valid(string, pnet_name))
goto error;
- rc = -EINVAL;
if (tb[SMC_PNETID_ETHNAME]) {
string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
- pnetelem->ndev = dev_get_by_name(net, string);
- if (!pnetelem->ndev)
+ rc = smc_pnet_add_eth(pnettable, net, string, pnet_name);
+ if (!rc)
+ new_netdev = true;
+ else if (rc != -EEXIST)
goto error;
}
/* if this is not the initial namespace, stop here */
if (net != &init_net)
- return 0;
+ return new_netdev ? 0 : -EEXIST;
rc = -EINVAL;
if (tb[SMC_PNETID_IBNAME]) {
- ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
- ibname = strim(ibname);
- pnetelem->smcibdev = smc_pnet_find_ib(ibname);
- pnetelem->smcd_dev = smc_pnet_find_smcd(ibname);
- if (!pnetelem->smcibdev && !pnetelem->smcd_dev)
- goto error;
- if (pnetelem->smcibdev) {
- if (!tb[SMC_PNETID_IBPORT])
- goto error;
- pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
- if (pnetelem->ib_port < 1 ||
- pnetelem->ib_port > SMC_MAX_PORTS)
+ string = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+ string = strim(string);
+ if (tb[SMC_PNETID_IBPORT]) {
+ ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+ if (ibport < 1 || ibport > SMC_MAX_PORTS)
goto error;
}
+ rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name);
+ if (!rc)
+ new_ibdev = true;
+ else if (rc != -EEXIST)
+ goto error;
}
-
- return 0;
+ return (new_netdev || new_ibdev) ? 0 : -EEXIST;
error:
return rc;
@@ -381,28 +528,22 @@ error:
/* Convert an smc_pnetentry to a netlink attribute sequence */
static int smc_pnet_set_nla(struct sk_buff *msg,
- struct smc_user_pnetentry *pnetelem)
+ struct smc_pnetentry *pnetelem)
{
if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name))
return -1;
- if (pnetelem->ndev) {
+ if (pnetelem->type == SMC_PNET_ETH) {
if (nla_put_string(msg, SMC_PNETID_ETHNAME,
- pnetelem->ndev->name))
+ pnetelem->eth_name))
return -1;
} else {
if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a"))
return -1;
}
- if (pnetelem->smcibdev) {
- if (nla_put_string(msg, SMC_PNETID_IBNAME,
- dev_name(pnetelem->smcibdev->ibdev->dev.parent)) ||
+ if (pnetelem->type == SMC_PNET_IB) {
+ if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) ||
nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
return -1;
- } else if (pnetelem->smcd_dev) {
- if (nla_put_string(msg, SMC_PNETID_IBNAME,
- dev_name(&pnetelem->smcd_dev->dev)) ||
- nla_put_u8(msg, SMC_PNETID_IBPORT, 1))
- return -1;
} else {
if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") ||
nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff))
@@ -415,21 +556,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg,
static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
- struct smc_user_pnetentry pnetelem;
- struct smc_pnettable *pnettable;
- struct smc_net *sn;
- int rc;
- /* get pnettable for namespace */
- sn = net_generic(net, smc_net_id);
- pnettable = &sn->pnettable;
-
- rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs);
- if (!rc)
- rc = smc_pnet_enter(pnettable, &pnetelem);
- if (pnetelem.ndev)
- dev_put(pnetelem.ndev);
- return rc;
+ return smc_pnet_enter(net, info->attrs);
}
static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
@@ -450,7 +578,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb)
static int smc_pnet_dumpinfo(struct sk_buff *skb,
u32 portid, u32 seq, u32 flags,
- struct smc_user_pnetentry *pnetelem)
+ struct smc_pnetentry *pnetelem)
{
void *hdr;
@@ -469,91 +597,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb,
static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
u32 seq, u8 *pnetid, int start_idx)
{
- struct smc_user_pnetentry tmp_entry;
struct smc_pnettable *pnettable;
struct smc_pnetentry *pnetelem;
- struct smc_ib_device *ibdev;
- struct smcd_dev *smcd_dev;
struct smc_net *sn;
int idx = 0;
- int ibport;
/* get pnettable for namespace */
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- /* dump netdevices */
- read_lock(&pnettable->lock);
+ /* dump pnettable entries */
+ mutex_lock(&pnettable->lock);
list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid))
continue;
if (idx++ < start_idx)
continue;
- memset(&tmp_entry, 0, sizeof(tmp_entry));
- memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name,
- SMC_MAX_PNETID_LEN);
- tmp_entry.ndev = pnetelem->ndev;
+ /* if this is not the initial namespace, dump only netdev */
+ if (net != &init_net && pnetelem->type != SMC_PNET_ETH)
+ continue;
if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI,
- &tmp_entry)) {
+ pnetelem)) {
--idx;
break;
}
}
- read_unlock(&pnettable->lock);
-
- /* if this is not the initial namespace, stop here */
- if (net != &init_net)
- return idx;
-
- /* dump ib devices */
- spin_lock(&smc_ib_devices.lock);
- list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
- for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) {
- if (ibdev->pnetid_by_user[ibport]) {
- if (pnetid &&
- !smc_pnet_match(ibdev->pnetid[ibport],
- pnetid))
- continue;
- if (idx++ < start_idx)
- continue;
- memset(&tmp_entry, 0, sizeof(tmp_entry));
- memcpy(&tmp_entry.pnet_name,
- ibdev->pnetid[ibport],
- SMC_MAX_PNETID_LEN);
- tmp_entry.smcibdev = ibdev;
- tmp_entry.ib_port = ibport + 1;
- if (smc_pnet_dumpinfo(skb, portid, seq,
- NLM_F_MULTI,
- &tmp_entry)) {
- --idx;
- break;
- }
- }
- }
- }
- spin_unlock(&smc_ib_devices.lock);
-
- /* dump smcd devices */
- spin_lock(&smcd_dev_list.lock);
- list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
- if (smcd_dev->pnetid_by_user) {
- if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid))
- continue;
- if (idx++ < start_idx)
- continue;
- memset(&tmp_entry, 0, sizeof(tmp_entry));
- memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid,
- SMC_MAX_PNETID_LEN);
- tmp_entry.smcd_dev = smcd_dev;
- if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI,
- &tmp_entry)) {
- --idx;
- break;
- }
- }
- }
- spin_unlock(&smcd_dev_list.lock);
-
+ mutex_unlock(&pnettable->lock);
return idx;
}
@@ -646,18 +715,140 @@ static struct genl_family smc_pnet_nl_family __ro_after_init = {
.netnsok = true,
.module = THIS_MODULE,
.ops = smc_pnet_ops,
- .n_ops = ARRAY_SIZE(smc_pnet_ops)
+ .n_ops = ARRAY_SIZE(smc_pnet_ops),
+ .resv_start_op = SMC_PNETID_FLUSH + 1,
};
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe;
+ bool rc = false;
+
+ read_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pe, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ rc = true;
+ goto unlock;
+ }
+ }
+
+unlock:
+ read_unlock(&sn->pnetids_ndev.lock);
+ return rc;
+}
+
+static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pi;
+
+ pe = kzalloc(sizeof(*pe), GFP_KERNEL);
+ if (!pe)
+ return -ENOMEM;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry(pi, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ refcount_inc(&pi->refcnt);
+ kfree(pe);
+ goto unlock;
+ }
+ }
+ refcount_set(&pe->refcnt, 1);
+ memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN);
+ list_add_tail(&pe->list, &sn->pnetids_ndev.list);
+
+unlock:
+ write_unlock(&sn->pnetids_ndev.lock);
+ return 0;
+}
+
+static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *pe2;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) {
+ if (smc_pnet_match(pnetid, pe->pnetid)) {
+ if (refcount_dec_and_test(&pe->refcnt)) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ break;
+ }
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
+static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
+ u8 *ndev_pnetid)
+{
+ struct net_device *base_dev;
+
+ base_dev = __pnet_find_base_ndev(dev);
+ if (base_dev->flags & IFF_UP &&
+ !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
+ ndev_pnetid)) {
+ /* add to PNETIDs list */
+ smc_pnet_add_pnetid(net, ndev_pnetid);
+ }
+}
+
+/* create initial list of netdevice pnetids */
+static void smc_pnet_create_pnetids_list(struct net *net)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct net_device *dev;
+
+ rtnl_lock();
+ for_each_netdev(net, dev)
+ smc_pnet_add_base_pnetid(net, dev, ndev_pnetid);
+ rtnl_unlock();
+}
+
+/* clean up list of netdevice pnetids */
+static void smc_pnet_destroy_pnetids_list(struct net *net)
+{
+ struct smc_net *sn = net_generic(net, smc_net_id);
+ struct smc_pnetids_ndev_entry *pe, *temp_pe;
+
+ write_lock(&sn->pnetids_ndev.lock);
+ list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) {
+ list_del(&pe->list);
+ kfree(pe);
+ }
+ write_unlock(&sn->pnetids_ndev.lock);
+}
+
static int smc_pnet_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(event_dev);
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
switch (event) {
case NETDEV_REBOOT:
case NETDEV_UNREGISTER:
smc_pnet_remove_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
+ return NOTIFY_OK;
+ case NETDEV_REGISTER:
+ smc_pnet_add_by_ndev(event_dev);
+ smc_ib_ndev_change(event_dev, event);
+ return NOTIFY_OK;
+ case NETDEV_UP:
+ smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
+ return NOTIFY_OK;
+ case NETDEV_DOWN:
+ event_dev = __pnet_find_base_ndev(event_dev);
+ if (!smc_pnetid_by_dev_port(event_dev->dev.parent,
+ event_dev->dev_port, ndev_pnetid)) {
+ /* remove from PNETIDs list */
+ smc_pnet_remove_pnetid(net, ndev_pnetid);
+ }
return NOTIFY_OK;
default:
return NOTIFY_DONE;
@@ -673,9 +864,17 @@ int smc_pnet_net_init(struct net *net)
{
struct smc_net *sn = net_generic(net, smc_net_id);
struct smc_pnettable *pnettable = &sn->pnettable;
+ struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
INIT_LIST_HEAD(&pnettable->pnetlist);
- rwlock_init(&pnettable->lock);
+ mutex_init(&pnettable->lock);
+ INIT_LIST_HEAD(&pnetids_ndev->list);
+ rwlock_init(&pnetids_ndev->lock);
+
+ smc_pnet_create_pnetids_list(net);
+
+ /* disable handshake limitation by default */
+ net->smc.limit_smc_hs = 0;
return 0;
}
@@ -690,6 +889,7 @@ int __init smc_pnet_init(void)
rc = register_netdevice_notifier(&smc_netdev_notifier);
if (rc)
genl_unregister_family(&smc_pnet_nl_family);
+
return rc;
}
@@ -698,6 +898,7 @@ void smc_pnet_net_exit(struct net *net)
{
/* flush pnet table */
smc_pnet_remove_by_pnetid(net, NULL);
+ smc_pnet_destroy_pnetids_list(net);
}
void smc_pnet_exit(void)
@@ -706,16 +907,11 @@ void smc_pnet_exit(void)
genl_unregister_family(&smc_pnet_nl_family);
}
-/* Determine one base device for stacked net devices.
- * If the lower device level contains more than one devices
- * (for instance with bonding slaves), just the first device
- * is used to reach a base device.
- */
-static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
{
int i, nest_lvl;
- rtnl_lock();
+ ASSERT_RTNL();
nest_lvl = ndev->lower_level;
for (i = 0; i < nest_lvl; i++) {
struct list_head *lower = &ndev->adj_list.lower;
@@ -725,6 +921,18 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
lower = lower->next;
ndev = netdev_lower_get_next(ndev, &lower);
}
+ return ndev;
+}
+
+/* Determine one base device for stacked net devices.
+ * If the lower device level contains more than one devices
+ * (for instance with bonding slaves), just the first device
+ * is used to reach a base device.
+ */
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
+{
+ rtnl_lock();
+ ndev = __pnet_find_base_ndev(ndev);
rtnl_unlock();
return ndev;
}
@@ -742,32 +950,96 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
sn = net_generic(net, smc_net_id);
pnettable = &sn->pnettable;
- read_lock(&pnettable->lock);
+ mutex_lock(&pnettable->lock);
list_for_each_entry(pnetelem, &pnettable->pnetlist, list) {
- if (ndev == pnetelem->ndev) {
+ if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) {
/* get pnetid of netdev device */
memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN);
rc = 0;
break;
}
}
- read_unlock(&pnettable->lock);
+ mutex_unlock(&pnettable->lock);
return rc;
}
+static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i,
+ struct smc_init_info *ini)
+{
+ if (!ini->check_smcrv2 &&
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL,
+ NULL)) {
+ ini->ib_dev = ibdev;
+ ini->ib_port = i;
+ return 0;
+ }
+ if (ini->check_smcrv2 &&
+ !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2,
+ NULL, &ini->smcrv2)) {
+ ini->smcrv2.ib_dev_v2 = ibdev;
+ ini->smcrv2.ib_port_v2 = i;
+ return 0;
+ }
+ return -ENODEV;
+}
+
+/* find a roce device for the given pnetid */
+static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev,
+ struct net *net)
+{
+ struct smc_ib_device *ibdev;
+ int i;
+
+ mutex_lock(&smc_ib_devices.mutex);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (ibdev == known_dev ||
+ !rdma_dev_access_netns(ibdev->ibdev, net))
+ continue;
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) &&
+ smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away)) {
+ if (!smc_pnet_determine_gid(ibdev, i, ini))
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&smc_ib_devices.mutex);
+}
+
+/* find alternate roce device with same pnet_id, vlan_id and net namespace */
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev)
+{
+ struct net *net = lgr->net;
+
+ _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net);
+}
+
/* if handshake network device belongs to a roce device, return its
* IB device and port
*/
static void smc_pnet_find_rdma_dev(struct net_device *netdev,
struct smc_init_info *ini)
{
+ struct net *net = dev_net(netdev);
struct smc_ib_device *ibdev;
- spin_lock(&smc_ib_devices.lock);
+ mutex_lock(&smc_ib_devices.mutex);
list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
struct net_device *ndev;
int i;
+ /* check rdma net namespace */
+ if (!rdma_dev_access_netns(ibdev->ibdev, net))
+ continue;
+
for (i = 1; i <= SMC_MAX_PORTS; i++) {
if (!rdma_is_port_valid(ibdev->ibdev, i))
continue;
@@ -779,16 +1051,13 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
dev_put(ndev);
if (netdev == ndev &&
smc_ib_port_active(ibdev, i) &&
- !test_bit(i - 1, ibdev->ports_going_away) &&
- !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
- ini->ib_gid, NULL)) {
- ini->ib_dev = ibdev;
- ini->ib_port = i;
- break;
+ !test_bit(i - 1, ibdev->ports_going_away)) {
+ if (!smc_pnet_determine_gid(ibdev, i, ini))
+ break;
}
}
}
- spin_unlock(&smc_ib_devices.lock);
+ mutex_unlock(&smc_ib_devices.mutex);
}
/* Determine the corresponding IB device port based on the hardware PNETID.
@@ -801,35 +1070,17 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
struct smc_init_info *ini)
{
u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
- struct smc_ib_device *ibdev;
- int i;
+ struct net *net;
ndev = pnet_find_base_ndev(ndev);
+ net = dev_net(ndev);
if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
ndev_pnetid) &&
smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
smc_pnet_find_rdma_dev(ndev, ini);
return; /* pnetid could not be determined */
}
-
- spin_lock(&smc_ib_devices.lock);
- list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
- for (i = 1; i <= SMC_MAX_PORTS; i++) {
- if (!rdma_is_port_valid(ibdev->ibdev, i))
- continue;
- if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) &&
- smc_ib_port_active(ibdev, i) &&
- !test_bit(i - 1, ibdev->ports_going_away) &&
- !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
- ini->ib_gid, NULL)) {
- ini->ib_dev = ibdev;
- ini->ib_port = i;
- goto out;
- }
- }
- }
-out:
- spin_unlock(&smc_ib_devices.lock);
+ _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net);
}
static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
@@ -844,15 +1095,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid))
return; /* pnetid could not be determined */
- spin_lock(&smcd_dev_list.lock);
+ mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
- !ismdev->going_away) {
- ini->ism_dev = ismdev;
+ !ismdev->going_away &&
+ (!ini->ism_peer_gid[0] ||
+ !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id,
+ ismdev))) {
+ ini->ism_dev[0] = ismdev;
break;
}
}
- spin_unlock(&smcd_dev_list.lock);
+ mutex_unlock(&smcd_dev_list.mutex);
}
/* PNET table analysis for a given sock:
@@ -863,8 +1117,6 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(sk);
- ini->ib_dev = NULL;
- ini->ib_port = 0;
if (!dst)
goto out;
if (!dst->dev)
@@ -882,7 +1134,7 @@ void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
{
struct dst_entry *dst = sk_dst_get(sk);
- ini->ism_dev = NULL;
+ ini->ism_dev[0] = NULL;
if (!dst)
goto out;
if (!dst->dev)
@@ -895,3 +1147,60 @@ out_rel:
out:
return;
}
+
+/* Lookup and apply a pnet table entry to the given ib device.
+ */
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
+{
+ char *ib_name = smcibdev->ibdev->name;
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *tmp_pe;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for init namespace */
+ sn = net_generic(&init_net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) &&
+ tmp_pe->ib_port == ib_port) {
+ smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ return rc;
+}
+
+/* Lookup and apply a pnet table entry to the given smcd device.
+ */
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
+{
+ const char *ib_name = dev_name(&smcddev->dev);
+ struct smc_pnettable *pnettable;
+ struct smc_pnetentry *tmp_pe;
+ struct smc_net *sn;
+ int rc = -ENOENT;
+
+ /* get pnettable for init namespace */
+ sn = net_generic(&init_net, smc_net_id);
+ pnettable = &sn->pnettable;
+
+ mutex_lock(&pnettable->lock);
+ list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
+ if (tmp_pe->type == SMC_PNET_IB &&
+ !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name);
+ rc = 0;
+ break;
+ }
+ }
+ mutex_unlock(&pnettable->lock);
+
+ return rc;
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 4564e4d69c2e..80a88eea4949 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -12,6 +12,8 @@
#ifndef _SMC_PNET_H
#define _SMC_PNET_H
+#include <net/smc.h>
+
#if IS_ENABLED(CONFIG_HAVE_PNETID)
#include <asm/pnet.h>
#endif
@@ -19,6 +21,7 @@
struct smc_ib_device;
struct smcd_dev;
struct smc_init_info;
+struct smc_link_group;
/**
* struct smc_pnettable - SMC PNET table anchor
@@ -26,10 +29,21 @@ struct smc_init_info;
* @pnetlist: List of PNETIDs
*/
struct smc_pnettable {
- rwlock_t lock;
+ struct mutex lock;
struct list_head pnetlist;
};
+struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/
+ struct list_head list;
+ rwlock_t lock;
+};
+
+struct smc_pnetids_ndev_entry {
+ struct list_head list;
+ u8 pnetid[SMC_MAX_PNETID_LEN];
+ refcount_t refcnt;
+};
+
static inline int smc_pnetid_by_dev_port(struct device *dev,
unsigned short port, u8 *pnetid)
{
@@ -46,5 +60,11 @@ void smc_pnet_exit(void);
void smc_pnet_net_exit(struct net *net);
void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini);
void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini);
-
+int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port);
+int smc_pnetid_by_table_smcd(struct smcd_dev *smcd);
+void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
+ struct smc_init_info *ini,
+ struct smc_ib_device *known_dev);
+bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid);
+bool smc_pnet_is_pnetid_set(u8 *pnetid);
#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 39d7b34d06d2..17c5aee7ee4f 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -21,6 +21,8 @@
#include "smc_cdc.h"
#include "smc_tx.h" /* smc_tx_consumer_update() */
#include "smc_rx.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
/* callback implementation to wakeup consumers blocked with smc_rx_wait().
* indirectly called by smc_cdc_msg_recv_action().
@@ -129,16 +131,8 @@ out:
sock_put(sk);
}
-static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- return 1;
-}
-
static const struct pipe_buf_operations smc_pipe_ops = {
- .confirm = generic_pipe_buf_confirm,
.release = smc_rx_pipe_buf_release,
- .steal = smc_rx_pipe_buf_nosteal,
.get = generic_pipe_buf_get
};
@@ -151,35 +145,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd,
static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
struct smc_sock *smc)
{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ int offset = offset_in_page(src);
+ struct partial_page *partial;
struct splice_pipe_desc spd;
- struct partial_page partial;
- struct smc_spd_priv *priv;
- int bytes;
+ struct smc_spd_priv **priv;
+ struct page **pages;
+ int bytes, nr_pages;
+ int i;
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ?
+ PAGE_ALIGN(len + offset) / PAGE_SIZE : 1;
+
+ pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ goto out;
+ partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL);
+ if (!partial)
+ goto out_page;
+ priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL);
if (!priv)
- return -ENOMEM;
- priv->len = len;
- priv->smc = smc;
- partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
- partial.len = len;
- partial.private = (unsigned long)priv;
-
- spd.nr_pages_max = 1;
- spd.nr_pages = 1;
- spd.pages = &smc->conn.rmb_desc->pages;
- spd.partial = &partial;
+ goto out_part;
+ for (i = 0; i < nr_pages; i++) {
+ priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL);
+ if (!priv[i])
+ goto out_priv;
+ }
+
+ if (lgr->is_smcd ||
+ (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) {
+ /* smcd or smcr that uses physically contiguous RMBs */
+ priv[0]->len = len;
+ priv[0]->smc = smc;
+ partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+ partial[0].len = len;
+ partial[0].private = (unsigned long)priv[0];
+ pages[0] = smc->conn.rmb_desc->pages;
+ } else {
+ int size, left = len;
+ void *buf = src;
+ /* smcr that uses virtually contiguous RMBs*/
+ for (i = 0; i < nr_pages; i++) {
+ size = min_t(int, PAGE_SIZE - offset, left);
+ priv[i]->len = size;
+ priv[i]->smc = smc;
+ pages[i] = vmalloc_to_page(buf);
+ partial[i].offset = offset;
+ partial[i].len = size;
+ partial[i].private = (unsigned long)priv[i];
+ buf += size / sizeof(*buf);
+ left -= size;
+ offset = 0;
+ }
+ }
+ spd.nr_pages_max = nr_pages;
+ spd.nr_pages = nr_pages;
+ spd.pages = pages;
+ spd.partial = partial;
spd.ops = &smc_pipe_ops;
spd.spd_release = smc_rx_spd_release;
bytes = splice_to_pipe(pipe, &spd);
if (bytes > 0) {
sock_hold(&smc->sk);
- get_page(smc->conn.rmb_desc->pages);
+ if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) {
+ for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++)
+ get_page(pages[i]);
+ } else {
+ get_page(smc->conn.rmb_desc->pages);
+ }
atomic_add(bytes, &smc->conn.splice_pending);
}
+ kfree(priv);
+ kfree(partial);
+ kfree(pages);
return bytes;
+
+out_priv:
+ for (i = (i - 1); i >= 0; i--)
+ kfree(priv[i]);
+ kfree(priv);
+out_part:
+ kfree(partial);
+out_page:
+ kfree(pages);
+out:
+ return -ENOMEM;
}
static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
@@ -235,6 +287,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
conn->urg_state == SMC_URG_READ)
return -EINVAL;
+ SMC_STAT_INC(smc, urg_data_cnt);
if (conn->urg_state == SMC_URG_VALID) {
if (!(flags & MSG_PEEK))
smc->conn.urg_state = SMC_URG_READ;
@@ -311,6 +364,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ readable = atomic_read(&conn->bytes_to_rcv);
+ if (readable >= conn->rmb_desc->len)
+ SMC_STAT_RMB_RX_FULL(smc, !conn->lnk);
+
+ if (len < readable)
+ SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk);
/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
@@ -354,12 +413,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
}
break;
}
+ if (!timeo)
+ return -EAGAIN;
if (signal_pending(current)) {
read_done = sock_intr_errno(timeo);
break;
}
- if (!timeo)
- return -EAGAIN;
}
if (!smc_rx_data_available(conn)) {
@@ -412,7 +471,6 @@ copy:
if (rc < 0) {
if (!read_done)
read_done = -EFAULT;
- smc_rmb_sync_sg_for_device(conn);
goto out;
}
}
@@ -426,7 +484,6 @@ copy:
chunk_len_sum += chunk_len;
chunk_off = 0; /* modulo offset in recv ring buffer */
}
- smc_rmb_sync_sg_for_device(conn);
/* update cursors */
if (!(flags & MSG_PEEK)) {
@@ -438,6 +495,8 @@ copy:
if (msg && smc_rx_update_consumer(smc, cons, copylen))
goto out;
}
+
+ trace_smc_rx_recvmsg(smc, copylen);
} while (read_remaining);
out:
return read_done;
diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c
new file mode 100644
index 000000000000..e80e34f7ac15
--- /dev/null
+++ b/net/smc/smc_stats.c
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * SMC statistics netlink routines
+ *
+ * Copyright IBM Corp. 2021
+ *
+ * Author(s): Guvenc Gulce
+ */
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/smc.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include "smc_netlink.h"
+#include "smc_stats.h"
+
+int smc_stats_init(struct net *net)
+{
+ net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL);
+ if (!net->smc.fback_rsn)
+ goto err_fback;
+ net->smc.smc_stats = alloc_percpu(struct smc_stats);
+ if (!net->smc.smc_stats)
+ goto err_stats;
+ mutex_init(&net->smc.mutex_fback_rsn);
+ return 0;
+
+err_stats:
+ kfree(net->smc.fback_rsn);
+err_fback:
+ return -ENOMEM;
+}
+
+void smc_stats_exit(struct net *net)
+{
+ kfree(net->smc.fback_rsn);
+ if (net->smc.smc_stats)
+ free_percpu(net->smc.smc_stats);
+}
+
+static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb,
+ struct smc_stats *stats, int tech,
+ int type)
+{
+ struct smc_stats_rmbcnt *stats_rmb_cnt;
+ struct nlattr *attrs;
+
+ if (type == SMC_NLA_STATS_T_TX_RMB_STATS)
+ stats_rmb_cnt = &stats->smc[tech].rmb_tx;
+ else
+ stats_rmb_cnt = &stats->smc[tech].rmb_rx;
+
+ attrs = nla_nest_start(skb, type);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT,
+ stats_rmb_cnt->reuse_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT,
+ stats_rmb_cnt->buf_size_small_peer_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT,
+ stats_rmb_cnt->buf_size_small_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT,
+ stats_rmb_cnt->buf_full_peer_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT,
+ stats_rmb_cnt->buf_full_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT,
+ stats_rmb_cnt->alloc_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT,
+ stats_rmb_cnt->dgrade_cnt,
+ SMC_NLA_STATS_RMB_PAD))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb,
+ struct smc_stats *stats, int tech,
+ int type)
+{
+ struct smc_stats_memsize *stats_pload;
+ struct nlattr *attrs;
+
+ if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE)
+ stats_pload = &stats->smc[tech].tx_pd;
+ else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE)
+ stats_pload = &stats->smc[tech].rx_pd;
+ else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE)
+ stats_pload = &stats->smc[tech].tx_rmbsize;
+ else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE)
+ stats_pload = &stats->smc[tech].rx_rmbsize;
+ else
+ goto errout;
+
+ attrs = nla_nest_start(skb, type);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K,
+ stats_pload->buf[SMC_BUF_8K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K,
+ stats_pload->buf[SMC_BUF_16K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K,
+ stats_pload->buf[SMC_BUF_32K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K,
+ stats_pload->buf[SMC_BUF_64K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K,
+ stats_pload->buf[SMC_BUF_128K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K,
+ stats_pload->buf[SMC_BUF_256K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K,
+ stats_pload->buf[SMC_BUF_512K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K,
+ stats_pload->buf[SMC_BUF_1024K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K,
+ stats_pload->buf[SMC_BUF_G_1024K],
+ SMC_NLA_STATS_PLOAD_PAD))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+static int smc_nl_fill_stats_tech_data(struct sk_buff *skb,
+ struct smc_stats *stats, int tech)
+{
+ struct smc_stats_tech *smc_tech;
+ struct nlattr *attrs;
+
+ smc_tech = &stats->smc[tech];
+ if (tech == SMC_TYPE_D)
+ attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH);
+ else
+ attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH);
+
+ if (!attrs)
+ goto errout;
+ if (smc_nl_fill_stats_rmb_data(skb, stats, tech,
+ SMC_NLA_STATS_T_TX_RMB_STATS))
+ goto errattr;
+ if (smc_nl_fill_stats_rmb_data(skb, stats, tech,
+ SMC_NLA_STATS_T_RX_RMB_STATS))
+ goto errattr;
+ if (smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+ SMC_NLA_STATS_T_TXPLOAD_SIZE))
+ goto errattr;
+ if (smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+ SMC_NLA_STATS_T_RXPLOAD_SIZE))
+ goto errattr;
+ if (smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+ SMC_NLA_STATS_T_TX_RMB_SIZE))
+ goto errattr;
+ if (smc_nl_fill_stats_bufsize_data(skb, stats, tech,
+ SMC_NLA_STATS_T_RX_RMB_SIZE))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC,
+ smc_tech->clnt_v1_succ_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC,
+ smc_tech->clnt_v2_succ_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC,
+ smc_tech->srv_v1_succ_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC,
+ smc_tech->srv_v2_succ_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES,
+ smc_tech->rx_bytes,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES,
+ smc_tech->tx_bytes,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT,
+ smc_tech->rx_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT,
+ smc_tech->tx_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT,
+ smc_tech->sendpage_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT,
+ smc_tech->cork_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT,
+ smc_tech->ndly_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT,
+ smc_tech->splice_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT,
+ smc_tech->urg_data_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ return 0;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ return -EMSGSIZE;
+}
+
+int smc_nl_get_stats(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct smc_stats *stats;
+ struct nlattr *attrs;
+ int cpu, i, size;
+ void *nlh;
+ u64 *src;
+ u64 *sum;
+
+ if (cb_ctx->pos[0])
+ goto errmsg;
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_STATS);
+ if (!nlh)
+ goto errmsg;
+
+ attrs = nla_nest_start(skb, SMC_GEN_STATS);
+ if (!attrs)
+ goto errnest;
+ stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+ if (!stats)
+ goto erralloc;
+ size = sizeof(*stats) / sizeof(u64);
+ for_each_possible_cpu(cpu) {
+ src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu);
+ sum = (u64 *)stats;
+ for (i = 0; i < size; i++)
+ *(sum++) += *(src++);
+ }
+ if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D))
+ goto errattr;
+ if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT,
+ stats->clnt_hshake_err_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT,
+ stats->srv_hshake_err_cnt,
+ SMC_NLA_STATS_PAD))
+ goto errattr;
+
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ cb_ctx->pos[0] = 1;
+ kfree(stats);
+ return skb->len;
+
+errattr:
+ kfree(stats);
+erralloc:
+ nla_nest_cancel(skb, attrs);
+errnest:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return skb->len;
+}
+
+static int smc_nl_get_fback_details(struct sk_buff *skb,
+ struct netlink_callback *cb, int pos,
+ bool is_srv)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int cnt_reported = cb_ctx->pos[2];
+ struct smc_stats_fback *trgt_arr;
+ struct nlattr *attrs;
+ int rc = 0;
+ void *nlh;
+
+ if (is_srv)
+ trgt_arr = &net->smc.fback_rsn->srv[0];
+ else
+ trgt_arr = &net->smc.fback_rsn->clnt[0];
+ if (!trgt_arr[pos].fback_code)
+ return -ENODATA;
+ nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &smc_gen_nl_family, NLM_F_MULTI,
+ SMC_NETLINK_GET_FBACK_STATS);
+ if (!nlh)
+ goto errmsg;
+ attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS);
+ if (!attrs)
+ goto errout;
+ if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv))
+ goto errattr;
+ if (!cnt_reported) {
+ if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT,
+ net->smc.fback_rsn->srv_fback_cnt,
+ SMC_NLA_FBACK_STATS_PAD))
+ goto errattr;
+ if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT,
+ net->smc.fback_rsn->clnt_fback_cnt,
+ SMC_NLA_FBACK_STATS_PAD))
+ goto errattr;
+ cnt_reported = 1;
+ }
+
+ if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE,
+ trgt_arr[pos].fback_code))
+ goto errattr;
+ if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT,
+ trgt_arr[pos].count))
+ goto errattr;
+
+ cb_ctx->pos[2] = cnt_reported;
+ nla_nest_end(skb, attrs);
+ genlmsg_end(skb, nlh);
+ return rc;
+
+errattr:
+ nla_nest_cancel(skb, attrs);
+errout:
+ genlmsg_cancel(skb, nlh);
+errmsg:
+ return -EMSGSIZE;
+}
+
+int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int rc_srv = 0, rc_clnt = 0, k;
+ int skip_serv = cb_ctx->pos[1];
+ int snum = cb_ctx->pos[0];
+ bool is_srv = true;
+
+ mutex_lock(&net->smc.mutex_fback_rsn);
+ for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) {
+ if (k < snum)
+ continue;
+ if (!skip_serv) {
+ rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv);
+ if (rc_srv && rc_srv != -ENODATA)
+ break;
+ } else {
+ skip_serv = 0;
+ }
+ rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv);
+ if (rc_clnt && rc_clnt != -ENODATA) {
+ skip_serv = 1;
+ break;
+ }
+ if (rc_clnt == -ENODATA && rc_srv == -ENODATA)
+ break;
+ }
+ mutex_unlock(&net->smc.mutex_fback_rsn);
+ cb_ctx->pos[1] = skip_serv;
+ cb_ctx->pos[0] = k;
+ return skb->len;
+}
diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h
new file mode 100644
index 000000000000..84b7ecd8c05c
--- /dev/null
+++ b/net/smc/smc_stats.h
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Macros for SMC statistics
+ *
+ * Copyright IBM Corp. 2021
+ *
+ * Author(s): Guvenc Gulce
+ */
+
+#ifndef NET_SMC_SMC_STATS_H_
+#define NET_SMC_SMC_STATS_H_
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/smc.h>
+
+#include "smc_clc.h"
+
+#define SMC_MAX_FBACK_RSN_CNT 30
+
+enum {
+ SMC_BUF_8K,
+ SMC_BUF_16K,
+ SMC_BUF_32K,
+ SMC_BUF_64K,
+ SMC_BUF_128K,
+ SMC_BUF_256K,
+ SMC_BUF_512K,
+ SMC_BUF_1024K,
+ SMC_BUF_G_1024K,
+ SMC_BUF_MAX,
+};
+
+struct smc_stats_fback {
+ int fback_code;
+ u16 count;
+};
+
+struct smc_stats_rsn {
+ struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT];
+ struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT];
+ u64 srv_fback_cnt;
+ u64 clnt_fback_cnt;
+};
+
+struct smc_stats_rmbcnt {
+ u64 buf_size_small_peer_cnt;
+ u64 buf_size_small_cnt;
+ u64 buf_full_peer_cnt;
+ u64 buf_full_cnt;
+ u64 reuse_cnt;
+ u64 alloc_cnt;
+ u64 dgrade_cnt;
+};
+
+struct smc_stats_memsize {
+ u64 buf[SMC_BUF_MAX];
+};
+
+struct smc_stats_tech {
+ struct smc_stats_memsize tx_rmbsize;
+ struct smc_stats_memsize rx_rmbsize;
+ struct smc_stats_memsize tx_pd;
+ struct smc_stats_memsize rx_pd;
+ struct smc_stats_rmbcnt rmb_tx;
+ struct smc_stats_rmbcnt rmb_rx;
+ u64 clnt_v1_succ_cnt;
+ u64 clnt_v2_succ_cnt;
+ u64 srv_v1_succ_cnt;
+ u64 srv_v2_succ_cnt;
+ u64 sendpage_cnt;
+ u64 urg_data_cnt;
+ u64 splice_cnt;
+ u64 cork_cnt;
+ u64 ndly_cnt;
+ u64 rx_bytes;
+ u64 tx_bytes;
+ u64 rx_cnt;
+ u64 tx_cnt;
+};
+
+struct smc_stats {
+ struct smc_stats_tech smc[2];
+ u64 clnt_hshake_err_cnt;
+ u64 srv_hshake_err_cnt;
+};
+
+#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \
+do { \
+ typeof(_smc_stats) stats = (_smc_stats); \
+ typeof(_tech) t = (_tech); \
+ typeof(_len) l = (_len); \
+ int _pos = fls64((l) >> 13); \
+ typeof(_rc) r = (_rc); \
+ int m = SMC_BUF_MAX - 1; \
+ this_cpu_inc((*stats).smc[t].key ## _cnt); \
+ if (r <= 0) \
+ break; \
+ _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \
+ this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \
+ this_cpu_add((*stats).smc[t].key ## _bytes, r); \
+} \
+while (0)
+
+#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ struct net *_net = sock_net(&__smc->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(length) _len = (length); \
+ typeof(rcode) _rc = (rcode); \
+ bool is_smcd = !__smc->conn.lnk; \
+ if (is_smcd) \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \
+ else \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \
+} \
+while (0)
+
+#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ struct net *_net = sock_net(&__smc->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(length) _len = (length); \
+ typeof(rcode) _rc = (rcode); \
+ bool is_smcd = !__smc->conn.lnk; \
+ if (is_smcd) \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \
+ else \
+ SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \
+} \
+while (0)
+
+#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \
+do { \
+ typeof(_len) _l = (_len); \
+ typeof(_tech) t = (_tech); \
+ int _pos = fls((_l) >> 13); \
+ int m = SMC_BUF_MAX - 1; \
+ _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \
+ this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \
+} \
+while (0)
+
+#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \
+ this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt)
+
+#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \
+do { \
+ struct net *_net = sock_net(&(_smc)->sk); \
+ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+ typeof(_is_smcd) is_d = (_is_smcd); \
+ typeof(_is_rx) is_r = (_is_rx); \
+ typeof(_len) l = (_len); \
+ if ((is_d) && (is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \
+ if ((is_d) && !(is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \
+ if (!(is_d) && (is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \
+ if (!(is_d) && !(is_r)) \
+ SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \
+} \
+while (0)
+
+#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \
+do { \
+ struct net *net = sock_net(&(_smc)->sk); \
+ struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \
+ typeof(_is_smcd) is_d = (_is_smcd); \
+ typeof(_is_rx) is_r = (_is_rx); \
+ if ((is_d) && (is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \
+ if ((is_d) && !(is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \
+ if (!(is_d) && (is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \
+ if (!(is_d) && !(is_r)) \
+ SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \
+} \
+while (0)
+
+#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, reuse, is_smcd, is_rx)
+
+#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, alloc, is_smcd, is_rx)
+
+#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \
+ SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx)
+
+#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false)
+
+#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full, is_smcd, false)
+
+#define SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false)
+
+#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small, is_smcd, false)
+
+#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_size_small, is_smcd, true)
+
+#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \
+ SMC_STAT_RMB(smc, buf_full, is_smcd, true)
+
+#define SMC_STAT_INC(_smc, type) \
+do { \
+ typeof(_smc) __smc = _smc; \
+ bool is_smcd = !(__smc)->conn.lnk; \
+ struct net *net = sock_net(&(__smc)->sk); \
+ struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \
+ if ((is_smcd)) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \
+ else \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \
+} \
+while (0)
+
+#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \
+do { \
+ typeof(_aclc) acl = (_aclc); \
+ bool is_v2 = (acl->hdr.version == SMC_V2); \
+ bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \
+ struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \
+ if (is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \
+ else if (is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \
+ else if (!is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \
+ else if (!is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \
+} \
+while (0)
+
+#define SMC_STAT_SERV_SUCC_INC(net, _ini) \
+do { \
+ typeof(_ini) i = (_ini); \
+ bool is_v2 = (i->smcd_version & SMC_V2); \
+ bool is_smcd = (i->is_smcd); \
+ typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \
+ if (is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \
+ else if (is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \
+ else if (!is_v2 && is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \
+ else if (!is_v2 && !is_smcd) \
+ this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \
+} \
+while (0)
+
+int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb);
+int smc_stats_init(struct net *net);
+void smc_stats_exit(struct net *net);
+
+#endif /* NET_SMC_SMC_STATS_H_ */
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
new file mode 100644
index 000000000000..b6f79fabb9d3
--- /dev/null
+++ b/net/smc/smc_sysctl.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_llc.h"
+#include "smc_sysctl.h"
+
+static int min_sndbuf = SMC_BUF_MIN_SIZE;
+static int min_rcvbuf = SMC_BUF_MIN_SIZE;
+
+static struct ctl_table smc_table[] = {
+ {
+ .procname = "autocorking_size",
+ .data = &init_net.smc.sysctl_autocorking_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "smcr_buf_type",
+ .data = &init_net.smc.sysctl_smcr_buf_type,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "smcr_testlink_time",
+ .data = &init_net.smc.sysctl_smcr_testlink_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "wmem",
+ .data = &init_net.smc.sysctl_wmem,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem",
+ .data = &init_net.smc.sysctl_rmem,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ { }
+};
+
+int __net_init smc_sysctl_net_init(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = smc_table;
+ if (!net_eq(net, &init_net)) {
+ int i;
+
+ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
+ table[i].data += (void *)net - (void *)&init_net;
+ }
+
+ net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
+ if (!net->smc.smc_hdr)
+ goto err_reg;
+
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+ net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
+ net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
+ WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]));
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+void __net_exit smc_sysctl_net_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->smc.smc_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->smc.smc_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h
new file mode 100644
index 000000000000..0becc11bd2f4
--- /dev/null
+++ b/net/smc/smc_sysctl.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * smc_sysctl.c: sysctl interface to SMC subsystem.
+ *
+ * Copyright (c) 2022, Alibaba Inc.
+ *
+ * Author: Tony Lu <tonylu@linux.alibaba.com>
+ *
+ */
+
+#ifndef _SMC_SYSCTL_H
+#define _SMC_SYSCTL_H
+
+#ifdef CONFIG_SYSCTL
+
+int __net_init smc_sysctl_net_init(struct net *net);
+void __net_exit smc_sysctl_net_exit(struct net *net);
+
+#else
+
+static inline int smc_sysctl_net_init(struct net *net)
+{
+ net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
+ return 0;
+}
+
+static inline void smc_sysctl_net_exit(struct net *net) { }
+
+#endif /* CONFIG_SYSCTL */
+
+#endif /* _SMC_SYSCTL_H */
diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c
new file mode 100644
index 000000000000..8d47ced5a492
--- /dev/null
+++ b/net/smc/smc_tracepoint.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define CREATE_TRACE_POINTS
+#include "smc_tracepoint.h"
+
+EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback);
+EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg);
+EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg);
+EXPORT_TRACEPOINT_SYMBOL(smcr_link_down);
diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h
new file mode 100644
index 000000000000..9fc5e586d24a
--- /dev/null
+++ b/net/smc/smc_tracepoint.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM smc
+
+#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SMC_H
+
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/tracepoint.h>
+#include <net/ipv6.h>
+#include "smc.h"
+#include "smc_core.h"
+
+TRACE_EVENT(smc_switch_to_fallback,
+
+ TP_PROTO(const struct smc_sock *smc, int fallback_rsn),
+
+ TP_ARGS(smc, fallback_rsn),
+
+ TP_STRUCT__entry(
+ __field(const void *, sk)
+ __field(const void *, clcsk)
+ __field(u64, net_cookie)
+ __field(int, fallback_rsn)
+ ),
+
+ TP_fast_assign(
+ const struct sock *sk = &smc->sk;
+ const struct sock *clcsk = smc->clcsock->sk;
+
+ __entry->sk = sk;
+ __entry->clcsk = clcsk;
+ __entry->net_cookie = sock_net(sk)->net_cookie;
+ __entry->fallback_rsn = fallback_rsn;
+ ),
+
+ TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d",
+ __entry->sk, __entry->clcsk,
+ __entry->net_cookie, __entry->fallback_rsn)
+);
+
+DECLARE_EVENT_CLASS(smc_msg_event,
+
+ TP_PROTO(const struct smc_sock *smc, size_t len),
+
+ TP_ARGS(smc, len),
+
+ TP_STRUCT__entry(
+ __field(const void *, smc)
+ __field(u64, net_cookie)
+ __field(size_t, len)
+ __string(name, smc->conn.lnk->ibname)
+ ),
+
+ TP_fast_assign(
+ const struct sock *sk = &smc->sk;
+
+ __entry->smc = smc;
+ __entry->net_cookie = sock_net(sk)->net_cookie;
+ __entry->len = len;
+ __assign_str(name, smc->conn.lnk->ibname);
+ ),
+
+ TP_printk("smc=%p net=%llu len=%zu dev=%s",
+ __entry->smc, __entry->net_cookie,
+ __entry->len, __get_str(name))
+);
+
+DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg,
+
+ TP_PROTO(const struct smc_sock *smc, size_t len),
+
+ TP_ARGS(smc, len)
+);
+
+DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg,
+
+ TP_PROTO(const struct smc_sock *smc, size_t len),
+
+ TP_ARGS(smc, len)
+);
+
+TRACE_EVENT(smcr_link_down,
+
+ TP_PROTO(const struct smc_link *lnk, void *location),
+
+ TP_ARGS(lnk, location),
+
+ TP_STRUCT__entry(
+ __field(const void *, lnk)
+ __field(const void *, lgr)
+ __field(u64, net_cookie)
+ __field(int, state)
+ __string(name, lnk->ibname)
+ __field(void *, location)
+ ),
+
+ TP_fast_assign(
+ const struct smc_link_group *lgr = lnk->lgr;
+
+ __entry->lnk = lnk;
+ __entry->lgr = lgr;
+ __entry->net_cookie = lgr->net->net_cookie;
+ __entry->state = lnk->state;
+ __assign_str(name, lnk->ibname);
+ __entry->location = location;
+ ),
+
+ TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS",
+ __entry->lnk, __entry->lgr, __entry->net_cookie,
+ __entry->state, __get_str(name),
+ __entry->location)
+);
+
+#endif /* _TRACE_SMC_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE smc_tracepoint
+
+#include <trace/define_trace.h>
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 0d42e7716b91..64dedffe9d26 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -27,9 +27,10 @@
#include "smc_close.h"
#include "smc_ism.h"
#include "smc_tx.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
#define SMC_TX_WORK_DELAY 0
-#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */
/***************************** sndbuf producer *******************************/
@@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk)
/* similar to sk_stream_write_space */
if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ if (test_bit(SOCK_NOSPACE, &sock->flags))
+ SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk);
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
@@ -128,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
}
+/* If we have pending CDC messages, do not send:
+ * Because CQE of this CDC message will happen shortly, it gives
+ * a chance to coalesce future sendmsg() payload in to one RDMA Write,
+ * without need for a timer, and with no latency trade off.
+ * Algorithm here:
+ * 1. First message should never cork
+ * 2. If we have pending Tx CDC messages, wait for the first CDC
+ * message's completion
+ * 3. Don't cork to much data in a single RDMA Write to prevent burst
+ * traffic, total corked message should not exceed sendbuf/2
+ */
+static bool smc_should_autocork(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ int corking_size;
+
+ corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
+ sock_net(&smc->sk)->smc.sysctl_autocorking_size);
+
+ if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
+ smc_tx_prepared_sends(conn) > corking_size)
+ return false;
+ return true;
+}
+
+static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc_should_autocork(smc))
+ return true;
+
+ /* for a corked socket defer the RDMA writes if
+ * sndbuf_space is still available. The applications
+ * should known how/when to uncork it.
+ */
+ if ((msg->msg_flags & MSG_MORE ||
+ smc_tx_is_corked(smc) ||
+ msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+ atomic_read(&conn->sndbuf_space))
+ return true;
+
+ return false;
+}
+
/* sndbuf producer: main API called by socket layer.
* called under sock lock.
*/
@@ -151,9 +199,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
goto out_err;
}
+ if (sk->sk_state == SMC_INIT)
+ return -ENOTCONN;
+
+ if (len > conn->sndbuf_desc->len)
+ SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk);
+
+ if (len > conn->peer_rmbe_size)
+ SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk);
+
+ if (msg->msg_flags & MSG_OOB)
+ SMC_STAT_INC(smc, urg_data_cnt);
+
while (msg_data_left(msg)) {
- if (sk->sk_state == SMC_INIT)
- return -ENOTCONN;
if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
(smc->sk.sk_err == ECONNABORTED) ||
conn->killed)
@@ -188,7 +246,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
tx_cnt_prep);
chunk_len_sum = chunk_len;
chunk_off = tx_cnt_prep;
- smc_sndbuf_sync_sg_for_cpu(conn);
for (chunk = 0; chunk < 2; chunk++) {
rc = memcpy_from_msg(sndbuf_base + chunk_off,
msg, chunk_len);
@@ -222,16 +279,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
*/
if ((msg->msg_flags & MSG_OOB) && !send_remaining)
conn->urg_tx_pend = true;
- if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
- (atomic_read(&conn->sndbuf_space) >
- (conn->sndbuf_desc->len >> 1)))
- /* for a corked socket defer the RDMA writes if there
- * is still sufficient sndbuf_space available
- */
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_CORK_DELAY);
- else
+ /* If we need to cork, do nothing and wait for the next
+ * sendmsg() call or push on tx completion
+ */
+ if (!smc_tx_should_cork(smc, msg))
smc_tx_sndbuf_nonempty(conn);
+
+ trace_smc_tx_sendmsg(smc, copylen);
} /* while (msg_data_left(msg)) */
return send_done;
@@ -244,21 +298,33 @@ out_err:
return rc;
}
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct msghdr msg = {.msg_flags = flags};
+ char *kaddr = kmap(page);
+ struct kvec iov;
+ int rc;
+
+ iov.iov_base = kaddr + offset;
+ iov.iov_len = size;
+ iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
+ rc = smc_tx_sendmsg(smc, &msg, size);
+ kunmap(page);
+ return rc;
+}
+
/***************************** sndbuf consumer *******************************/
/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
u32 offset, int signal)
{
- struct smc_ism_position pos;
int rc;
- memset(&pos, 0, sizeof(pos));
- pos.token = conn->peer_token;
- pos.index = conn->peer_rmbe_idx;
- pos.offset = conn->tx_off + offset;
- pos.signal = signal;
- rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
+ rc = smc_ism_write(conn->lgr->smcd, conn->peer_token,
+ conn->peer_rmbe_idx, signal, conn->tx_off + offset,
+ data, len);
if (rc)
conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
return rc;
@@ -269,22 +335,21 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
int num_sges, struct ib_rdma_wr *rdma_wr)
{
struct smc_link_group *lgr = conn->lgr;
- struct smc_link *link;
+ struct smc_link *link = conn->lnk;
int rc;
- link = &lgr->lnk[SMC_SINGLE_LINK];
rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
rdma_wr->wr.num_sge = num_sges;
rdma_wr->remote_addr =
- lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+ lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
/* RMBE within RMB */
conn->tx_off +
/* offset within RMBE */
peer_rmbe_offset;
- rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+ rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey;
rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
if (rc)
- smc_lgr_terminate(lgr, true);
+ smcr_link_down_cond_sched(link);
return rc;
}
@@ -310,8 +375,11 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
size_t dst_off, size_t dst_len,
struct smc_rdma_wr *wr_rdma_buf)
{
+ struct smc_link *link = conn->lnk;
+
dma_addr_t dma_addr =
- sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
+ sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
+ u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
int src_len_sum = src_len, dst_len_sum = dst_len;
int sent_count = src_off;
int srcchunk, dstchunk;
@@ -319,13 +387,25 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
int rc;
for (dstchunk = 0; dstchunk < 2; dstchunk++) {
- struct ib_sge *sge =
- wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
+ struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
+ struct ib_sge *sge = wr->wr.sg_list;
+ u64 base_addr = dma_addr;
+
+ if (dst_len < link->qp_attr.cap.max_inline_data) {
+ base_addr = virt_addr;
+ wr->wr.send_flags |= IB_SEND_INLINE;
+ } else {
+ wr->wr.send_flags &= ~IB_SEND_INLINE;
+ }
num_sges = 0;
for (srcchunk = 0; srcchunk < 2; srcchunk++) {
- sge[srcchunk].addr = dma_addr + src_off;
+ sge[srcchunk].addr = conn->sndbuf_desc->is_vm ?
+ (virt_addr + src_off) : (base_addr + src_off);
sge[srcchunk].length = src_len;
+ if (conn->sndbuf_desc->is_vm)
+ sge[srcchunk].lkey =
+ conn->sndbuf_desc->mr[link->link_idx]->lkey;
num_sges++;
src_off += src_len;
@@ -338,8 +418,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
src_len = dst_len - src_len; /* remainder */
src_len_sum += src_len;
}
- rc = smc_tx_rdma_write(conn, dst_off, num_sges,
- &wr_rdma_buf->wr_tx_rdma[dstchunk]);
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
if (rc)
return rc;
if (dst_len_sum == len)
@@ -418,8 +497,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn,
/* destination: RMBE */
/* cf. snd_wnd */
rmbespace = atomic_read(&conn->peer_rmbe_space);
- if (rmbespace <= 0)
+ if (rmbespace <= 0) {
+ struct smc_sock *smc = container_of(conn, struct smc_sock,
+ conn);
+ SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
return 0;
+ }
smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
@@ -481,13 +564,17 @@ static int smc_tx_rdma_writes(struct smc_connection *conn,
static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
{
struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+ struct smc_link *link = conn->lnk;
struct smc_rdma_wr *wr_rdma_buf;
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
int rc;
- rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend);
+ if (!link || !smc_wr_tx_link_hold(link))
+ return -ENOLINK;
+ rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
if (rc < 0) {
+ smc_wr_tx_link_put(link);
if (rc == -EBUSY) {
struct smc_sock *smc =
container_of(conn, struct smc_sock, conn);
@@ -497,17 +584,24 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
if (conn->killed)
return -EPIPE;
rc = 0;
- mod_delayed_work(system_wq, &conn->tx_work,
+ mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
SMC_TX_WORK_DELAY);
}
return rc;
}
spin_lock_bh(&conn->send_lock);
+ if (link != conn->lnk) {
+ /* link of connection changed, tx_work will restart */
+ smc_wr_tx_put_slot(link,
+ (struct smc_wr_tx_pend_priv *)pend);
+ rc = -ENOLINK;
+ goto out_unlock;
+ }
if (!pflags->urg_data_present) {
rc = smc_tx_rdma_writes(conn, wr_rdma_buf);
if (rc) {
- smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ smc_wr_tx_put_slot(link,
(struct smc_wr_tx_pend_priv *)pend);
goto out_unlock;
}
@@ -521,6 +615,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
out_unlock:
spin_unlock_bh(&conn->send_lock);
+ smc_wr_tx_link_put(link);
return rc;
}
@@ -543,13 +638,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
return rc;
}
-int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
- int rc;
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ int rc = 0;
+
+ /* No data in the send queue */
+ if (unlikely(smc_tx_prepared_sends(conn) <= 0))
+ goto out;
+
+ /* Peer don't have RMBE space */
+ if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
+ SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+ goto out;
+ }
if (conn->killed ||
- conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
- return -EPIPE; /* connection being aborted */
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ rc = -EPIPE; /* connection being aborted */
+ goto out;
+ }
if (conn->lgr->is_smcd)
rc = smcd_tx_sndbuf_nonempty(conn);
else
@@ -557,34 +665,72 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
if (!rc) {
/* trigger socket release if connection is closing */
- struct smc_sock *smc = container_of(conn, struct smc_sock,
- conn);
smc_close_wake_tx_prepared(smc);
}
+
+out:
+ return rc;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ /* This make sure only one can send simultaneously to prevent wasting
+ * of CPU and CDC slot.
+ * Record whether someone has tried to push while we are pushing.
+ */
+ if (atomic_inc_return(&conn->tx_pushing) > 1)
+ return 0;
+
+again:
+ atomic_set(&conn->tx_pushing, 1);
+ smp_wmb(); /* Make sure tx_pushing is 1 before real send */
+ rc = __smc_tx_sndbuf_nonempty(conn);
+
+ /* We need to check whether someone else have added some data into
+ * the send queue and tried to push but failed after the atomic_set()
+ * when we are pushing.
+ * If so, we need to push again to prevent those data hang in the send
+ * queue.
+ */
+ if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
+ goto again;
+
return rc;
}
/* Wakeup sndbuf consumers from process context
- * since there is more data to transmit
+ * since there is more data to transmit. The caller
+ * must hold sock lock.
*/
-void smc_tx_work(struct work_struct *work)
+void smc_tx_pending(struct smc_connection *conn)
{
- struct smc_connection *conn = container_of(to_delayed_work(work),
- struct smc_connection,
- tx_work);
struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
int rc;
- lock_sock(&smc->sk);
if (smc->sk.sk_err)
- goto out;
+ return;
rc = smc_tx_sndbuf_nonempty(conn);
if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
!atomic_read(&conn->bytes_to_rcv))
conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
-out:
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit in locked
+ * sock.
+ */
+void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(to_delayed_work(work),
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_pending(conn);
release_sock(&smc->sk);
}
@@ -614,8 +760,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
return;
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
!conn->killed) {
- schedule_delayed_work(&conn->tx_work,
- SMC_TX_WORK_DELAY);
+ queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
return;
}
}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 07e6ad76224a..34b578498b1f 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
+void smc_tx_pending(struct smc_connection *conn);
void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
+ size_t size, int flags);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 337ee52ad3d3..b0678a417e09 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */
struct smc_link *link;
u32 idx;
struct smc_wr_tx_pend_priv priv;
+ u8 compl_requested;
};
/******************************** send queue *********************************/
@@ -53,21 +54,13 @@ struct smc_wr_tx_pend { /* control data for a pending send request */
/* returns true if at least one tx work request is pending on the given link */
static inline bool smc_wr_is_tx_pend(struct smc_link *link)
{
- if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
- link->wr_tx_cnt) {
- return true;
- }
- return false;
+ return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
}
/* wait till all pending tx work requests on the given link are completed */
-static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
- if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
- SMC_WR_TX_WAIT_PENDING_TIME))
- return 0;
- else /* timeout */
- return -EPIPE;
+ wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
@@ -86,7 +79,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
struct smc_wr_tx_pend pnd_snd;
struct smc_link *link;
u32 pnd_snd_idx;
- int i;
link = wc->qp->qp_context;
@@ -100,37 +92,50 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
}
pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
- if (pnd_snd_idx == link->wr_tx_cnt)
- return;
- link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
- memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
- /* clear the full struct smc_wr_tx_pend including .priv */
- memset(&link->wr_tx_pends[pnd_snd_idx], 0,
- sizeof(link->wr_tx_pends[pnd_snd_idx]));
- memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
- sizeof(link->wr_tx_bufs[pnd_snd_idx]));
- if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
- return;
+ if (pnd_snd_idx == link->wr_tx_cnt) {
+ if (link->lgr->smc_version != SMC_V2 ||
+ link->wr_tx_v2_pend->wr_id != wc->wr_id)
+ return;
+ link->wr_tx_v2_pend->wc_status = wc->status;
+ memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(link->wr_tx_v2_pend, 0,
+ sizeof(*link->wr_tx_v2_pend));
+ memset(link->lgr->wr_tx_buf_v2, 0,
+ sizeof(*link->lgr->wr_tx_buf_v2));
+ } else {
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+ complete(&link->wr_tx_compl[pnd_snd_idx]);
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
+ sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_pends[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ }
+
if (wc->status) {
- for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
- /* clear full struct smc_wr_tx_pend including .priv */
- memset(&link->wr_tx_pends[i], 0,
- sizeof(link->wr_tx_pends[i]));
- memset(&link->wr_tx_bufs[i], 0,
- sizeof(link->wr_tx_bufs[i]));
- clear_bit(i, link->wr_tx_mask);
+ if (link->lgr->smc_version == SMC_V2) {
+ memset(link->wr_tx_v2_pend, 0,
+ sizeof(*link->wr_tx_v2_pend));
+ memset(link->lgr->wr_tx_buf_v2, 0,
+ sizeof(*link->lgr->wr_tx_buf_v2));
}
- /* terminate connections of this link group abnormally */
- smc_lgr_terminate_sched(smc_get_lgr(link));
+ /* terminate link */
+ smcr_link_down_cond_sched(link);
}
if (pnd_snd.handler)
pnd_snd.handler(&pnd_snd.priv, link, wc->status);
wake_up(&link->wr_tx_wait);
}
-static void smc_wr_tx_tasklet_fn(unsigned long data)
+static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
- struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
int i = 0, rc;
int polled = 0;
@@ -166,6 +171,8 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
*idx = link->wr_tx_cnt;
+ if (!smc_link_sendable(link))
+ return -ENOLINK;
for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
if (!test_and_set_bit(*idx, link->wr_tx_mask))
return 0;
@@ -207,13 +214,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
} else {
rc = wait_event_interruptible_timeout(
link->wr_tx_wait,
- link->state == SMC_LNK_INACTIVE ||
+ !smc_link_sendable(link) ||
lgr->terminating ||
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
if (!rc) {
- /* timeout - terminate connections */
- smc_lgr_terminate_sched(lgr);
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
return -EPIPE;
}
if (idx == link->wr_tx_cnt)
@@ -234,6 +241,33 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
return 0;
}
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_v2_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_wr_tx_pend *wr_pend;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+
+ if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
+ return -EBUSY;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = link->wr_tx_v2_pend;
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = link->wr_tx_cnt;
+ wr_ib = link->wr_tx_v2_ib;
+ wr_ib->wr_id = wr_id;
+ *wr_buf = link->lgr->wr_tx_buf_v2;
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
int smc_wr_tx_put_slot(struct smc_link *link,
struct smc_wr_tx_pend_priv *wr_pend_priv)
{
@@ -251,6 +285,14 @@ int smc_wr_tx_put_slot(struct smc_link *link,
test_and_clear_bit(idx, link->wr_tx_mask);
wake_up(&link->wr_tx_wait);
return 1;
+ } else if (link->lgr->smc_version == SMC_V2 &&
+ pend->idx == link->wr_tx_cnt) {
+ /* Large v2 buffer */
+ memset(&link->wr_tx_v2_pend, 0,
+ sizeof(link->wr_tx_v2_pend));
+ memset(&link->lgr->wr_tx_buf_v2, 0,
+ sizeof(link->lgr->wr_tx_buf_v2));
+ return 1;
}
return 0;
@@ -270,11 +312,56 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
if (rc) {
smc_wr_tx_put_slot(link, priv);
- smc_lgr_terminate_sched(smc_get_lgr(link));
+ smcr_link_down_cond_sched(link);
+ }
+ return rc;
+}
+
+int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ int len)
+{
+ int rc;
+
+ link->wr_tx_v2_ib->sg_list[0].length = len;
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
+ if (rc) {
+ smc_wr_tx_put_slot(link, priv);
+ smcr_link_down_cond_sched(link);
}
return rc;
}
+/* Send prepared WR slot via ib_post_send and wait for send completion
+ * notification.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ unsigned long timeout)
+{
+ struct smc_wr_tx_pend *pend;
+ u32 pnd_idx;
+ int rc;
+
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ pend->compl_requested = 1;
+ pnd_idx = pend->idx;
+ init_completion(&link->wr_tx_compl[pnd_idx]);
+
+ rc = smc_wr_tx_send(link, priv);
+ if (rc)
+ return rc;
+ /* wait for completion by smc_wr_tx_process_cqe() */
+ rc = wait_for_completion_interruptible_timeout(
+ &link->wr_tx_compl[pnd_idx], timeout);
+ if (rc <= 0)
+ rc = -ENODATA;
+ if (rc > 0)
+ rc = 0;
+ return rc;
+}
+
/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
@@ -290,12 +377,15 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
if (rc)
return rc;
+ atomic_inc(&link->wr_reg_refcnt);
rc = wait_event_interruptible_timeout(link->wr_reg_wait,
(link->wr_reg_state != POSTED),
SMC_WR_REG_MR_WAIT_TIME);
+ if (atomic_dec_and_test(&link->wr_reg_refcnt))
+ wake_up_all(&link->wr_reg_wait);
if (!rc) {
- /* timeout - terminate connections */
- smc_lgr_terminate_sched(smc_get_lgr(link));
+ /* timeout - terminate link */
+ smcr_link_down_cond_sched(link);
return -EPIPE;
}
if (rc == -ERESTARTSYS)
@@ -314,25 +404,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
return rc;
}
-void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
- smc_wr_tx_filter filter,
- smc_wr_tx_dismisser dismisser,
- unsigned long data)
-{
- struct smc_wr_tx_pend_priv *tx_pend;
- struct smc_wr_rx_hdr *wr_tx;
- int i;
-
- for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
- wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
- if (wr_tx->type != wr_tx_hdr_type)
- continue;
- tx_pend = &link->wr_tx_pends[i].priv;
- if (filter(tx_pend, data))
- dismisser(tx_pend);
- }
-}
-
/****************************** receive queue ********************************/
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -383,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
for (i = 0; i < num; i++) {
link = wc[i].qp->qp_context;
+ link->wr_rx_id_compl = wc[i].wr_id;
if (wc[i].status == IB_WC_SUCCESS) {
link->wr_rx_tstamp = jiffies;
smc_wr_rx_demultiplex(&wc[i]);
@@ -393,10 +465,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
case IB_WC_RETRY_EXC_ERR:
case IB_WC_RNR_RETRY_EXC_ERR:
case IB_WC_WR_FLUSH_ERR:
- /* terminate connections of this link group
- * abnormally
- */
- smc_lgr_terminate_sched(smc_get_lgr(link));
+ smcr_link_down_cond_sched(link);
+ if (link->wr_rx_id_compl == link->wr_rx_id)
+ wake_up(&link->wr_rx_empty_wait);
break;
default:
smc_wr_rx_post(link); /* refill WR RX */
@@ -406,9 +477,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
}
}
-static void smc_wr_rx_tasklet_fn(unsigned long data)
+static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
- struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
int polled = 0;
int rc;
@@ -485,10 +556,12 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
static void smc_wr_init_sge(struct smc_link *lnk)
{
+ int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+ bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
u32 i;
for (i = 0; i < lnk->wr_tx_cnt; i++) {
- lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
@@ -506,6 +579,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
lnk->wr_tx_ibs[i].send_flags =
IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ if (send_inline)
+ lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
@@ -513,14 +588,44 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
}
+
+ if (lnk->lgr->smc_version == SMC_V2) {
+ lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
+ lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
+ lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
+
+ lnk->wr_tx_v2_ib->next = NULL;
+ lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
+ lnk->wr_tx_v2_ib->num_sge = 1;
+ lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
+ lnk->wr_tx_v2_ib->send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
+
+ /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
+ * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
+ * and the same buffer for all sges. When a larger message arrived then
+ * the content of the first small sge is copied to the beginning of
+ * the larger spillover buffer, allowing easy data mapping.
+ */
for (i = 0; i < lnk->wr_rx_cnt; i++) {
- lnk->wr_rx_sges[i].addr =
+ int x = i * sges_per_buf;
+
+ lnk->wr_rx_sges[x].addr =
lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
- lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
- lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
+ if (lnk->lgr->smc_version == SMC_V2) {
+ lnk->wr_rx_sges[x + 1].addr =
+ lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x + 1].length =
+ SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
+ lnk->wr_rx_sges[x + 1].lkey =
+ lnk->roce_pd->local_dma_lkey;
+ }
lnk->wr_rx_ibs[i].next = NULL;
- lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
- lnk->wr_rx_ibs[i].num_sge = 1;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
+ lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
}
lnk->wr_reg.wr.next = NULL;
lnk->wr_reg.wr.num_sge = 0;
@@ -533,34 +638,68 @@ void smc_wr_free_link(struct smc_link *lnk)
{
struct ib_device *ibdev;
- if (smc_wr_tx_wait_no_pending_sends(lnk))
- memset(lnk->wr_tx_mask, 0,
- BITS_TO_LONGS(SMC_WR_BUF_CNT) *
- sizeof(*lnk->wr_tx_mask));
-
if (!lnk->smcibdev)
return;
ibdev = lnk->smcibdev->ibdev;
+ smc_wr_drain_cq(lnk);
+ smc_wr_wakeup_reg_wait(lnk);
+ smc_wr_wakeup_tx_wait(lnk);
+
+ smc_wr_tx_wait_no_pending_sends(lnk);
+ wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
+ wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
+
if (lnk->wr_rx_dma_addr) {
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
lnk->wr_rx_dma_addr = 0;
}
+ if (lnk->wr_rx_v2_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+ SMC_WR_BUF_V2_SIZE,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_v2_dma_addr = 0;
+ }
if (lnk->wr_tx_dma_addr) {
ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
DMA_TO_DEVICE);
lnk->wr_tx_dma_addr = 0;
}
+ if (lnk->wr_tx_v2_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+ SMC_WR_BUF_V2_SIZE,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_v2_dma_addr = 0;
+ }
+}
+
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
+{
+ if (lgr->smc_version < SMC_V2)
+ return;
+
+ kfree(lgr->wr_rx_buf_v2);
+ lgr->wr_rx_buf_v2 = NULL;
+ kfree(lgr->wr_tx_buf_v2);
+ lgr->wr_tx_buf_v2 = NULL;
}
void smc_wr_free_link_mem(struct smc_link *lnk)
{
+ kfree(lnk->wr_tx_v2_ib);
+ lnk->wr_tx_v2_ib = NULL;
+ kfree(lnk->wr_tx_v2_sge);
+ lnk->wr_tx_v2_sge = NULL;
+ kfree(lnk->wr_tx_v2_pend);
+ lnk->wr_tx_v2_pend = NULL;
+ kfree(lnk->wr_tx_compl);
+ lnk->wr_tx_compl = NULL;
kfree(lnk->wr_tx_pends);
lnk->wr_tx_pends = NULL;
- kfree(lnk->wr_tx_mask);
+ bitmap_free(lnk->wr_tx_mask);
lnk->wr_tx_mask = NULL;
kfree(lnk->wr_tx_sges);
lnk->wr_tx_sges = NULL;
@@ -580,8 +719,26 @@ void smc_wr_free_link_mem(struct smc_link *lnk)
lnk->wr_rx_bufs = NULL;
}
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
+{
+ if (lgr->smc_version < SMC_V2)
+ return 0;
+
+ lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+ if (!lgr->wr_rx_buf_v2)
+ return -ENOMEM;
+ lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+ if (!lgr->wr_tx_buf_v2) {
+ kfree(lgr->wr_rx_buf_v2);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
int smc_wr_alloc_link_mem(struct smc_link *link)
{
+ int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
+
/* allocate link related memory */
link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
if (!link->wr_tx_bufs)
@@ -614,13 +771,11 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
if (!link->wr_tx_sges)
goto no_mem_wr_tx_rdma_sges;
link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
- sizeof(link->wr_rx_sges[0]),
+ sizeof(link->wr_rx_sges[0]) * sges_per_buf,
GFP_KERNEL);
if (!link->wr_rx_sges)
goto no_mem_wr_tx_sges;
- link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT),
- sizeof(*link->wr_tx_mask),
- GFP_KERNEL);
+ link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
if (!link->wr_tx_mask)
goto no_mem_wr_rx_sges;
link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
@@ -628,8 +783,36 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
GFP_KERNEL);
if (!link->wr_tx_pends)
goto no_mem_wr_tx_mask;
+ link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_compl[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_compl)
+ goto no_mem_wr_tx_pends;
+
+ if (link->lgr->smc_version == SMC_V2) {
+ link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
+ GFP_KERNEL);
+ if (!link->wr_tx_v2_ib)
+ goto no_mem_tx_compl;
+ link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
+ GFP_KERNEL);
+ if (!link->wr_tx_v2_sge)
+ goto no_mem_v2_ib;
+ link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
+ GFP_KERNEL);
+ if (!link->wr_tx_v2_pend)
+ goto no_mem_v2_sge;
+ }
return 0;
+no_mem_v2_sge:
+ kfree(link->wr_tx_v2_sge);
+no_mem_v2_ib:
+ kfree(link->wr_tx_v2_ib);
+no_mem_tx_compl:
+ kfree(link->wr_tx_compl);
+no_mem_wr_tx_pends:
+ kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
@@ -660,10 +843,8 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
- tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
- (unsigned long)smcibdev);
- tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
- (unsigned long)smcibdev);
+ tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
+ tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
int smc_wr_create_link(struct smc_link *lnk)
@@ -681,6 +862,24 @@ int smc_wr_create_link(struct smc_link *lnk)
rc = -EIO;
goto out;
}
+ if (lnk->lgr->smc_version == SMC_V2) {
+ lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
+ lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
+ lnk->wr_rx_v2_dma_addr = 0;
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
+ lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
+ lnk->wr_tx_v2_dma_addr = 0;
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ }
lnk->wr_tx_dma_addr = ib_dma_map_single(
ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
DMA_TO_DEVICE);
@@ -689,13 +888,27 @@ int smc_wr_create_link(struct smc_link *lnk)
goto dma_unmap;
}
smc_wr_init_sge(lnk);
- memset(lnk->wr_tx_mask, 0,
- BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
init_waitqueue_head(&lnk->wr_tx_wait);
+ atomic_set(&lnk->wr_tx_refcnt, 0);
init_waitqueue_head(&lnk->wr_reg_wait);
+ atomic_set(&lnk->wr_reg_refcnt, 0);
+ init_waitqueue_head(&lnk->wr_rx_empty_wait);
return rc;
dma_unmap:
+ if (lnk->wr_rx_v2_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+ SMC_WR_BUF_V2_SIZE,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_v2_dma_addr = 0;
+ }
+ if (lnk->wr_tx_v2_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+ SMC_WR_BUF_V2_SIZE,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_v2_dma_addr = 0;
+ }
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 3ac99c898418..45e9b894d3f8 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -22,7 +22,6 @@
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
-#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
@@ -60,6 +59,25 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
atomic_long_set(wr_tx_id, val);
}
+static inline bool smc_wr_tx_link_hold(struct smc_link *link)
+{
+ if (!smc_link_sendable(link))
+ return false;
+ atomic_inc(&link->wr_tx_refcnt);
+ return true;
+}
+
+static inline void smc_wr_tx_link_put(struct smc_link *link)
+{
+ if (atomic_dec_and_test(&link->wr_tx_refcnt))
+ wake_up_all(&link->wr_tx_wait);
+}
+
+static inline void smc_wr_drain_cq(struct smc_link *lnk)
+{
+ wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
+}
+
static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
{
wake_up_all(&lnk->wr_tx_wait);
@@ -87,8 +105,10 @@ static inline int smc_wr_rx_post(struct smc_link *link)
int smc_wr_create_link(struct smc_link *lnk);
int smc_wr_alloc_link_mem(struct smc_link *lnk);
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
void smc_wr_free_link(struct smc_link *lnk);
void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
void smc_wr_remember_qp_attr(struct smc_link *lnk);
void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
void smc_wr_add_dev(struct smc_ib_device *smcibdev);
@@ -97,15 +117,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
struct smc_wr_buf **wr_buf,
struct smc_rdma_wr **wrs,
struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_v2_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
int smc_wr_tx_put_slot(struct smc_link *link,
struct smc_wr_tx_pend_priv *wr_pend_priv);
int smc_wr_tx_send(struct smc_link *link,
struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_v2_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *priv, int len);
+int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+ unsigned long timeout);
void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
-void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
- smc_wr_tx_filter filter,
- smc_wr_tx_dismisser dismisser,
- unsigned long data);
+void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
int smc_wr_rx_post_init(struct smc_link *link);