diff options
Diffstat (limited to '')
34 files changed, 10139 insertions, 2078 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig index f54a70b8da82..1ab3c5a2c5ad 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,7 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND - ---help--- + help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade AF_INET TCP connections transparently. @@ -14,7 +14,7 @@ config SMC config SMC_DIAG tristate "SMC: socket monitoring interface" depends on SMC - ---help--- + help Support for SMC socket monitoring interface used by tools such as smcss. diff --git a/net/smc/Makefile b/net/smc/Makefile index cb1254541f37..875efcd126a2 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o +smc-y += smc_tracepoint.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..e12d4fa5aece 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -26,6 +26,7 @@ #include <linux/sched/signal.h> #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> +#include <linux/ctype.h> #include <net/sock.h> #include <net/tcp.h> @@ -44,9 +45,13 @@ #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -55,9 +60,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ +static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp 
listen work */ +struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +struct workqueue_struct *smc_close_wq; /* wq for close work */ + static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + void *hdr; + + if (cb_ctx->pos[0]) + goto out; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_HS_LIMITATION); + if (!hdr) + return -ENOMEM; + + if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, + sock_net(skb->sk)->smc.limit_smc_hs)) + goto err; + + genlmsg_end(skb, hdr); + cb_ctx->pos[0] = 1; +out: + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = true; + return 0; +} + +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = false; + return 0; +} + static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); @@ -65,6 +113,61 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } +static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct smc_sock *smc; + struct sock *child; + + smc = smc_clcsock_user_data(sk); + + if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > + sk->sk_max_ack_backlog) + goto drop; + + if (sk_acceptq_is_full(&smc->sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + /* passthrough to original syn recv sock fct */ + child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, 
req_unhash, + own_req); + /* child must not inherit smc or its ops */ + if (child) { + rcu_assign_sk_user_data(child, NULL); + + /* v4-mapped sockets don't inherit parent ops. Don't restore. */ + if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) + inet_csk(child)->icsk_af_ops = smc->ori_af_ops; + } + return child; + +drop: + dst_release(dst); + tcp_listendrop(sk); + return NULL; +} + +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -82,8 +185,8 @@ int smc_hash_sk(struct sock *sk) write_lock_bh(&h->lock); sk_add_node(sk, head); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } @@ -100,12 +203,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. 
So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -118,16 +236,35 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { - smc->clcsock->file->private_data = smc->sk.sk_socket; - smc->clcsock->file = NULL; + if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ + smc->clcsock->file->private_data = smc->sk.sk_socket; + smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); + } } static int __smc_release(struct smc_sock *smc) @@ -140,14 +277,18 @@ static int __smc_release(struct smc_sock *smc) sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { - if (sk->sk_state != SMC_LISTEN && 
sk->sk_state != SMC_INIT) - sock_put(sk); /* passive closing */ - if (sk->sk_state == SMC_LISTEN) { - /* wake up clcsock accept */ - rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + if (sk->sk_state != SMC_CLOSED) { + if (sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ + if (sk->sk_state == SMC_LISTEN) { + /* wake up clcsock accept */ + rc = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + } + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); } - sk->sk_state = SMC_CLOSED; - sk->sk_state_change(sk); smc_restore_fallback_changes(smc); } @@ -170,7 +311,7 @@ static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = 0; + int old_state, rc = 0; if (!sk) goto out; @@ -178,10 +319,14 @@ static int smc_release(struct socket *sock) sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); + old_state = sk->sk_state; + /* cleanup for a dangling non-blocking connect */ - if (smc->connect_nonblock && sk->sk_state == SMC_INIT) + if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); - flush_work(&smc->connect_work); + + if (cancel_work_sync(&smc->connect_work)) + sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires @@ -191,6 +336,10 @@ static int smc_release(struct socket *sock) else lock_sock(sk); + if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && + !smc->use_fallback) + smc_close_active_abort(smc); + rc = __smc_release(smc); /* detach socket */ @@ -230,6 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); 
INIT_WORK(&smc->connect_work, smc_connect_work); @@ -240,6 +391,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return sk; } @@ -277,6 +429,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: @@ -337,50 +490,84 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +/* register the new vzalloced sndbuf on all links */ +static int smcr_lgr_reg_sndbufs(struct smc_link *link, + struct smc_buf_desc *snd_desc) { - if (!rmb_desc->wr_reg) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; - } - rmb_desc->wr_reg = 1; + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + if (!snd_desc->is_vm) + return -EINVAL; + + /* protect against parallel smcr_link_reg_buf() */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc); + if (rc) + break; } - if (!conf_rkey) - return 0; + mutex_unlock(&lgr->llc_conf_mutex); + return rc; +} + +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel 
smcr_link_reg_buf() + */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); + if (rc) + goto out; + } + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; } - return 0; + rmb_desc->is_conf_rkey = true; +out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -389,44 +576,92 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the sndbuf if it was vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); + return 0; +} - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; +static bool smc_isascii(char *hostname) +{ + int i; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) + if (!isascii(hostname[i])) + return false; + return true; +} - return 0; +static void smc_conn_save_peer_info_fce(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)clc; + struct smc_clc_first_contact_ext *fce; + int clc_v2_len; + + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return; + + if (smc->conn.lgr->is_smcd) { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + d1); + } else { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + r1); + } + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + smc->conn.lgr->peer_os = fce->os_type; + smc->conn.lgr->peer_smc_release = fce->release; + if (smc_isascii(fce->hostname)) + memcpy(smc->conn.lgr->peer_hostname, fce->hostname, + SMC_MAX_HOSTNAME_LEN); } static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); - smc->conn.peer_rmbe_idx = clc->rmbe_idx; - smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); + 
smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); @@ -435,10 +670,10 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc, static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - int bufsize = smc_uncompress_bufsize(clc->dmbe_size); + int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); - smc->conn.peer_rmbe_idx = clc->dmbe_idx; - smc->conn.peer_token = clc->token; + smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; + smc->conn.peer_token = clc->d0.token; /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); @@ -452,34 +687,222 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); + smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, - struct smc_clc_msg_accept_confirm *clc) + struct smc_clc_msg_accept_confirm *clc, + struct smc_init_info *ini) +{ + link->peer_qpn = ntoh24(clc->r0.qpn); + memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->r0.psn); + link->peer_mtu = clc->r0.qp_mtu; +} + +static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, + struct smc_stats_fback *fback_arr) +{ + int cnt; + + for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { + if (fback_arr[cnt].fback_code == smc->fallback_rsn) { + fback_arr[cnt].count++; + break; + } + if (!fback_arr[cnt].fback_code) { + fback_arr[cnt].fback_code = smc->fallback_rsn; + fback_arr[cnt].count++; + break; + } + } +} + +static void smc_stat_fallback(struct smc_sock *smc) +{ + struct net 
*net = sock_net(&smc->sk); + + mutex_lock(&net->smc.mutex_fback_rsn); + if (smc->listen_smc) { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); + net->smc.fback_rsn->srv_fback_cnt++; + } else { + smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); + net->smc.fback_rsn->clnt_fback_cnt++; + } + mutex_unlock(&net->smc.mutex_fback_rsn); +} + +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) { - link->peer_qpn = ntoh24(clc->qpn); - memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE); - memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac)); - link->peer_psn = ntoh24(clc->psn); - link->peer_mtu = clc->qp_mtu; + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } } -static void smc_switch_to_fallback(struct smc_sock *smc) +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), 
&mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_replace_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, + &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + +static int 
smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +{ + int rc = 0; + + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + rc = -EBADF; + goto out; + } + smc->use_fallback = true; + smc->fallback_rsn = reason_code; + smc_stat_fallback(smc); + trace_smc_switch_to_fallback(smc, reason_code); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. + */ + smc_fback_replace_callbacks(smc); } +out: + mutex_unlock(&smc->clcsock_release_lock); + return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = reason_code; + struct net *net = sock_net(&smc->sk); + int rc = 0; + + rc = smc_switch_to_fallback(smc, reason_code); + if (rc) { /* fallback fails */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -488,18 +911,22 @@ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) } /* decline and fall back during connect */ -static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, + u8 version) { + struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; } if (reason_code != SMC_CLC_DECL_PEERDECL) { - rc = smc_clc_send_decline(smc, 
reason_code); + rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; @@ -508,24 +935,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) return smc_connect_fallback(smc, reason_code); } -/* abort connecting */ -static int smc_connect_abort(struct smc_sock *smc, int reason_code, - int local_contact) +static void smc_conn_abort(struct smc_sock *smc, int local_first) { - bool is_smcd = smc->conn.lgr->is_smcd; + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + bool lgr_valid = false; - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_cleanup_early(&smc->conn); - else - smc_conn_free(&smc->conn); - if (is_smcd) - /* there is only one lgr role for SMC-D; use server lock */ - mutex_unlock(&smc_server_lgr_pending); - else - mutex_unlock(&smc_client_lgr_pending); + if (smc_conn_lgr_valid(conn)) + lgr_valid = true; - smc->connect_nonblock = 0; - return reason_code; + smc_conn_free(conn); + if (local_first && lgr_valid) + smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. 
*/ @@ -537,7 +958,9 @@ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); - if (!ini->ib_dev) + if (!ini->check_smcrv2 && !ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } @@ -548,47 +971,210 @@ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ smc_pnet_find_ism_resource(smc->clcsock->sk, ini); - if (!ini->ism_dev) + if (!ini->ism_dev[0]) return SMC_CLC_DECL_NOSMCDDEV; + else + ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); return 0; } +/* is chid unique for the ism devices that are already determined? */ +static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, + int cnt) +{ + int i = (!ini->ism_dev[0]) ? 1 : 0; + + for (; i < cnt; i++) + if (ini->ism_chid[i] == chid) + return false; + return true; +} + +/* determine possible V2 ISM devices (either without PNETID or with PNETID plus + * PNETID matching net_device) + */ +static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = SMC_CLC_DECL_NOSMCDDEV; + struct smcd_dev *smcd; + int i = 1; + u16 chid; + + if (smcd_indicated(ini->smc_type_v1)) + rc = 0; /* already initialized for V1 */ + mutex_lock(&smcd_dev_list.mutex); + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away || smcd == ini->ism_dev[0]) + continue; + chid = smc_ism_get_chid(smcd); + if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) + continue; + if (!smc_pnet_is_pnetid_set(smcd->pnetid) || + smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { + ini->ism_dev[i] = smcd; + ini->ism_chid[i] = chid; + ini->is_smcd = true; + rc = 0; + i++; + if (i > SMC_MAX_ISM_DEVS) + break; + } + } + mutex_unlock(&smcd_dev_list.mutex); + ini->ism_offered_cnt = i 
- 1; + if (!ini->ism_dev[0] && !ini->ism_dev[1]) + ini->smcd_version = 0; + + return rc; +} + /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, struct smc_init_info *ini) { - if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) + if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; return 0; } +static int smc_find_proposal_devices(struct smc_sock *smc, + struct smc_init_info *ini) +{ + int rc = 0; + + /* check if there is an ism device available */ + if (!(ini->smcd_version & SMC_V1) || + smc_find_ism_device(smc, ini) || + smc_connect_ism_vlan_setup(smc, ini)) + ini->smcd_version &= ~SMC_V1; + /* else ISM V1 is supported for this connection */ + + /* check if there is an rdma device available */ + if (!(ini->smcr_version & SMC_V1) || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V1; + /* else RDMA is supported for this connection */ + + ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, + ini->smcr_version & SMC_V1); + + /* check if there is an ism v2 device available */ + if (!(ini->smcd_version & SMC_V2) || + !smc_ism_is_v2_capable() || + smc_find_ism_v2_device_clnt(smc, ini)) + ini->smcd_version &= ~SMC_V2; + + /* check if there is an rdma v2 device available */ + ini->check_smcrv2 = true; + ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; + if (!(ini->smcr_version & SMC_V2) || + smc->clcsock->sk->sk_family != AF_INET || + !smc_clc_ueid_count() || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; + + ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, + ini->smcr_version & SMC_V2); + + /* if neither ISM nor RDMA are supported, fallback */ + if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) + rc = SMC_CLC_DECL_NOSMCDEV; + + return rc; +} + /* cleanup temporary VLAN ID registration used for CLC handshake. 
If ISM is * used, the VLAN ID will be registered again during the connection setup. */ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd, +static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, struct smc_init_info *ini) { - if (!is_smcd) + if (!smcd_indicated(ini->smc_type_v1)) return 0; - if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id)) + if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } +#define SMC_CLC_MAX_ACCEPT_LEN \ + (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ + sizeof(struct smc_clc_first_contact_ext) + \ + sizeof(struct smc_clc_msg_trail)) + /* CLC handshake during connect */ -static int smc_connect_clc(struct smc_sock *smc, int smc_type, - struct smc_clc_msg_accept_confirm *aclc, +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm_v2 *aclc2, struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ - rc = smc_clc_send_proposal(smc, smc_type, ini); + rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT, - CLC_WAIT_TIME); + return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + SMC_CLC_ACCEPT, CLC_WAIT_TIME); +} + +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid) +{ + struct smc_init_info *alt_ini = NULL; + + memset(gidlist, 0, sizeof(*gidlist)); + memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); + + alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); + if (!alt_ini) + goto out; + + alt_ini->vlan_id = lgr->vlan_id; + alt_ini->check_smcrv2 = true; + alt_ini->smcrv2.saddr = lgr->saddr; + smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); + + if (!alt_ini->smcrv2.ib_dev_v2) + goto out; + + memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, + SMC_GID_SIZE); + +out: + kfree(alt_ini); 
+} + +static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + struct smc_clc_first_contact_ext *fce = + (struct smc_clc_first_contact_ext *) + (((u8 *)clc_v2) + sizeof(*clc_v2)); + + if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) + return 0; + + if (fce->v2_direct) { + memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); + ini->smcrv2.uses_gateway = false; + } else { + if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, + smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), + ini->smcrv2.nexthop_mac, + &ini->smcrv2.uses_gateway)) + return SMC_CLC_DECL_NOROUTE; + if (!ini->smcrv2.uses_gateway) { + /* mismatch: peer claims indirect, but its direct */ + return SMC_CLC_DECL_NOINDIRECT; + } + } + return 0; } /* setup for RDMA connection of client */ @@ -596,13 +1182,20 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; + u8 *eid = NULL; ini->is_smcd = false; - ini->ib_lcl = &aclc->lcl; - ini->ib_clcqpn = ntoh24(aclc->qpn); - ini->srv_first_contact = aclc->hdr.flag; + ini->ib_clcqpn = ntoh24(aclc->r0.qpn); + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + + reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); + if (reason_code) + return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); @@ -610,49 +1203,94 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; smc_conn_save_peer_info(smc, 
aclc); + if (ini->first_contact_local) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->r0.qpn) && + !memcmp(l->peer_gid, &aclc->r0.lcl.gid, + SMC_GID_SIZE) && + (aclc->hdr.version > SMC_V1 || + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac)))) { + link = l; + break; + } + } + if (!link) { + reason_code = SMC_CLC_DECL_NOSRVLINK; + goto connect_abort; + } + smc_switch_link_and_count(&smc->conn, link); + } + /* create send buffer and rmb */ - if (smc_buf_create(smc, false)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, - ini->cln_first_contact); + if (smc_buf_create(smc, false)) { + reason_code = SMC_CLC_DECL_MEM; + goto connect_abort; + } - if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, aclc); + if (ini->first_contact_local) + smc_link_save_peer_info(link, aclc, ini); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, - ini->cln_first_contact); + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { + reason_code = SMC_CLC_DECL_ERR_RTOK; + goto connect_abort; + } smc_close_init(smc); smc_rx_init(smc); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { - if (smc_ib_ready_link(link)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, - ini->cln_first_contact); + if (ini->first_contact_local) { + if (smc_ib_ready_link(link)) { + reason_code = SMC_CLC_DECL_ERR_RDYLNK; + goto connect_abort; + } } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, - ini->cln_first_contact); + /* reg sendbufs if they were vzalloced */ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } + } + if (smcr_lgr_reg_rmbs(link, 
smc->conn.rmb_desc)) { + reason_code = SMC_CLC_DECL_ERR_REGBUF; + goto connect_abort; + } } - smc_rmb_sync_sg_for_device(&smc->conn); - reason_code = smc_clc_send_confirm(smc); + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->r1.eid; + if (ini->first_contact_local) + smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, + link->smcibdev, link->gid); + } + + reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, + aclc->hdr.version, eid, ini); if (reason_code) - return smc_connect_abort(smc, reason_code, - ini->cln_first_contact); + goto connect_abort; smc_tx_init(smc); - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) - return smc_connect_abort(smc, reason_code, - ini->cln_first_contact); + goto connect_abort; } mutex_unlock(&smc_client_lgr_pending); @@ -662,6 +1300,31 @@ static int smc_connect_rdma(struct smc_sock *smc, smc->sk.sk_state = SMC_ACTIVE; return 0; +connect_abort: + smc_conn_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_client_lgr_pending); + smc->connect_nonblock = 0; + + return reason_code; +} + +/* The server has chosen one of the proposed ISM devices for the communication. + * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. 
+ */ +static int +smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, + struct smc_init_info *ini) +{ + int i; + + for (i = 0; i < ini->ism_offered_cnt + 1; i++) { + if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + ini->ism_selected = i; + return 0; + } + } + + return -EPROTO; } /* setup for ISM connection of client */ @@ -669,11 +1332,21 @@ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + u8 *eid = NULL; int rc = 0; ini->is_smcd = true; - ini->ism_gid = aclc->gid; - ini->srv_first_contact = aclc->hdr.flag; + ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + + if (aclc->hdr.version == SMC_V2) { + struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + rc = smc_v2_determine_accepted_chid(aclc_v2, ini); + if (rc) + return rc; + } + ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid; /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); @@ -684,18 +1357,28 @@ static int smc_connect_ism(struct smc_sock *smc, } /* Create send and receive buffers */ - if (smc_buf_create(smc, true)) - return smc_connect_abort(smc, SMC_CLC_DECL_MEM, - ini->cln_first_contact); + rc = smc_buf_create(smc, true); + if (rc) { + rc = (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; + goto connect_abort; + } smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); - rc = smc_clc_send_confirm(smc); + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->d1.eid; + } + + rc = smc_clc_send_confirm(smc, ini->first_contact_local, + aclc->hdr.version, eid, NULL); if (rc) - return smc_connect_abort(smc, rc, ini->cln_first_contact); + goto connect_abort; mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); @@ -704,15 +1387,47 @@ static int smc_connect_ism(struct smc_sock *smc, smc->sk.sk_state = SMC_ACTIVE; return 0; +connect_abort: + smc_conn_abort(smc, ini->first_contact_local); + mutex_unlock(&smc_server_lgr_pending); + smc->connect_nonblock = 0; + + return rc; +} + +/* check if received accept type and version matches a proposed one */ +static int smc_connect_check_aclc(struct smc_init_info *ini, + struct smc_clc_msg_accept_confirm *aclc) +{ + if (aclc->hdr.typev1 != SMC_TYPE_R && + aclc->hdr.typev1 != SMC_TYPE_D) + return SMC_CLC_DECL_MODEUNSUPP; + + if (aclc->hdr.version >= SMC_V2) { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v2)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v2))) + return SMC_CLC_DECL_MODEUNSUPP; + } else { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v1)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v1))) + return SMC_CLC_DECL_MODEUNSUPP; + } + + return 0; } /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { - bool ism_supported = false, rdma_supported = false; - struct smc_clc_msg_accept_confirm aclc; - struct smc_init_info ini = {0}; - int smc_type; + u8 version = smc_ism_is_v2_capable() ? 
SMC_V2 : SMC_V1; + struct smc_clc_msg_accept_confirm_v2 *aclc2; + struct smc_clc_msg_accept_confirm *aclc; + struct smc_init_info *ini = NULL; + u8 *buf = NULL; int rc = 0; if (smc->use_fallback) @@ -722,58 +1437,84 @@ static int __smc_connect(struct smc_sock *smc) if (!tcp_sk(smc->clcsock->sk)->syn_smc) return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); - /* IPSec connections opt out of SMC-R optimizations */ + /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(smc)) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, + version); - /* get vlan id from IP device */ - if (smc_vlan_by_tcpsk(smc->clcsock, &ini)) - return smc_connect_decline_fallback(smc, - SMC_CLC_DECL_GETVLANERR); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, + version); - /* check if there is an ism device available */ - if (!smc_find_ism_device(smc, &ini) && - !smc_connect_ism_vlan_setup(smc, &ini)) { - /* ISM is supported for this connection */ - ism_supported = true; - smc_type = SMC_TYPE_D; - } - - /* check if there is a rdma device available */ - if (!smc_find_rdma_device(smc, &ini)) { - /* RDMA is supported for this connection */ - rdma_supported = true; - if (ism_supported) - smc_type = SMC_TYPE_B; /* both */ - else - smc_type = SMC_TYPE_R; /* only RDMA */ + ini->smcd_version = SMC_V1 | SMC_V2; + ini->smcr_version = SMC_V1 | SMC_V2; + ini->smc_type_v1 = SMC_TYPE_B; + ini->smc_type_v2 = SMC_TYPE_B; + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { + ini->smcd_version &= ~SMC_V1; + ini->smcr_version = 0; + ini->smc_type_v1 = SMC_TYPE_N; + if (!ini->smcd_version) { + rc = SMC_CLC_DECL_GETVLANERR; + goto fallback; + } } - /* if neither ISM nor RDMA are supported, fallback */ - if (!rdma_supported && !ism_supported) - return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV); + rc = 
smc_find_proposal_devices(smc, ini); + if (rc) + goto fallback; + + buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto fallback; + } + aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf; + aclc = (struct smc_clc_msg_accept_confirm *)aclc2; /* perform CLC handshake */ - rc = smc_connect_clc(smc, smc_type, &aclc, &ini); + rc = smc_connect_clc(smc, aclc2, ini); if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); - return smc_connect_decline_fallback(smc, rc); + /* -EAGAIN on timeout, see tcp_recvmsg() */ + if (rc == -EAGAIN) { + rc = -ETIMEDOUT; + smc->sk.sk_err = ETIMEDOUT; + } + goto vlan_cleanup; } + /* check if smc modes and versions of CLC proposal and accept match */ + rc = smc_connect_check_aclc(ini, aclc); + version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; + if (rc) + goto vlan_cleanup; + /* depending on previous steps, connect using rdma or ism */ - if (rdma_supported && aclc.hdr.path == SMC_TYPE_R) - rc = smc_connect_rdma(smc, &aclc, &ini); - else if (ism_supported && aclc.hdr.path == SMC_TYPE_D) - rc = smc_connect_ism(smc, &aclc, &ini); - else - rc = SMC_CLC_DECL_MODEUNSUPP; - if (rc) { - smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); - return smc_connect_decline_fallback(smc, rc); + if (aclc->hdr.typev1 == SMC_TYPE_R) { + ini->smcr_version = version; + rc = smc_connect_rdma(smc, aclc, ini); + } else if (aclc->hdr.typev1 == SMC_TYPE_D) { + ini->smcd_version = version; + rc = smc_connect_ism(smc, aclc, ini); } + if (rc) + goto vlan_cleanup; - smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini); + SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); + kfree(ini); return 0; + +vlan_cleanup: + smc_connect_ism_vlan_cleanup(smc, ini); + kfree(buf); +fallback: + kfree(ini); + return smc_connect_decline_fallback(smc, rc, version); } static void smc_connect_work(struct work_struct *work) @@ -789,7 +1530,7 @@ static void 
smc_connect_work(struct work_struct *work) if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & - (TCPF_SYN_SENT | TCP_SYN_RECV)) { + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & @@ -802,6 +1543,8 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ @@ -840,14 +1583,33 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + switch (sk->sk_state) { default: goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; case SMC_INIT: - rc = 0; break; } @@ -861,21 +1623,25 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc && rc != -EINPROGRESS) goto out; - sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) + if (smc->use_fallback) { + sock->state = rc ? 
SS_CONNECTING : SS_CONNECTED; goto out; + } + sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { - if (schedule_work(&smc->connect_work)) + if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; + goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; - else - rc = 0; /* success cases including fallback */ } +connected: + rc = 0; + sock->state = SS_CONNECTED; out: release_sock(sk); out_err: @@ -902,10 +1668,10 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) - rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); - if (rc < 0) + if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); @@ -918,6 +1684,23 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) goto out; } + /* new clcsock has inherited the smc listen-specific sk_data_ready + * function; switch it back to the original sk_data_ready function + */ + new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. 
+ */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -973,6 +1756,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, } if (new_sock) { sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; @@ -999,18 +1783,21 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; + /* reg the sndbuf if it was vzalloced*/ + if (smc->conn.sndbuf_desc->is_vm) { + if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) - return SMC_CLC_DECL_ERR_REGRMB; + /* reg the rmb */ + if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); @@ -1018,40 +1805,29 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + 
if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_AL : rc; - } - - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); return 0; } @@ -1061,6 +1837,9 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_dec(&lsmc->queued_smc_hs); + if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -1079,7 +1858,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; @@ -1090,7 +1868,9 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; + struct net *net = sock_net(newsmcsk); + this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; @@ -1100,21 +1880,18 @@ static void smc_listen_out_err(struct smc_sock *new_smc) /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, - int local_contact) + int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); - if (reason_code < 0) { /* error, no fallback possible */ + smc_conn_abort(new_smc, local_first); + if (reason_code < 0 || + smc_switch_to_fallback(new_smc, reason_code)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc); - 
new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { - if (smc_clc_send_decline(new_smc, reason_code) < 0) { + if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); return; } @@ -1122,6 +1899,64 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, smc_listen_out_connected(new_smc); } +/* listen worker: version checking */ +static int smc_listen_v2_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; + struct smc_clc_v2_extension *pclc_v2_ext; + int rc = SMC_CLC_DECL_PEERNOSMC; + + ini->smc_type_v1 = pclc->hdr.typev1; + ini->smc_type_v2 = pclc->hdr.typev2; + ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + if (pclc->hdr.version > SMC_V1) { + if (smcd_indicated(ini->smc_type_v2)) + ini->smcd_version |= SMC_V2; + if (smcr_indicated(ini->smc_type_v2)) + ini->smcr_version |= SMC_V2; + } + if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { + rc = SMC_CLC_DECL_PEERNOSMC; + goto out; + } + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) { + ini->smcd_version &= ~SMC_V2; + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2EXT; + goto out; + } + pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); + if (ini->smcd_version & SMC_V2) { + if (!smc_ism_is_v2_capable()) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; + } else if (!pclc_smcd_v2_ext) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } else if (!pclc_v2_ext->hdr.eid_cnt && + !pclc_v2_ext->hdr.flag.seid) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + if (ini->smcr_version & SMC_V2) { + if (!pclc_v2_ext->hdr.eid_cnt) { + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + +out: + if (!ini->smcd_version && 
!ini->smcr_version) + return rc; + + return 0; +} + /* listen worker: check prefixes */ static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) @@ -1129,6 +1964,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; + if (pclc->hdr.typev1 == SMC_TYPE_N) + return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -1156,99 +1993,338 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, - struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - struct smc_clc_msg_smcd *pclc_smcd; int rc; - pclc_smcd = smc_get_clc_msg_smcd(pclc); - ini->ism_gid = pclc_smcd->gid; rc = smc_conn_create(new_smc, ini); if (rc) return rc; - /* Check if peer can be reached via ISM device */ - if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid, - new_smc->conn.lgr->vlan_id, - new_smc->conn.lgr->smcd)) { - if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_SMCDNOTALK; - } - /* Create send and receive buffers */ - if (smc_buf_create(new_smc, true)) { - if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_cleanup_early(&new_smc->conn); - else - smc_conn_free(&new_smc->conn); - return SMC_CLC_DECL_MEM; + rc = smc_buf_create(new_smc, true); + if (rc) { + smc_conn_abort(new_smc, ini->first_contact_local); + return (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : + SMC_CLC_DECL_MEM; } return 0; } +static bool smc_is_already_selected(struct smcd_dev *smcd, + struct smc_init_info *ini, + int matches) +{ + int i; + + for (i = 0; i < matches; i++) + if (smcd == ini->ism_dev[i]) + return true; + + return false; +} + +/* check for ISM devices matching proposed ISM devices */ +static void smc_check_ism_v2_match(struct smc_init_info *ini, + u16 proposed_chid, u64 proposed_gid, + unsigned int *matches) +{ + struct smcd_dev *smcd; + + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->going_away) + continue; + if (smc_is_already_selected(smcd, ini, *matches)) + continue; + if (smc_ism_get_chid(smcd) == proposed_chid && + !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { + ini->ism_peer_gid[*matches] = proposed_gid; + ini->ism_dev[*matches] = smcd; + (*matches)++; + break; + } + } +} + +static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +{ + if (!ini->rc) + ini->rc = rc; +} + +static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_v2_extension *smc_v2_ext; + struct smc_clc_msg_smcd *pclc_smcd; + unsigned int matches = 0; + u8 smcd_version; + u8 *eid = NULL; + int i, rc; + + if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) + goto not_found; + + pclc_smcd = smc_get_clc_msg_smcd(pclc); + smc_v2_ext = smc_get_clc_v2_ext(pclc); + smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + + mutex_lock(&smcd_dev_list.mutex); + if (pclc_smcd->ism.chid) + /* check for ISM device matching proposed native ISM device */ + smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), + ntohll(pclc_smcd->ism.gid), &matches); + for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { + /* check for ISM devices matching proposed non-native ISM + * devices + */ + smc_check_ism_v2_match(ini, + ntohs(smcd_v2_ext->gidchid[i - 1].chid), + 
ntohll(smcd_v2_ext->gidchid[i - 1].gid), + &matches); + } + mutex_unlock(&smcd_dev_list.mutex); + + if (!ini->ism_dev[0]) { + smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); + goto not_found; + } + + smc_ism_get_system_eid(&eid); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, + smcd_v2_ext->system_eid, eid)) + goto not_found; + + /* separate - outside the smcd_dev_list.lock */ + smcd_version = ini->smcd_version; + for (i = 0; i < matches; i++) { + ini->smcd_version = SMC_V2; + ini->is_smcd = true; + ini->ism_selected = i; + rc = smc_listen_ism_init(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + /* try next active ISM device */ + continue; + } + return; /* matching and usable V2 ISM device found */ + } + /* no V2 ISM device could be initialized */ + ini->smcd_version = smcd_version; /* restore original value */ + ini->negotiated_eid[0] = 0; + +not_found: + ini->smcd_version &= ~SMC_V2; + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + +static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); + int rc = 0; + + /* check if ISM V1 is available */ + if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + goto not_found; + ini->is_smcd = true; /* prepare ISM check */ + ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); + rc = smc_find_ism_device(new_smc, ini); + if (rc) + goto not_found; + ini->ism_selected = 0; + rc = smc_listen_ism_init(new_smc, ini); + if (!rc) + return; /* V1 ISM device found */ + +not_found: + smc_find_ism_store_rc(rc, ini); + ini->smcd_version &= ~SMC_V1; + ini->ism_dev[0] = NULL; + ini->is_smcd = false; +} + /* listen worker: register buffers */ -static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) { - struct smc_link *link = 
&new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_connection *conn = &new_smc->conn; - if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) - return SMC_CLC_DECL_ERR_REGRMB; + if (!local_first) { + /* reg sendbufs if they were vzalloced */ + if (conn->sndbuf_desc->is_vm) { + if (smcr_lgr_reg_sndbufs(conn->lnk, + conn->sndbuf_desc)) + return SMC_CLC_DECL_ERR_REGBUF; + } + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) + return SMC_CLC_DECL_ERR_REGBUF; } - smc_rmb_sync_sg_for_device(&new_smc->conn); return 0; } +static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *smc_v2_ext; + u8 smcr_version; + int rc; + + if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) + goto not_found; + + smc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + goto not_found; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + ini->check_smcrv2 = true; + ini->smcrv2.clc_sk = new_smc->clcsock->sk; + ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + goto not_found; + } + if (!ini->smcrv2.uses_gateway) + memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); + + smcr_version = ini->smcr_version; + ini->smcr_version = SMC_V2; + rc = smc_listen_rdma_init(new_smc, ini); + if (!rc) + rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (!rc) + return; + ini->smcr_version = smcr_version; + smc_find_ism_store_rc(rc, ini); + +not_found: + ini->smcr_version &= ~SMC_V2; + ini->smcrv2.ib_dev_v2 = NULL; + ini->check_smcrv2 = false; +} + +static 
int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int rc; + + if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) + return SMC_CLC_DECL_NOSMCDEV; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + /* no RDMA device found */ + return SMC_CLC_DECL_NOSMCDEV; + } + rc = smc_listen_rdma_init(new_smc, ini); + if (rc) + return rc; + return smc_listen_rdma_reg(new_smc, ini->first_contact_local); +} + +/* determine the local device matching to proposal */ +static int smc_listen_find_device(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + int prfx_rc; + + /* check for ISM device matching V2 proposed device */ + smc_find_ism_v2_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); + + /* get vlan id from IP device */ + if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) + return ini->rc ?: SMC_CLC_DECL_GETVLANERR; + + /* check for ISM device matching V1 proposed device */ + if (!prfx_rc) + smc_find_ism_v1_device_serv(new_smc, pclc, ini); + if (ini->ism_dev[0]) + return 0; + + if (!smcr_indicated(pclc->hdr.typev1) && + !smcr_indicated(pclc->hdr.typev2)) + /* skip RDMA and decline */ + return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; + + /* check if RDMA V2 is available */ + smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (ini->smcrv2.ib_dev_v2) + return 0; + + /* check if RDMA V1 is available */ + if (!prfx_rc) { + int rc; + + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + return (!rc) ? 
0 : ini->rc; + } + return SMC_CLC_DECL_NOSMCDEV; +} + /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, - int local_contact) + bool local_first, + struct smc_init_info *ini) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; - if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, cclc); + if (local_first) + smc_link_save_peer_info(link, cclc, ini); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { - reason_code = SMC_CLC_DECL_ERR_RTOK; - goto decline; - } + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) + return SMC_CLC_DECL_ERR_RTOK; - if (local_contact == SMC_FIRST_CONTACT) { - if (smc_ib_ready_link(link)) { - reason_code = SMC_CLC_DECL_ERR_RDYLNK; - goto decline; - } + if (local_first) { + if (smc_ib_ready_link(link)) + return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code) - goto decline; + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); } - return 0; - -decline: - smc_listen_decline(new_smc, reason_code, local_contact); return reason_code; } -/* setup for RDMA connection of server */ +/* setup for connection of server */ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); struct socket *newclcsock = new_smc->clcsock; - struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_accept_confirm *cclc; + struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; - struct smc_init_info ini = {0}; - bool ism_supported = false; - u8 buf[SMC_CLC_MAX_LEN]; + struct smc_init_info *ini = NULL; + u8 proposal_version = SMC_V1; + u8 accept_version; int rc = 0; if 
(new_smc->listen_smc->sk.sk_state != SMC_LISTEN) @@ -1261,111 +2337,101 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc); - new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC; - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); return; } /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ - pclc = (struct smc_clc_msg_proposal *)&buf; - rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + pclc = (struct smc_clc_msg_proposal *)buf; + rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; - /* IPSec connections opt out of SMC-R optimizations */ + if (pclc->hdr.version > SMC_V1) + proposal_version = SMC_V2; + + /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { rc = SMC_CLC_DECL_IPSEC; goto out_decl; } - /* check for matching IP prefix and subnet length */ - rc = smc_listen_prfx_check(new_smc, pclc); - if (rc) + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = SMC_CLC_DECL_MEM; goto out_decl; + } - /* get vlan id from IP device */ - if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) { - rc = SMC_CLC_DECL_GETVLANERR; + /* initial version checking */ + rc = smc_listen_v2_check(new_smc, pclc, ini); + if (rc) goto out_decl; - } mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); - /* check if ISM is available */ - if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) { - ini.is_smcd = true; /* prepare ISM check */ - rc = smc_find_ism_device(new_smc, &ini); - if (!rc) - rc = smc_listen_ism_init(new_smc, pclc, &ini); - if (!rc) - ism_supported = 
true; - else if (pclc->hdr.path == SMC_TYPE_D) - goto out_unlock; /* skip RDMA and decline */ - } - - /* check if RDMA is available */ - if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */ - /* prepare RDMA check */ - ini.is_smcd = false; - ini.ism_dev = NULL; - ini.ib_lcl = &pclc->lcl; - rc = smc_find_rdma_device(new_smc, &ini); - if (rc) { - /* no RDMA device found */ - if (pclc->hdr.path == SMC_TYPE_B) - /* neither ISM nor RDMA device found */ - rc = SMC_CLC_DECL_NOSMCDEV; - goto out_unlock; - } - rc = smc_listen_rdma_init(new_smc, &ini); - if (rc) - goto out_unlock; - rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact); - if (rc) - goto out_unlock; - } + /* determine ISM or RoCE device used for connection */ + rc = smc_listen_find_device(new_smc, pclc, ini); + if (rc) + goto out_unlock; /* send SMC Accept CLC message */ - rc = smc_clc_send_accept(new_smc, ini.cln_first_contact); + accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; + rc = smc_clc_send_accept(new_smc, ini->first_contact_local, + accept_version, ini->negotiated_eid); if (rc) goto out_unlock; /* SMC-D does not need this lock any more */ - if (ism_supported) + if (ini->is_smcd) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ - rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + memset(buf, 0, sizeof(*buf)); + cclc = (struct smc_clc_msg_accept_confirm *)buf; + rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (rc) { - if (!ism_supported) + if (!ini->is_smcd) goto out_unlock; goto out_decl; } /* finish worker */ - if (!ism_supported) { - rc = smc_listen_rdma_finish(new_smc, &cclc, - ini.cln_first_contact); - mutex_unlock(&smc_server_lgr_pending); + if (!ini->is_smcd) { + rc = smc_listen_rdma_finish(new_smc, cclc, + ini->first_contact_local, ini); if (rc) - return; + goto out_unlock; + mutex_unlock(&smc_server_lgr_pending); } - smc_conn_save_peer_info(new_smc, &cclc); + smc_conn_save_peer_info(new_smc, cclc); 
smc_listen_out_connected(new_smc); - return; + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + goto out_free; out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: - smc_listen_decline(new_smc, rc, ini.cln_first_contact); + smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, + proposal_version); +out_free: + kfree(ini); + kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) @@ -1379,11 +2445,14 @@ static void smc_tcp_listen_work(struct work_struct *work) lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); - if (rc) + if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -1393,13 +2462,31 @@ static void smc_tcp_listen_work(struct work_struct *work) new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ - if (!schedule_work(&new_smc->smc_listen_work)) + if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); - sock_put(&lsmc->sk); /* sock_hold in smc_listen */ + sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ +} + +static void smc_clcsock_data_ready(struct sock *listen_clcsock) +{ + struct smc_sock *lsmc; + + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); + if (!lsmc) + goto out; + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); + } +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -1413,7 +2500,7 @@ 
static int smc_listen(struct socket *sock, int backlog) rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || - smc->connect_nonblock) + smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; @@ -1428,15 +2515,39 @@ static int smc_listen(struct socket *sock, int backlog) if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; + /* save original sk_data_ready function and establish + * smc-specific sk_data_ready function + */ + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc->clcsock->sk->sk_user_data = + (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + + /* save original ops */ + smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; + + smc->af_ops = *smc->ori_af_ops; + smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; + + inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + rc = kernel_listen(smc->clcsock, backlog); - if (rc) + if (rc) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; + } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; - sock_hold(sk); /* sock_hold in tcp_listen_worker */ - if (!schedule_work(&smc->tcp_listen_work)) - sock_put(sk); out: release_sock(sk); @@ -1542,18 +2653,21 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + if (rc) + 
goto out; } else { rc = -EINVAL; goto out; } } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); - else + } else { rc = smc_tx_sendmsg(smc, msg, len); + SMC_STAT_TX_PAYLOAD(smc, len, rc); + } out: release_sock(sk); return rc; @@ -1588,6 +2702,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: @@ -1664,8 +2779,10 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, static int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; + bool do_shutdown = true; struct smc_sock *smc; int rc = -EINVAL; + int old_state; int rc1 = 0; smc = smc_sk(sk); @@ -1675,6 +2792,17 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && @@ -1686,13 +2814,20 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sk->sk_socket->state = SS_UNCONNECTED; + sock_put(sk); + } goto out; } switch (how) { case SHUT_RDWR: /* shutdown in both directions */ + old_state = sk->sk_state; rc = smc_close_active(smc); + if (old_state == SMC_ACTIVE && + sk->sk_state == SMC_PEERCLOSEWAIT1) + do_shutdown = false; break; case SHUT_WR: rc = smc_close_shutdown_write(smc); @@ -1702,53 +2837,134 
@@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (smc->clcsock) + if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; + if (sk->sk_state == SMC_CLOSED) + sock->state = SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; } +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) { + rc = -EINVAL; + break; + } + if (copy_from_sockptr(&val, optval, sizeof(int))) { + rc = -EFAULT; + break; + } + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + static int smc_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; + if (level == SOL_TCP && optname == TCP_ULP) + return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); + smc = smc_sk(sk); /* generic setsockopts 
reaching us here always apply to the * CLC socket */ - rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } + if (unlikely(!smc->clcsock->ops->setsockopt)) + rc = -EOPNOTSUPP; + else + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; - sk->sk_error_report(sk); + sk_error_report(sk); } + mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; - if (get_user(val, (int __user *)optval)) + if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); if (rc || smc->use_fallback) goto out; switch (optname) { - case TCP_ULP: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc); - smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP; + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } @@ -1757,18 +2973,22 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (val) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + if (val) { + SMC_STAT_INC(smc, ndly_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { - if (!val) - mod_delayed_work(system_wq, &smc->conn.tx_work, - 0); + if (!val) { + SMC_STAT_INC(smc, cork_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } } break; case TCP_DEFER_ACCEPT: @@ -1787,11 +3007,26 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, 
char __user *optval, int __user *optlen) { struct smc_sock *smc; + int rc; + + if (level == SOL_SMC) + return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + mutex_unlock(&smc->clcsock_release_lock); + return -EBADF; + } /* socket options apply to the CLC socket */ - return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, - optval, optlen); + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); + return -EOPNOTSUPP; + } + rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); + mutex_unlock(&smc->clcsock_release_lock); + return rc; } static int smc_ioctl(struct socket *sock, unsigned int cmd, @@ -1889,11 +3124,15 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, goto out; } release_sock(sk); - if (smc->use_fallback) + if (smc->use_fallback) { rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); - else - rc = sock_no_sendpage(sock, page, offset, size, flags); + } else { + lock_sock(sk); + rc = smc_tx_sendpage(smc, page, offset, size, flags); + release_sock(sk); + SMC_STAT_INC(smc, sendpage_cnt); + } out: return rc; @@ -1942,6 +3181,7 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, flags = MSG_DONTWAIT; else flags = 0; + SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: @@ -1973,8 +3213,8 @@ static const struct proto_ops smc_sock_ops = { .splice_read = smc_splice_read, }; -static int smc_create(struct net *net, struct socket *sock, int protocol, - int kern) +static int __smc_create(struct net *net, struct socket *sock, int protocol, + int kern, struct socket *clcsock) { int family = (protocol == SMCPROTO_SMC6) ? 
PF_INET6 : PF_INET; struct smc_sock *smc; @@ -1991,6 +3231,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, rc = -ENOBUFS; sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; @@ -1999,37 +3240,124 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; + + /* default behavior from limit_smc_hs in every net namespace */ + smc->limit_smc_hs = net->smc.limit_smc_hs; + + rc = 0; + if (!clcsock) { + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) { + sk_common_release(sk); + goto out; + } + } else { + smc->clcsock = clcsock; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); out: return rc; } +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + return __smc_create(net, sock, protocol, kern, NULL); +} + static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; +static int smc_ulp_init(struct sock *sk) +{ + struct socket *tcp = sk->sk_socket; + struct net *net = sock_net(sk); + struct socket *smcsock; + int protocol, ret; + + /* only TCP can be replaced */ + if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || + (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) + return -ESOCKTNOSUPPORT; + /* don't handle wq now */ + if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) + return -ENOTCONN; + + if (sk->sk_family == AF_INET) + protocol = SMCPROTO_SMC; + else + protocol = SMCPROTO_SMC6; + + smcsock = sock_alloc(); + if (!smcsock) + return -ENFILE; + + 
smcsock->type = SOCK_STREAM; + __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ + ret = __smc_create(net, smcsock, protocol, 1, tcp); + if (ret) { + sock_release(smcsock); /* module_put() which ops won't be NULL */ + return ret; + } + + /* replace tcp socket to smc */ + smcsock->file = tcp->file; + smcsock->file->private_data = smcsock; + smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ + smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ + tcp->file = NULL; + + return ret; +} + +static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + /* don't inherit ulp ops to child when listen */ + icsk->icsk_ulp_ops = NULL; +} + +static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { + .name = "smc", + .owner = THIS_MODULE, + .init = smc_ulp_init, + .clone = smc_ulp_clone, +}; + unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { + int rc; + + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } +static __net_init int smc_net_stat_init(struct net *net) +{ + return smc_stats_init(net); +} + +static void __net_exit smc_net_stat_exit(struct net *net) +{ + smc_stats_exit(net); +} + static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, @@ -2037,6 +3365,11 @@ static struct pernet_operations smc_net_ops = { .size = sizeof(struct smc_net), }; +static struct pernet_operations smc_net_stat_ops = { + .init = smc_net_stat_init, + .exit = smc_net_stat_exit, +}; + static int __init smc_init(void) { int rc; @@ -2045,14 +3378,39 @@ static int __init smc_init(void) if (rc) return rc; - rc = smc_pnet_init(); + rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) goto out_pernet_subsys; + 
smc_ism_init(); + smc_clc_init(); + + rc = smc_nl_init(); + if (rc) + goto out_pernet_subsys_stat; + + rc = smc_pnet_init(); + if (rc) + goto out_nl; + + rc = -ENOMEM; + + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + if (!smc_tcp_ls_wq) + goto out_pnet; + + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + if (!smc_hs_wq) + goto out_alloc_tcp_ls_wq; + + smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + if (!smc_close_wq) + goto out_alloc_hs_wq; + rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); - goto out_pnet; + goto out_alloc_wqs; } rc = smc_llc_init(); @@ -2093,9 +3451,17 @@ static int __init smc_init(void) goto out_sock; } + rc = tcp_register_ulp(&smc_ulp_ops); + if (rc) { + pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + goto out_ib; + } + static_branch_enable(&tcp_have_smc); return 0; +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -2104,8 +3470,18 @@ out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); +out_alloc_wqs: + destroy_workqueue(smc_close_wq); +out_alloc_hs_wq: + destroy_workqueue(smc_hs_wq); +out_alloc_tcp_ls_wq: + destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); +out_nl: + smc_nl_exit(); +out_pernet_subsys_stat: + unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); @@ -2115,12 +3491,19 @@ out_pernet_subsys: static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_tcp_ls_wq); + destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); + smc_nl_exit(); + smc_clc_exit(); + unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } @@ -2132,3 +3515,5 @@ MODULE_AUTHOR("Ursula Braun 
<ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); +MODULE_ALIAS_TCP_ULP("smc"); +MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..5ed765ea0c73 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -14,13 +14,23 @@ #include <linux/socket.h> #include <linux/types.h> #include <linux/compiler.h> /* __aligned */ +#include <net/genetlink.h> #include <net/sock.h> #include "smc_ib.h" +#define SMC_V1 1 /* SMC version V1 */ +#define SMC_V2 2 /* SMC version V2 */ +#define SMC_RELEASE 0 + #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ +#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM + * devices + */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ + extern struct proto smc_proto; extern struct proto smc_proto6; @@ -48,7 +58,20 @@ enum smc_state { /* possible states of an SMC socket */ struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ - u8 type; + union { + u8 type; +#if defined(__BIG_ENDIAN_BITFIELD) + struct { + u8 llc_version:4, + llc_type:4; + }; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + struct { + u8 llc_type:4, + llc_version:4; + }; +#endif + }; } __aligned(1); struct smc_cdc_conn_state_flags { @@ -118,9 +141,16 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; +struct smc_mark_woken { + bool woken; + void *key; + wait_queue_entry_t wait_entry; +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ @@ -142,6 +172,9 @@ struct smc_connection { * .prod cf. TCP snd_nxt * .cons cf. 
TCP sends ack */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ @@ -153,7 +186,14 @@ struct smc_connection { */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ + atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe + * - inc when post wqe, + * - dec on polled tx cqe + */ + wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -173,6 +213,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ @@ -183,17 +227,28 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 freed : 1; /* normal termiation */ + u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + void (*clcsk_state_change)(struct sock *sk); + /* original stat_change fct. */ + void (*clcsk_data_ready)(struct sock *sk); + /* original data_ready fct. */ + void (*clcsk_write_space)(struct sock *sk); + /* original write_space fct. */ + void (*clcsk_error_report)(struct sock *sk); + /* original error_report fct. 
*/ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ @@ -201,9 +256,14 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value @@ -228,10 +288,51 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + +static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + +extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ +extern struct 
workqueue_struct *smc_close_wq; /* wq for close work */ + #define SMC_SYSTEMID_LEN 8 extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ +#define ntohll(x) be64_to_cpu(x) +#define htonll(x) cpu_to_be64(x) + /* convert an u32 value into network byte order, store it into a 3 byte field */ static inline void hton24(u8 *net, u32 host) { @@ -263,7 +364,17 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif +struct smc_gidlist; + struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid); + +/* smc handshake limitation interface for netlink */ +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..53f63bfbaf5f 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, struct smc_sock *smc; int diff; - if (!conn) - /* already dismissed */ - return; - smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { @@ -47,25 +43,48 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } + + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If user owns the sock_lock, mark the connection need sending. 
+ * User context will later try to send when it release sock_lock + * in smc_release_cb() + */ + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } + WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); + smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, wr_rdma_buf, (struct smc_wr_tx_pend_priv **)pend); - if (conn->killed) + if (conn->killed) { /* abnormal termination */ + if (!rc) + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)(*pend)); rc = -EPIPE; + } return rc; } @@ -91,39 +110,96 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + atomic_dec(&conn->cdc_pend_tx_wr); } return rc; } +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct 
smc_wr_buf *wr_buf) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_msg *peer; + int rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + /* We need to set pend->conn here to make sure smc_cdc_tx_handler() + * can handle properly + */ + smc_cdc_add_pending_send(conn, pend); + + atomic_inc(&conn->cdc_pend_tx_wr); + smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (unlikely(rc)) + atomic_dec(&conn->cdc_pend_tx_wr); + + return rc; +} + static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); +again: + link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) - return rc; + goto put_out; spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); + if (again) + return -ENOLINK; + again = true; + goto again; + } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -131,7 +207,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; - if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + if (!smc_conn_lgr_valid(conn) || + (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) return -EPIPE; if (conn->lgr->is_smcd) { @@ -145,31 +222,9 @@ int 
smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) return rc; } -static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, - unsigned long data) +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn) { - struct smc_connection *conn = (struct smc_connection *)data; - struct smc_cdc_tx_pend *cdc_pend = - (struct smc_cdc_tx_pend *)tx_pend; - - return cdc_pend->conn == conn; -} - -static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) -{ - struct smc_cdc_tx_pend *cdc_pend = - (struct smc_cdc_tx_pend *)tx_pend; - - cdc_pend->conn = NULL; -} - -void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) -{ - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - - smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, - smc_cdc_tx_filter, smc_cdc_tx_dismisser, - (unsigned long)conn); + wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr)); } /* Send a SMC-D CDC header. @@ -239,6 +294,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, sk_send_sigurg(&smc->sk); } +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!queue_work(smc_close_wq, &conn->abort_work)) + sock_put(&smc->sk); + } +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_cdc_msg *cdc) { @@ -283,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && 
smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { @@ -303,7 +384,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ - if (!schedule_work(&conn->close_work)) + if (!queue_work(smc_close_wq, &conn->close_work)) sock_put(&smc->sk); } } @@ -324,9 +405,9 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) * Context: * - tasklet context */ -static void smcd_cdc_rx_tsklet(unsigned long data) +static void smcd_cdc_rx_tsklet(struct tasklet_struct *t) { - struct smc_connection *conn = (struct smc_connection *)data; + struct smc_connection *conn = from_tasklet(conn, t, rx_tsklet); struct smcd_cdc_msg *data_cdc; struct smcd_cdc_msg cdc; struct smc_sock *smc; @@ -346,7 +427,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data) */ void smcd_cdc_rx_init(struct smc_connection *conn) { - tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn); + tasklet_setup(&conn->rx_tsklet, smcd_cdc_rx_tsklet); } /***************************** init, exit, misc ******************************/ @@ -369,16 +450,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!conn || conn->out_of_sync) return; smc = container_of(conn, struct smc_sock, conn); - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - 
return; + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + smc_cdc_msg_recv(smc, cdc); } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 861dc24c588c..696cc11f2303 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs, } } -/* SMC cursors are 8 bytes long and require atomic reading and writing */ -static inline u64 smc_curs_read(union smc_host_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - /* Copy cursor src into tgt */ static inline void smc_curs_copy(union smc_host_cursor *tgt, union smc_host_cursor *src, @@ -304,14 +287,18 @@ struct smc_cdc_tx_pend { }; int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); -void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); +void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 86cccc24e52e..1472f31480d8 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -14,6 +14,8 @@ #include <linux/inetdevice.h> 
#include <linux/if_ether.h> #include <linux/sched/signal.h> +#include <linux/utsname.h> +#include <linux/ctype.h> #include <net/addrconf.h> #include <net/sock.h> @@ -24,22 +26,415 @@ #include "smc_clc.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_netlink.h" #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 +#define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 +#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108 +#define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; /* eye catcher "SMCD" EBCDIC for CLC messages */ static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; +static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN]; + +struct smc_clc_eid_table { + rwlock_t lock; + struct list_head list; + u8 ueid_cnt; + u8 seid_enabled; +}; + +static struct smc_clc_eid_table smc_clc_eid_table; + +struct smc_clc_eid_entry { + struct list_head list; + u8 eid[SMC_MAX_EID_LEN]; +}; + +/* The size of a user EID is 32 characters. + * Valid characters should be (single-byte character set) A-Z, 0-9, '.' and '-'. + * Blanks should only be used to pad to the expected size. + * First character must be alphanumeric. + */ +static bool smc_clc_ueid_valid(char *ueid) +{ + char *end = ueid + SMC_MAX_EID_LEN; + + while (--end >= ueid && isspace(*end)) + ; + if (end < ueid) + return false; + if (!isalnum(*ueid) || islower(*ueid)) + return false; + while (ueid <= end) { + if ((!isalnum(*ueid) || islower(*ueid)) && *ueid != '.' 
&& + *ueid != '-') + return false; + ueid++; + } + return true; +} + +static int smc_clc_ueid_add(char *ueid) +{ + struct smc_clc_eid_entry *new_ueid, *tmp_ueid; + int rc; + + if (!smc_clc_ueid_valid(ueid)) + return -EINVAL; + + /* add a new ueid entry to the ueid table if there isn't one */ + new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL); + if (!new_ueid) + return -ENOMEM; + memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN); + + write_lock(&smc_clc_eid_table.lock); + if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) { + rc = -ERANGE; + goto err_out; + } + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + rc = -EEXIST; + goto err_out; + } + } + list_add_tail(&new_ueid->list, &smc_clc_eid_table.list); + smc_clc_eid_table.ueid_cnt++; + write_unlock(&smc_clc_eid_table.lock); + return 0; + +err_out: + write_unlock(&smc_clc_eid_table.lock); + kfree(new_ueid); + return rc; +} + +int smc_clc_ueid_count(void) +{ + int count; + + read_lock(&smc_clc_eid_table.lock); + count = smc_clc_eid_table.ueid_cnt; + read_unlock(&smc_clc_eid_table.lock); + + return count; +} + +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_add(ueid); +} + +/* remove one or all ueid entries from the table */ +static int smc_clc_ueid_remove(char *ueid) +{ + struct smc_clc_eid_entry *lst_ueid, *tmp_ueid; + int rc = -ENOENT; + + /* remove table entry */ + write_lock(&smc_clc_eid_table.lock); + list_for_each_entry_safe(lst_ueid, tmp_ueid, &smc_clc_eid_table.list, + list) { + if (!ueid || !memcmp(lst_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + list_del(&lst_ueid->list); + smc_clc_eid_table.ueid_cnt--; + kfree(lst_ueid); + rc = 0; + } + } + if (!rc && !smc_clc_eid_table.ueid_cnt) { + 
smc_clc_eid_table.seid_enabled = 1; + rc = -EAGAIN; /* indicate success and enabling of seid */ + } + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_remove(ueid); +} + +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info) +{ + smc_clc_ueid_remove(NULL); + return 0; +} + +static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, + u32 flags, char *ueid) +{ + char ueid_str[SMC_MAX_EID_LEN + 1]; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_gen_nl_family, + flags, SMC_NETLINK_DUMP_UEID); + if (!hdr) + return -ENOMEM; + memcpy(ueid_str, ueid, SMC_MAX_EID_LEN); + ueid_str[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq, + int start_idx) +{ + struct smc_clc_eid_entry *lst_ueid; + int idx = 0; + + read_lock(&smc_clc_eid_table.lock); + list_for_each_entry(lst_ueid, &smc_clc_eid_table.list, list) { + if (idx++ < start_idx) + continue; + if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI, + lst_ueid->eid)) { + --idx; + break; + } + } + read_unlock(&smc_clc_eid_table.lock); + return idx; +} + +int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int idx; + + idx = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb_ctx->pos[0]); + + cb_ctx->pos[0] = idx; + return skb->len; +} + +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char 
seid_str[SMC_MAX_EID_LEN + 1]; + u8 seid_enabled; + void *hdr; + u8 *seid; + + if (cb_ctx->pos[0]) + return skb->len; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_SEID); + if (!hdr) + return -ENOMEM; + if (!smc_ism_is_v2_capable()) + goto end; + + smc_ism_get_system_eid(&seid); + memcpy(seid_str, seid, SMC_MAX_EID_LEN); + seid_str[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str)) + goto err; + read_lock(&smc_clc_eid_table.lock); + seid_enabled = smc_clc_eid_table.seid_enabled; + read_unlock(&smc_clc_eid_table.lock); + if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled)) + goto err; +end: + genlmsg_end(skb, hdr); + cb_ctx->pos[0]++; + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info) +{ + write_lock(&smc_clc_eid_table.lock); + smc_clc_eid_table.seid_enabled = 1; + write_unlock(&smc_clc_eid_table.lock); + return 0; +} + +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + + write_lock(&smc_clc_eid_table.lock); + if (!smc_clc_eid_table.ueid_cnt) + rc = -ENOENT; + else + smc_clc_eid_table.seid_enabled = 0; + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +static bool _smc_clc_match_ueid(u8 *peer_ueid) +{ + struct smc_clc_eid_entry *tmp_ueid; + + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, peer_ueid, SMC_MAX_EID_LEN)) + return true; + } + return false; +} + +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid) +{ + bool match = false; + int i; + + negotiated_eid[0] = 0; + read_lock(&smc_clc_eid_table.lock); + if (peer_eid && local_eid && + smc_clc_eid_table.seid_enabled && + smc_v2_ext->hdr.flag.seid && + !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) { + memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN); + 
match = true; + goto out; + } + + for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) { + if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) { + memcpy(negotiated_eid, smc_v2_ext->user_eids[i], + SMC_MAX_EID_LEN); + match = true; + goto out; + } + } +out: + read_unlock(&smc_clc_eid_table.lock); + return match; +} + +/* check arriving CLC proposal */ +static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_msg_hdr *hdr = &pclc->hdr; + struct smc_clc_v2_extension *v2_ext; + + v2_ext = smc_get_clc_v2_ext(pclc); + pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (hdr->version == SMC_V1) { + if (hdr->typev1 == SMC_TYPE_N) + return false; + if (ntohs(hdr->length) != + sizeof(*pclc) + ntohs(pclc->iparea_offset) + + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) + + sizeof(struct smc_clc_msg_trail)) + return false; + } else { + if (ntohs(hdr->length) != + sizeof(*pclc) + + sizeof(struct smc_clc_msg_smcd) + + (hdr->typev1 != SMC_TYPE_N ? + sizeof(*pclc_prfx) + + pclc_prfx->ipv6_prefixes_cnt * + sizeof(struct smc_clc_ipv6_prefix) : 0) + + (hdr->typev2 != SMC_TYPE_N ? + sizeof(*v2_ext) + + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN : 0) + + (smcd_indicated(hdr->typev2) ? 
+ sizeof(*smcd_v2_ext) + v2_ext->hdr.ism_gid_cnt * + sizeof(struct smc_clc_smcd_gid_chid) : + 0) + + sizeof(struct smc_clc_msg_trail)) + return false; + } + return true; +} + +/* check arriving CLC accept or confirm */ +static bool +smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) +{ + struct smc_clc_msg_hdr *hdr = &clc_v2->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if ((hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || + (hdr->typev1 == SMC_TYPE_D && + ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) + return false; + } else { + if (hdr->typev1 == SMC_TYPE_D && + ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 && + (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + + sizeof(struct smc_clc_first_contact_ext))) + return false; + if (hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) + return false; + } + return true; +} + +/* check arriving CLC decline */ +static bool +smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) +{ + struct smc_clc_msg_hdr *hdr = &dclc->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) + return false; + } else { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2)) + return false; + } + return true; +} + +static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) +{ + memset(fce, 0, sizeof(*fce)); + fce->os_type = SMC_CLC_OS_LINUX; + fce->release = SMC_RELEASE; + memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); + (*len) += sizeof(*fce); +} + /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ -static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) +static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool 
check_trl) { - struct smc_clc_msg_proposal_prefix *pclc_prfx; - struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_msg_accept_confirm_v2 *clc_v2; struct smc_clc_msg_proposal *pclc; struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; @@ -49,44 +444,32 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return false; switch (clcm->type) { case SMC_CLC_PROPOSAL: - if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && - clcm->path != SMC_TYPE_B) - return false; pclc = (struct smc_clc_msg_proposal *)clcm; - pclc_prfx = smc_clc_proposal_get_prefix(pclc); - if (ntohs(pclc->hdr.length) != - sizeof(*pclc) + ntohs(pclc->iparea_offset) + - sizeof(*pclc_prfx) + - pclc_prfx->ipv6_prefixes_cnt * - sizeof(struct smc_clc_ipv6_prefix) + - sizeof(*trl)) + if (!smc_clc_msg_prop_valid(pclc)) return false; trl = (struct smc_clc_msg_trail *) ((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: - if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D) - return false; - clc = (struct smc_clc_msg_accept_confirm *)clcm; - if ((clcm->path == SMC_TYPE_R && - ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) || - (clcm->path == SMC_TYPE_D && - ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN)) + clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm; + if (!smc_clc_msg_acc_conf_valid(clc_v2)) return false; trl = (struct smc_clc_msg_trail *) - ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); + ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) - + sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; - if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + if (!smc_clc_msg_decl_valid(dclc)) return false; - trl = &dclc->trl; + check_trl = false; break; default: return false; } - if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && + if (check_trl && + memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) && memcmp(trl->eyecatcher, 
SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER))) return false; return true; @@ -154,7 +537,6 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct sockaddr_in *addr; int rc = -ENOENT; - memset(prop, 0, sizeof(*prop)); if (!dst) { rc = -ENOTCONN; goto out; @@ -164,7 +546,8 @@ static int smc_clc_prfx_set(struct socket *clcsock, goto out_rel; } /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) + goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); @@ -276,7 +659,8 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, struct msghdr msg = {NULL, 0}; int reason_code = 0; struct kvec vec = {buf, buflen}; - int len, datlen; + int len, datlen, recvlen; + bool check_trl = true; int krflags; /* peek the first few bytes to determine length of data to receive @@ -320,10 +704,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } datlen = ntohs(clcm->length); if ((len < sizeof(struct smc_clc_msg_hdr)) || - (datlen > buflen) || - (clcm->version != SMC_CLC_V1) || - (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D && - clcm->path != SMC_TYPE_B) || + (clcm->version < SMC_V1) || ((clcm->type != SMC_CLC_DECLINE) && (clcm->type != expected_type))) { smc->sk.sk_err = EPROTO; @@ -333,23 +714,43 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, /* receive the complete CLC message */ memset(&msg, 0, sizeof(struct msghdr)); - iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, datlen); + if (datlen > buflen) { + check_trl = false; + recvlen = buflen; + } else { + recvlen = datlen; + } + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); krflags = MSG_WAITALL; len = sock_recvmsg(smc->clcsock, &msg, krflags); - if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { + if (len < recvlen || !smc_clc_msg_hdr_valid(clcm, 
check_trl)) { smc->sk.sk_err = EPROTO; reason_code = -EPROTO; goto out; } + datlen -= len; + while (datlen) { + u8 tmp[SMC_CLC_RECV_BUF_LEN]; + + vec.iov_base = &tmp; + vec.iov_len = SMC_CLC_RECV_BUF_LEN; + /* receive remaining proposal message */ + recvlen = datlen > SMC_CLC_RECV_BUF_LEN ? + SMC_CLC_RECV_BUF_LEN : datlen; + iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, recvlen); + len = sock_recvmsg(smc->clcsock, &msg, krflags); + datlen -= len; + } if (clcm->type == SMC_CLC_DECLINE) { struct smc_clc_msg_decline *dclc; dclc = (struct smc_clc_msg_decline *)clcm; reason_code = SMC_CLC_DECL_PEERDECL; smc->peer_diagnosis = ntohl(dclc->peer_diagnosis); - if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { + if (((struct smc_clc_msg_decline *)buf)->hdr.typev2 & + SMC_FIRST_CONTACT_MASK) { smc->conn.lgr->sync_err = 1; - smc_lgr_terminate(smc->conn.lgr, true); + smc_lgr_terminate_sched(smc->conn.lgr); } } @@ -359,172 +760,369 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) { - struct smc_clc_msg_decline dclc; + struct smc_clc_msg_decline *dclc_v1; + struct smc_clc_msg_decline_v2 dclc; struct msghdr msg; + int len, send_len; struct kvec vec; - int len; + dclc_v1 = (struct smc_clc_msg_decline *)&dclc; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; - dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); - dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; - if (smc->conn.lgr && !smc->conn.lgr->is_smcd) + dclc.hdr.version = version; + dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; + dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 
+ SMC_FIRST_CONTACT_MASK : 0; + if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && + smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); - memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (version == SMC_V1) { + memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(*dclc_v1); + } else { + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(dclc); + } + dclc.hdr.length = htons(send_len); memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; - vec.iov_len = sizeof(struct smc_clc_msg_decline); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, - sizeof(struct smc_clc_msg_decline)); - if (len < 0 || len < sizeof(struct smc_clc_msg_decline)) + vec.iov_len = send_len; + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 
0 : len; } /* send CLC PROPOSAL message across internal TCP socket */ -int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_init_info *ini) +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) { - struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_msg_proposal_prefix pclc_prfx; - struct smc_clc_msg_smcd pclc_smcd; - struct smc_clc_msg_proposal pclc; - struct smc_clc_msg_trail trl; + struct smc_clc_smcd_v2_extension *smcd_v2_ext; + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct smc_clc_msg_proposal *pclc_base; + struct smc_clc_smcd_gid_chid *gidchids; + struct smc_clc_msg_proposal_area *pclc; + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct smc_clc_v2_extension *v2_ext; + struct smc_clc_msg_smcd *pclc_smcd; + struct smc_clc_msg_trail *trl; int len, i, plen, rc; int reason_code = 0; - struct kvec vec[5]; + struct kvec vec[8]; struct msghdr msg; + pclc = kzalloc(sizeof(*pclc), GFP_KERNEL); + if (!pclc) + return -ENOMEM; + + pclc_base = &pclc->pclc_base; + pclc_smcd = &pclc->pclc_smcd; + pclc_prfx = &pclc->pclc_prfx; + ipv6_prfx = pclc->pclc_prfx_ipv6; + v2_ext = &pclc->pclc_v2_ext; + smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + gidchids = pclc->pclc_gidchids; + trl = &pclc->pclc_trl; + + pclc_base->hdr.version = SMC_V2; + pclc_base->hdr.typev1 = ini->smc_type_v1; + pclc_base->hdr.typev2 = ini->smc_type_v2; + plen = sizeof(*pclc_base) + sizeof(*pclc_smcd) + sizeof(*trl); + /* retrieve ip prefixes for CLC proposal msg */ - rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); - if (rc) - return SMC_CLC_DECL_CNFERR; /* configuration error */ + if (ini->smc_type_v1 != SMC_TYPE_N) { + rc = smc_clc_prfx_set(smc->clcsock, pclc_prfx, ipv6_prfx); + if (rc) { + if (ini->smc_type_v2 == SMC_TYPE_N) { + kfree(pclc); + return SMC_CLC_DECL_CNFERR; + } + pclc_base->hdr.typev1 = SMC_TYPE_N; + } else { + pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); + plen += sizeof(*pclc_prfx) + + 
pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + } - /* send SMC Proposal CLC message */ - plen = sizeof(pclc) + sizeof(pclc_prfx) + - (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + - sizeof(trl); - memset(&pclc, 0, sizeof(pclc)); - memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - pclc.hdr.type = SMC_CLC_PROPOSAL; - pclc.hdr.version = SMC_CLC_V1; /* SMC version */ - pclc.hdr.path = smc_type; - if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) { + /* build SMC Proposal CLC message */ + memcpy(pclc_base->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + pclc_base->hdr.type = SMC_CLC_PROPOSAL; + if (smcr_indicated(ini->smc_type_v1)) { /* add SMC-R specifics */ - memcpy(pclc.lcl.id_for_peer, local_systemid, + memcpy(pclc_base->lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&pclc.lcl.gid, ini->ib_gid, SMC_GID_SIZE); - memcpy(&pclc.lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], + memcpy(pclc_base->lcl.gid, ini->ib_gid, SMC_GID_SIZE); + memcpy(pclc_base->lcl.mac, &ini->ib_dev->mac[ini->ib_port - 1], ETH_ALEN); - pclc.iparea_offset = htons(0); } - if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { + if (smcd_indicated(ini->smc_type_v1)) { /* add SMC-D specifics */ - memset(&pclc_smcd, 0, sizeof(pclc_smcd)); - plen += sizeof(pclc_smcd); - pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET); - pclc_smcd.gid = ini->ism_dev->local_gid; + if (ini->ism_dev[0]) { + pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); + pclc_smcd->ism.chid = + htons(smc_ism_get_chid(ini->ism_dev[0])); + } } - pclc.hdr.length = htons(plen); + if (ini->smc_type_v2 == SMC_TYPE_N) { + pclc_smcd->v2_ext_offset = 0; + } else { + struct smc_clc_eid_entry *ueident; + u16 v2_ext_offset; + + v2_ext->hdr.flag.release = SMC_RELEASE; + v2_ext_offset = sizeof(*pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + if (ini->smc_type_v1 != SMC_TYPE_N) + v2_ext_offset += sizeof(*pclc_prfx) + + 
pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + pclc_smcd->v2_ext_offset = htons(v2_ext_offset); + plen += sizeof(*v2_ext); - memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + read_lock(&smc_clc_eid_table.lock); + v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; + plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; + i = 0; + list_for_each_entry(ueident, &smc_clc_eid_table.list, list) { + memcpy(v2_ext->user_eids[i++], ueident->eid, + sizeof(ueident->eid)); + } + read_unlock(&smc_clc_eid_table.lock); + } + if (smcd_indicated(ini->smc_type_v2)) { + u8 *eid = NULL; + + v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; + v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; + v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - + offsetofend(struct smc_clnt_opts_area_hdr, + smcd_v2_ext_offset) + + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); + smc_ism_get_system_eid(&eid); + if (eid && v2_ext->hdr.flag.seid) + memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); + plen += sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + for (i = 1; i <= ini->ism_offered_cnt; i++) { + gidchids[i - 1].gid = + htonll(ini->ism_dev[i]->local_gid); + gidchids[i - 1].chid = + htons(smc_ism_get_chid(ini->ism_dev[i])); + } + plen += ini->ism_offered_cnt * + sizeof(struct smc_clc_smcd_gid_chid); + } + } + if (smcr_indicated(ini->smc_type_v2)) + memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + + pclc_base->hdr.length = htons(plen); + memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + /* send SMC Proposal CLC message */ memset(&msg, 0, sizeof(msg)); i = 0; - vec[i].iov_base = &pclc; - vec[i++].iov_len = sizeof(pclc); - if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) { - vec[i].iov_base = &pclc_smcd; - vec[i++].iov_len = sizeof(pclc_smcd); - } - vec[i].iov_base = &pclc_prfx; - vec[i++].iov_len = sizeof(pclc_prfx); - if (pclc_prfx.ipv6_prefixes_cnt > 0) { - vec[i].iov_base = &ipv6_prfx[0]; - vec[i++].iov_len = 
pclc_prfx.ipv6_prefixes_cnt * - sizeof(ipv6_prfx[0]); + vec[i].iov_base = pclc_base; + vec[i++].iov_len = sizeof(*pclc_base); + vec[i].iov_base = pclc_smcd; + vec[i++].iov_len = sizeof(*pclc_smcd); + if (ini->smc_type_v1 != SMC_TYPE_N) { + vec[i].iov_base = pclc_prfx; + vec[i++].iov_len = sizeof(*pclc_prfx); + if (pclc_prfx->ipv6_prefixes_cnt > 0) { + vec[i].iov_base = ipv6_prfx; + vec[i++].iov_len = pclc_prfx->ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } } - vec[i].iov_base = &trl; - vec[i++].iov_len = sizeof(trl); + if (ini->smc_type_v2 != SMC_TYPE_N) { + vec[i].iov_base = v2_ext; + vec[i++].iov_len = sizeof(*v2_ext) + + (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); + if (smcd_indicated(ini->smc_type_v2)) { + vec[i].iov_base = smcd_v2_ext; + vec[i++].iov_len = sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + vec[i].iov_base = gidchids; + vec[i++].iov_len = ini->ism_offered_cnt * + sizeof(struct smc_clc_smcd_gid_chid); + } + } + } + vec[i].iov_base = trl; + vec[i++].iov_len = sizeof(*trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < 0) { smc->sk.sk_err = smc->clcsock->sk->sk_err; reason_code = -smc->sk.sk_err; - } else if (len < (int)sizeof(pclc)) { + } else if (len < ntohs(pclc_base->hdr.length)) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; } + kfree(pclc); return reason_code; } -/* send CLC CONFIRM message across internal TCP socket */ -int smc_clc_send_confirm(struct smc_sock *smc) +/* build and send CLC CONFIRM / ACCEPT message */ +static int smc_clc_send_confirm_accept(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm_v2 *clc_v2, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; - struct smc_clc_msg_accept_confirm cclc; - struct smc_link *link; - int reason_code = 0; + struct smc_clc_msg_accept_confirm *clc; + struct smc_clc_first_contact_ext fce; + struct smc_clc_fce_gid_ext 
gle; + struct smc_clc_msg_trail trl; + struct kvec vec[5]; struct msghdr msg; - struct kvec vec; - int len; + int i, len; /* send SMC Confirm CLC msg */ - memset(&cclc, 0, sizeof(cclc)); - cclc.hdr.type = SMC_CLC_CONFIRM; - cclc.hdr.version = SMC_CLC_V1; /* SMC version */ - if (smc->conn.lgr->is_smcd) { + clc = (struct smc_clc_msg_accept_confirm *)clc_v2; + clc->hdr.version = version; /* SMC version */ + if (first_contact) + clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; + if (conn->lgr->is_smcd) { /* SMC-D specific settings */ - memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER, + memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - cclc.hdr.path = SMC_TYPE_D; - cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - cclc.gid = conn->lgr->smcd->local_gid; - cclc.token = conn->rmb_desc->token; - cclc.dmbe_size = conn->rmbe_size_short; - cclc.dmbe_idx = 0; - memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, + clc->hdr.typev1 = SMC_TYPE_D; + clc->d0.gid = conn->lgr->smcd->local_gid; + clc->d0.token = conn->rmb_desc->token; + clc->d0.dmbe_size = conn->rmbe_size_short; + clc->d0.dmbe_idx = 0; + memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + } else { + clc_v2->d1.chid = + htons(smc_ism_get_chid(conn->lgr->smcd)); + if (eid && eid[0]) + memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); + len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) + smc_clc_fill_fce(&fce, &len); + clc_v2->hdr.length = htons(len); + } + memcpy(trl.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); } else { + struct smc_link *link = conn->lnk; + /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, + memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - cclc.hdr.path = SMC_TYPE_R; - cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - 
memcpy(cclc.lcl.id_for_peer, local_systemid, + clc->hdr.typev1 = SMC_TYPE_R; + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + memcpy(clc->r0.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); - memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); - hton24(cclc.qpn, link->roce_qp->qp_num); - cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - cclc.rmbe_alert_token = htonl(conn->alert_token_local); - cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); - cclc.rmbe_size = conn->rmbe_size_short; - cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(cclc.psn, link->psn_initial); - memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); + hton24(clc->r0.qpn, link->roce_qp->qp_num); + clc->r0.rmb_rkey = + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); + clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); + switch (clc->hdr.type) { + case SMC_CLC_ACCEPT: + clc->r0.qp_mtu = link->path_mtu; + break; + case SMC_CLC_CONFIRM: + clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + break; + } + clc->r0.rmbe_size = conn->rmbe_size_short; + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? 
+ cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); + hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + smc_clc_fill_fce(&fce, &len); + fce.v2_direct = !link->lgr->uses_gateway; + memset(&gle, 0, sizeof(gle)); + if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { + gle.gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(gle); + len += gle.gid_cnt * sizeof(gle.gid[0]); + } else { + len += sizeof(gle.reserved); + } + } + clc_v2->hdr.length = htons(len); + } + memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } memset(&msg, 0, sizeof(msg)); - vec.iov_base = &cclc; - vec.iov_len = ntohs(cclc.hdr.length); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, - ntohs(cclc.hdr.length)); - if (len < ntohs(cclc.hdr.length)) { + i = 0; + vec[i].iov_base = clc_v2; + if (version > SMC_V1) + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? + SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : + SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) - + sizeof(trl); + else + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? 
+ SMCD_CLC_ACCEPT_CONFIRM_LEN : + SMCR_CLC_ACCEPT_CONFIRM_LEN) - + sizeof(trl); + if (version > SMC_V1 && first_contact) { + vec[i].iov_base = &fce; + vec[i++].iov_len = sizeof(fce); + if (!conn->lgr->is_smcd) { + if (clc->hdr.type == SMC_CLC_CONFIRM) { + vec[i].iov_base = &gle; + vec[i++].iov_len = sizeof(gle); + vec[i].iov_base = &ini->smcrv2.gidlist.list; + vec[i++].iov_len = gle.gid_cnt * + sizeof(gle.gid[0]); + } else { + vec[i].iov_base = &gle.reserved; + vec[i++].iov_len = sizeof(gle.reserved); + } + } + } + vec[i].iov_base = &trl; + vec[i++].iov_len = sizeof(trl); + return kernel_sendmsg(smc->clcsock, &msg, vec, 1, + ntohs(clc->hdr.length)); +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, + u8 version, u8 *eid, struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 cclc_v2; + int reason_code = 0; + int len; + + /* send SMC Confirm CLC msg */ + memset(&cclc_v2, 0, sizeof(cclc_v2)); + cclc_v2.hdr.type = SMC_CLC_CONFIRM; + len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, + version, eid, ini); + if (len < ntohs(cclc_v2.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; @@ -537,67 +1135,43 @@ int smc_clc_send_confirm(struct smc_sock *smc) } /* send CLC ACCEPT message across internal TCP socket */ -int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) +int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, + u8 version, u8 *negotiated_eid) { - struct smc_connection *conn = &new_smc->conn; - struct smc_clc_msg_accept_confirm aclc; - struct smc_link *link; - struct msghdr msg; - struct kvec vec; + struct smc_clc_msg_accept_confirm_v2 aclc_v2; int len; - memset(&aclc, 0, sizeof(aclc)); - aclc.hdr.type = SMC_CLC_ACCEPT; - aclc.hdr.version = SMC_CLC_V1; /* SMC version */ - if (srv_first_contact) - aclc.hdr.flag = 1; - - if (new_smc->conn.lgr->is_smcd) { - /* SMC-D 
specific settings */ - aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - aclc.hdr.path = SMC_TYPE_D; - aclc.gid = conn->lgr->smcd->local_gid; - aclc.token = conn->rmb_desc->token; - aclc.dmbe_size = conn->rmbe_size_short; - aclc.dmbe_idx = 0; - memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - } else { - /* SMC-R specific settings */ - aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - memcpy(aclc.lcl.id_for_peer, local_systemid, - sizeof(local_systemid)); - memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], - ETH_ALEN); - hton24(aclc.qpn, link->roce_qp->qp_num); - aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ - aclc.rmbe_alert_token = htonl(conn->alert_token_local); - aclc.qp_mtu = link->path_mtu; - aclc.rmbe_size = conn->rmbe_size_short, - aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); - hton24(aclc.psn, link->psn_initial); - memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - } - - memset(&msg, 0, sizeof(msg)); - vec.iov_base = &aclc; - vec.iov_len = ntohs(aclc.hdr.length); - len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, - ntohs(aclc.hdr.length)); - if (len < ntohs(aclc.hdr.length)) + memset(&aclc_v2, 0, sizeof(aclc_v2)); + aclc_v2.hdr.type = SMC_CLC_ACCEPT; + len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, + version, negotiated_eid, NULL); + if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 
0 : len; } + +void smc_clc_get_hostname(u8 **host) +{ + *host = &smc_hostname[0]; +} + +void __init smc_clc_init(void) +{ + struct new_utsname *u; + + memset(smc_hostname, _S, sizeof(smc_hostname)); /* ASCII blanks */ + u = utsname(); + memcpy(smc_hostname, u->nodename, + min_t(size_t, strlen(u->nodename), sizeof(smc_hostname))); + + INIT_LIST_HEAD(&smc_clc_eid_table.list); + rwlock_init(&smc_clc_eid_table.lock); + smc_clc_eid_table.ueid_cnt = 0; + smc_clc_eid_table.seid_enabled = 1; +} + +void smc_clc_exit(void) +{ + smc_clc_ueid_remove(NULL); +} diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..5fee545c9a10 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -14,17 +14,19 @@ #define _SMC_CLC_H #include <rdma/ib_verbs.h> +#include <linux/smc.h> #include "smc.h" +#include "smc_netlink.h" #define SMC_CLC_PROPOSAL 0x01 #define SMC_CLC_ACCEPT 0x02 #define SMC_CLC_CONFIRM 0x03 #define SMC_CLC_DECLINE 0x04 -#define SMC_CLC_V1 0x1 /* SMC version */ #define SMC_TYPE_R 0 /* SMC-R only */ #define SMC_TYPE_D 1 /* SMC-D only */ +#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */ #define SMC_TYPE_B 3 /* SMC-R and SMC-D */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */ @@ -37,19 +39,32 @@ #define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */ #define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */ #define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */ -#define SMC_CLC_DECL_SMCDNOTALK 0x03030003 /* SMC-D dev can't talk to peer */ +#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */ +#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */ +#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. 
*/ +#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ +#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ +#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ +#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ +#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ +#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */ +#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. 
indirect mismatch*/ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ -#define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ + +#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ @@ -57,13 +72,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */ __be16 length; #if defined(__BIG_ENDIAN_BITFIELD) u8 version : 4, - flag : 1, - rsvd : 1, - path : 2; + typev2 : 2, + typev1 : 2; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 path : 2, - rsvd : 1, - flag : 1, + u8 typev1 : 2, + typev2 : 2, version : 4; #endif } __packed; /* format defined in RFC7609 */ @@ -78,8 +91,6 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; -#define SMC_CLC_MAX_V6_PREFIX 8 - /* Struct would be 4 byte aligned, but it is used in an array that is sent * to peers and must conform to RFC7609, hence we need to use packed here. 
*/ @@ -88,6 +99,44 @@ struct smc_clc_ipv6_prefix { u8 prefix_len; } __packed; /* format defined in RFC7609 */ +#if defined(__BIG_ENDIAN_BITFIELD) +struct smc_clc_v2_flag { + u8 release : 4, + rsvd : 3, + seid : 1; +}; +#elif defined(__LITTLE_ENDIAN_BITFIELD) +struct smc_clc_v2_flag { + u8 seid : 1, + rsvd : 3, + release : 4; +}; +#endif + +struct smc_clnt_opts_area_hdr { + u8 eid_cnt; /* number of user defined EIDs */ + u8 ism_gid_cnt; /* number of ISMv2 GIDs */ + u8 reserved1; + struct smc_clc_v2_flag flag; + u8 reserved2[2]; + __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */ +}; + +struct smc_clc_smcd_gid_chid { + __be64 gid; /* ISM GID */ + __be16 chid; /* ISMv2 CHID */ +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_clc_v2_extension { + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 reserved[16]; + u8 user_eids[][SMC_MAX_EID_LEN]; +}; + struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ u8 prefix_len; /* number of significant bits in mask */ @@ -96,8 +145,15 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ } __aligned(4); struct smc_clc_msg_smcd { /* SMC-D GID information */ - u64 gid; /* ISM GID of requestor */ - u8 res[32]; + struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ + __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ + u8 reserved[28]; +}; + +struct smc_clc_smcd_v2_extension { + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + struct smc_clc_smcd_gid_chid gidchid[]; }; struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ @@ -106,64 +162,141 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ __be16 iparea_offset; /* offset to IP address information area */ } __aligned(4); -#define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 -#define 
SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \ - sizeof(struct smc_clc_ipv6_prefix)) -#define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ - SMC_CLC_PROPOSAL_MAX_OFFSET + \ - sizeof(struct smc_clc_msg_proposal_prefix) + \ - SMC_CLC_PROPOSAL_MAX_PREFIX + \ - sizeof(struct smc_clc_msg_trail)) +#define SMC_CLC_MAX_V6_PREFIX 8 +#define SMC_CLC_MAX_UEID 8 -struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ - struct smc_clc_msg_hdr hdr; - union { - struct { /* SMC-R */ - struct smc_clc_msg_local lcl; - u8 qpn[3]; /* QP number */ - __be32 rmb_rkey; /* RMB rkey */ - u8 rmbe_idx; /* Index of RMBE in RMB */ - __be32 rmbe_alert_token;/* unique connection id */ +struct smc_clc_msg_proposal_area { + struct smc_clc_msg_proposal pclc_base; + struct smc_clc_msg_smcd pclc_smcd; + struct smc_clc_msg_proposal_prefix pclc_prfx; + struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; + struct smc_clc_v2_extension pclc_v2_ext; + u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; + struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; + struct smc_clc_msg_trail pclc_trl; +}; + +struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 rmbe_idx; /* Index of RMBE in RMB */ + __be32 rmbe_alert_token; /* unique connection id */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* buf size (compressed) */ + qp_mtu : 4; /* QP mtu */ +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + rmbe_size : 4; +#endif + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* packet sequence number */ +} __packed; + +struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ + u64 gid; /* Sender GID */ + u64 token; /* DMB token */ + u8 dmbe_idx; /* DMBE index */ #if defined(__BIG_ENDIAN_BITFIELD) - u8 rmbe_size : 4, /* buf size 
(compressed) */ - qp_mtu : 4; /* QP mtu */ + u8 dmbe_size : 4, /* buf size (compressed) */ + reserved3 : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 qp_mtu : 4, - rmbe_size : 4; + u8 reserved3 : 4, + dmbe_size : 4; #endif - u8 reserved; - __be64 rmb_dma_addr; /* RMB virtual address */ - u8 reserved2; - u8 psn[3]; /* packet sequence number */ - struct smc_clc_msg_trail smcr_trl; - /* eye catcher "SMCR" EBCDIC */ - } __packed; - struct { /* SMC-D */ - u64 gid; /* Sender GID */ - u64 token; /* DMB token */ - u8 dmbe_idx; /* DMBE index */ + u16 reserved4; + __be32 linkid; /* Link identifier */ +} __packed; + +#define SMC_CLC_OS_ZOS 1 +#define SMC_CLC_OS_LINUX 2 +#define SMC_CLC_OS_AIX 3 + +struct smc_clc_first_contact_ext { #if defined(__BIG_ENDIAN_BITFIELD) - u8 dmbe_size : 4, /* buf size (compressed) */ - reserved3 : 4; + u8 v2_direct : 1, + reserved : 7; + u8 os_type : 4, + release : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 reserved3 : 4, - dmbe_size : 4; + u8 reserved : 7, + v2_direct : 1; + u8 release : 4, + os_type : 4; #endif - u16 reserved4; - u32 linkid; /* Link identifier */ + u8 reserved2[2]; + u8 hostname[SMC_MAX_HOSTNAME_LEN]; +}; + +struct smc_clc_fce_gid_ext { + u8 reserved[16]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + union { + struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ + struct { /* SMC-D */ + struct smcd_clc_msg_accept_confirm_common d0; u32 reserved5[3]; - struct smc_clc_msg_trail smcd_trl; - /* eye catcher "SMCD" EBCDIC */ - } __packed; + }; }; } __packed; /* format defined in RFC7609 */ +struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + union { + struct { /* SMC-R */ + struct smcr_clc_msg_accept_confirm r0; + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } r1; + struct { /* SMC-D */ + struct smcd_clc_msg_accept_confirm_common d0; + __be16 chid; + u8 
eid[SMC_MAX_EID_LEN]; + u8 reserved5[8]; + } d1; + }; +}; + struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_hdr hdr; u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ __be32 peer_diagnosis; /* diagnosis information */ - u8 reserved2[4]; - struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ +} __aligned(4); + +#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */ + +struct smc_clc_msg_decline_v2 { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ } __aligned(4); /* determine start of the prefix area within the proposal message */ @@ -174,16 +307,69 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } +static inline bool smcr_indicated(int smc_type) +{ + return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B; +} + +static inline bool smcd_indicated(int smc_type) +{ + return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B; +} + +static inline u8 smc_indicated_type(int is_smcd, int is_smcr) +{ + if (is_smcd && is_smcr) + return SMC_TYPE_B; + if (is_smcd) + return SMC_TYPE_D; + if (is_smcr) + return SMC_TYPE_R; + return SMC_TYPE_N; +} + /* get SMC-D info from proposal message */ static inline struct smc_clc_msg_smcd * smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) { - if 
(ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) + if (smcd_indicated(prop->hdr.typev1) && + ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd)) return NULL; return (struct smc_clc_msg_smcd *)(prop + 1); } +static inline struct smc_clc_v2_extension * +smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) +{ + struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + return NULL; + + return (struct smc_clc_v2_extension *) + ((u8 *)prop_smcd + + offsetof(struct smc_clc_msg_smcd, v2_ext_offset) + + sizeof(prop_smcd->v2_ext_offset) + + ntohs(prop_smcd->v2_ext_offset)); +} + +static inline struct smc_clc_smcd_v2_extension * +smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) +{ + if (!prop_v2ext) + return NULL; + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + return NULL; + + return (struct smc_clc_smcd_v2_extension *) + ((u8 *)prop_v2ext + + offsetof(struct smc_clc_v2_extension, hdr) + + offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + + sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) + + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); +} + struct smcd_dev; struct smc_init_info; @@ -191,10 +377,25 @@ int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, - struct smc_init_info *ini); -int smc_clc_send_confirm(struct smc_sock *smc); -int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version); +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); +int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, + u8 version, u8 *eid, struct smc_init_info *ini); +int 
smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, + u8 version, u8 *negotiated_eid); +void smc_clc_init(void) __init; +void smc_clc_exit(void); +void smc_clc_get_hostname(u8 **host); +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid); +int smc_clc_ueid_count(void); +int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 290270c821ca..31db7438857c 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -57,6 +57,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) if (!smc_tx_prepared_sends(&smc->conn)) return; + /* Send out corked data remaining in sndbuf */ + smc_tx_pending(&smc->conn); + smc->wait_close_tx_prepared = 1; add_wait_queue(sk_sleep(sk), &wait); while (!signal_pending(current) && timeout) { @@ -116,7 +119,6 @@ static void smc_close_cancel_work(struct smc_sock *smc) cancel_work_sync(&smc->conn.close_work); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); - sk->sk_state = SMC_CLOSED; } /* terminate smc socket abnormally - active abort @@ -134,22 +136,22 @@ void smc_close_active_abort(struct smc_sock *smc) } switch (sk->sk_state) { case SMC_ACTIVE: - sk->sk_state = SMC_PEERABORTWAIT; - smc_close_cancel_work(smc); - sk->sk_state = SMC_CLOSED; - sock_put(sk); /* passive closing */ - break; case SMC_APPCLOSEWAIT1: case SMC_APPCLOSEWAIT2: + sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + 
break; sk->sk_state = SMC_CLOSED; - sock_put(sk); /* postponed passive closing */ + sock_put(sk); /* (postponed) passive closing */ break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: case SMC_PEERFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; @@ -159,6 +161,8 @@ void smc_close_active_abort(struct smc_sock *smc) case SMC_APPFINCLOSEWAIT: sk->sk_state = SMC_PEERABORTWAIT; smc_close_cancel_work(smc); + if (sk->sk_state != SMC_PEERABORTWAIT) + break; sk->sk_state = SMC_CLOSED; smc_conn_free(&smc->conn); release_clcsock = true; @@ -194,6 +198,7 @@ int smc_close_active(struct smc_sock *smc) int old_state; long timeout; int rc = 0; + int rc1 = 0; timeout = current->flags & PF_EXITING ? 0 : sock_flag(sk, SOCK_LINGER) ? @@ -209,9 +214,12 @@ again: sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); - /* wake up kernel_accept of smc_tcp_listen_worker */ - smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); } smc_close_cleanup_listen(sk); release_sock(sk); @@ -227,6 +235,15 @@ again: /* send close request */ rc = smc_close_final(conn); sk->sk_state = SMC_PEERCLOSEWAIT1; + + /* actively shutdown clcsock before peer close it, + * prevent peer from entering TIME_WAIT state. + */ + if (smc->clcsock && smc->clcsock->sk) { + rc1 = kernel_sock_shutdown(smc->clcsock, + SHUT_RDWR); + rc = rc ? 
rc : rc1; + } } else { /* peer event has changed the state */ goto again; @@ -353,9 +370,9 @@ static void smc_close_passive_work(struct work_struct *work) if (rxflags->peer_conn_abort) { /* peer has not received all data */ smc_close_passive_abort_received(smc); - release_sock(&smc->sk); + release_sock(sk); cancel_delayed_work_sync(&conn->tx_work); - lock_sock(&smc->sk); + lock_sock(sk); goto wakeup; } @@ -372,7 +389,7 @@ static void smc_close_passive_work(struct work_struct *work) case SMC_PEERCLOSEWAIT1: if (rxflags->peer_done_writing) sk->sk_state = SMC_PEERCLOSEWAIT2; - /* fall through */ + fallthrough; /* to check for closing */ case SMC_PEERCLOSEWAIT2: if (!smc_cdc_rxed_any_close(conn)) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 5b085efa3bce..c305d8dd23f8 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -15,6 +15,9 @@ #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/reboot.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/smc.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> @@ -29,13 +32,15 @@ #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" +#include "smc_netlink.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -#define SMC_LGR_FREE_DELAY_FAST (8 * HZ) -static struct smc_lgr_list smc_lgr_list = { /* established link groups */ +struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, @@ -46,6 +51,9 @@ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); +static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); + +static void smc_link_down_work(struct work_struct *work); /* return head of link 
group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, @@ -60,13 +68,23 @@ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, return &smc_lgr_list.list; } +static void smc_ibdev_cnt_inc(struct smc_link *lnk) +{ + atomic_inc(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + +static void smc_ibdev_cnt_dec(struct smc_link *lnk) +{ + atomic_dec(&lnk->smcibdev->lnk_cnt_by_port[lnk->ibport - 1]); +} + static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ - if (!lgr->freeing && !lgr->freefast) { + if (!lgr->freeing) { mod_delayed_work(system_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : @@ -74,15 +92,6 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) } } -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) -{ - if (!lgr->freeing && !lgr->freefast) { - lgr->freefast = 1; - mod_delayed_work(system_wq, &lgr->free_work, - SMC_LGR_FREE_DELAY_FAST); - } -} - /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock @@ -110,16 +119,63 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? 
SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected || lnk->link_is_asym) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected && + !lnk2->link_is_asym) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + atomic_inc(&conn->lnk->conn_cnt); + return 0; +} + /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) { + conn->lgr = NULL; + return rc; + } + } /* find a new alert_token_local value not yet used by some connection * in this link group */ @@ -131,6 +187,7 @@ static void smc_lgr_register_conn(struct smc_connection *conn) } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -141,6 +198,8 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); + if (conn->lnk) + atomic_dec(&conn->lnk->conn_cnt); lgr->conns_num--; conn->alert_token_local = 0; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ @@ -152,40 +211,451 @@ static 
void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - conn->lgr = NULL; } -void smc_lgr_cleanup_early(struct smc_connection *conn) +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { - struct smc_link_group *lgr = conn->lgr; + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char hostname[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_seid[SMC_MAX_EID_LEN + 1]; + struct nlattr *attrs; + u8 *seid = NULL; + u8 *host = NULL; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_SYS_INFO); + if (!nlh) + goto errmsg; + if (cb_ctx->pos[0]) + goto errout; + attrs = nla_nest_start(skb, SMC_GEN_SYS_INFO); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_SYS_VER, SMC_V2)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_REL, SMC_RELEASE)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) + goto errattr; + smc_clc_get_hostname(&host); + if (host) { + memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); + hostname[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) + goto errattr; + } + if (smc_ism_is_v2_capable()) { + smc_ism_get_system_eid(&seid); + memcpy(smc_seid, seid, SMC_MAX_EID_LEN); + smc_seid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + return skb->len; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ +static int 
smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + struct nlattr *v2_attrs) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); + return -EMSGSIZE; +} + +static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + return -EMSGSIZE; +} + +static int smc_nl_fill_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_target[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *attrs, *v2_attrs; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_R_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_R_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_ROLE, lgr->role)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_TYPE, lgr->type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_BUF_TYPE, 
lgr->buf_type)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_R_NET_COOKIE, + lgr->net->net_cookie, SMC_NLA_LGR_R_PAD)) + goto errattr; + memcpy(smc_target, lgr->pnet_id, SMC_MAX_PNETID_LEN); + smc_target[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) + goto errattr; + if (lgr->smc_version > SMC_V1) { + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) + goto errattr; + } + + nla_nest_end(skb, attrs); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_lgr_link(struct smc_link_group *lgr, + struct smc_link *link, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + u8 smc_gid_target[41]; + struct nlattr *attrs; + u32 link_uid = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LINK_SMCR); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LINK_SMCR); + if (!attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_LINK_ID, link->link_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_STATE, link->state)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_CONN_CNT, + atomic_read(&link->conn_cnt))) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LINK_IB_PORT, link->ibport)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LINK_NET_DEV, link->ndev_ifidx)) + goto errattr; + snprintf(smc_ibname, sizeof(smc_ibname), "%s", link->ibname); + if (nla_put_string(skb, SMC_NLA_LINK_IB_DEV, smc_ibname)) + goto errattr; + memcpy(&link_uid, link->link_uid, sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_UID, link_uid)) + goto errattr; + memcpy(&link_uid, link->peer_link_uid, 
sizeof(link_uid)); + if (nla_put_u32(skb, SMC_NLA_LINK_PEER_UID, link_uid)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->gid); + if (nla_put_string(skb, SMC_NLA_LINK_GID, smc_gid_target)) + goto errattr; + memset(smc_gid_target, 0, sizeof(smc_gid_target)); + smc_gid_be16_convert(smc_gid_target, link->peer_gid); + if (nla_put_string(skb, SMC_NLA_LINK_PEER_GID, smc_gid_target)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCR); + if (!nlh) + goto errmsg; + if (smc_nl_fill_lgr(lgr, skb, cb)) + goto errout; + + genlmsg_end(skb, nlh); + if (!list_links) + goto out; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (smc_nl_fill_lgr_link(lgr, &lgr->lnk[i], skb, cb)) + goto errout; + } +out: + return 0; + +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_fill_lgr_list(struct smc_lgr_list *smc_lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + bool list_links) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[0]; + int num = 0; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_lgr(lgr, skb, cb, list_links)) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&smc_lgr->lock); + cb_ctx->pos[0] = num; +} + +static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ 
+ char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *attrs; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_LGR_SMCD); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCD); + if (!attrs) + goto errout; + + if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) + goto errattr; + memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) + goto errattr; + if (lgr->smc_version > SMC_V1) { + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static int smc_nl_handle_smcd_lgr(struct smcd_dev *dev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_link_group *lgr; + int snum = cb_ctx->pos[1]; + int rc = 0, num = 0; + + spin_lock_bh(&dev->lgr_lock); + list_for_each_entry(lgr, &dev->lgr_list, list) { + if (!lgr->is_smcd) + continue; + if (num < snum) + goto next; + rc = smc_nl_fill_smcd_lgr(lgr, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + spin_unlock_bh(&dev->lgr_lock); + cb_ctx->pos[1] = num; 
+ return rc; +} + +static int smc_nl_fill_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smcd_dev *smcd_dev; + int snum = cb_ctx->pos[0]; + int rc = 0, num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd_dev, &dev_list->list, list) { + if (list_empty(&smcd_dev->lgr_list)) + continue; + if (num < snum) + goto next; + rc = smc_nl_handle_smcd_lgr(smcd_dev, skb, cb); + if (rc) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; + return rc; +} + +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = false; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb) +{ + bool list_links = true; + + smc_nl_fill_lgr_list(&smc_lgr_list, skb, cb, list_links); + return skb->len; +} + +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_fill_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + +void smc_lgr_cleanup_early(struct smc_link_group *lgr) +{ + spinlock_t *lgr_lock; if (!lgr) return; - smc_conn_free(conn); - smc_lgr_forget(lgr); - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(lgr_lock); + __smc_lgr_terminate(lgr, true); } -/* Send delete link, either as client to request the initiation - * of the DELETE LINK sequence from server; or as server to - * initiate the delete processing. See smc_llc_rx_delete_link(). 
- */ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { - if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); - return 0; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_sendable(lnk)) + lnk->state = SMC_LNK_INACTIVE; } - return -ENOTCONN; + wake_up_all(&lgr->llc_msg_waiter); + wake_up_all(&lgr->llc_flow_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); @@ -196,7 +666,6 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; smc_lgr_list_head(lgr, &lgr_lock); @@ -213,26 +682,17 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); - spin_unlock_bh(lgr_lock); - return; - } - } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } @@ -241,7 +701,118 @@ static void smc_lgr_terminate_work(struct work_struct *work) struct smc_link_group *lgr = container_of(work, struct smc_link_group, terminate_work); - smc_lgr_terminate(lgr, true); + 
__smc_lgr_terminate(lgr, true); +} + +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { +again: + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (smc_link_usable(&lgr->lnk[i]) && + lgr->lnk[i].link_id == link_id) + goto again; + } + break; + } + return link_id; +} + +static void smcr_copy_dev_info_to_link(struct smc_link *link) +{ + struct smc_ib_device *smcibdev = link->smcibdev; + + snprintf(link->ibname, sizeof(link->ibname), "%s", + smcibdev->ibdev->name); + link->ndev_ifidx = smcibdev->ndev_ifidx[link->ibport - 1]; +} + +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) +{ + struct smc_ib_device *smcibdev; + u8 rndvec[3]; + int rc; + + if (lgr->smc_version == SMC_V2) { + lnk->smcibdev = ini->smcrv2.ib_dev_v2; + lnk->ibport = ini->smcrv2.ib_port_v2; + } else { + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + } + get_device(&lnk->smcibdev->ibdev->dev); + atomic_inc(&lnk->smcibdev->lnk_cnt); + refcount_set(&lnk->refcnt, 1); /* link refcnt is set to 1 */ + lnk->clearing = 0; + lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; + lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; + smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ + lnk->link_idx = link_idx; + lnk->wr_rx_id_compl = 0; + smc_ibdev_cnt_inc(lnk); + smcr_copy_dev_info_to_link(lnk); + atomic_set(&lnk->conn_cnt, 0); + smc_llc_link_set_uid(lnk); + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); + if (!lnk->smcibdev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, 
lnk->gid, &lnk->sgid_index, + lgr->smc_version == SMC_V2 ? + &ini->smcrv2 : NULL); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + lnk->state = SMC_LNK_ACTIVATING; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk, false); +out: + smc_ibdev_cnt_dec(lnk); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold above */ + return rc; } /* create a new SMC link group */ @@ -251,12 +822,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; + u8 link_idx; int rc = 0; int i; if (ini->is_smcd && ini->vlan_id) { - if (smc_ism_get_vlan(ini->ism_dev, ini->vlan_id)) { + if (smc_ism_get_vlan(ini->ism_dev[ini->ism_selected], + ini->vlan_id)) { rc = SMC_CLC_DECL_ISMVLANERR; goto out; } @@ -267,19 +839,26 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + SMC_LGR_ID_SIZE, &lgr->id); + if (!lgr->tx_wq) { + rc = -ENOMEM; + goto free_lgr; + } lgr->is_smcd = ini->is_smcd; lgr->sync_err = 0; lgr->terminating = 0; - lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ + 
mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -287,77 +866,67 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ - get_device(&ini->ism_dev->dev); - lgr->peer_gid = ini->ism_gid; - lgr->smcd = ini->ism_dev; - lgr_list = &ini->ism_dev->lgr_list; + get_device(&ini->ism_dev[ini->ism_selected]->dev); + lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; + lgr->smcd = ini->ism_dev[ini->ism_selected]; + lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; + lgr->smc_version = ini->smcd_version; lgr->peer_shutdown = 0; - atomic_inc(&ini->ism_dev->lgr_cnt); + atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); + struct smc_ib_device *ibdev; + int ibport; + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + lgr->smc_version = ini->smcr_version; + memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); + if (lgr->smc_version == SMC_V2) { + ibdev = ini->smcrv2.ib_dev_v2; + ibport = ini->smcrv2.ib_port_v2; + lgr->saddr = ini->smcrv2.saddr; + lgr->uses_gateway = ini->smcrv2.uses_gateway; + memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, + ETH_ALEN); + } else { + ibdev = ini->ib_dev; + ibport = ini->ib_port; + } + memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], + SMC_MAX_PNETID_LEN); + rc = smc_wr_alloc_lgr_mem(lgr); + if (rc) + goto free_wq; + smc_llc_lgr_init(lgr, smc); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); + if (rc) { + smc_wr_free_lgr_mem(lgr); + goto free_wq; + } + lgr->net = smc_ib_net(lnk->smcibdev); lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); - if (rc) - goto free_lgr; - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr->buf_type = lgr->net->smc.sysctl_smcr_buf_type; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); - 
list_add(&lgr->list, lgr_list); + list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); +free_wq: + destroy_workqueue(lgr->tx_wq); free_lgr: kfree(lgr); ism_put_vlan: if (ini->is_smcd && ini->vlan_id) - smc_ism_put_vlan(ini->ism_dev, ini->vlan_id); + smc_ism_put_vlan(ini->ism_dev[ini->ism_selected], ini->vlan_id); out: if (rc < 0) { if (rc == -ENOMEM) @@ -368,27 +937,214 @@ out: return rc; } +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. 
+ */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); + if (!rc) { + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); + } + return rc; +} + +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) +{ + atomic_dec(&conn->lnk->conn_cnt); + /* link_hold in smc_conn_create() */ + smcr_link_put(conn->lnk); + conn->lnk = to_lnk; + atomic_inc(&conn->lnk->conn_cnt); + /* link_put in smc_conn_free() */ + smcr_link_hold(conn->lnk); +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; + struct smc_connection *conn; + struct smc_wr_buf *wr_buf; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i]) || i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { + smc_lgr_terminate_sched(lgr); + 
return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + smc_switch_link_and_count(conn, to_lnk); + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) + goto err_out; + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + smc_switch_link_and_count(conn, to_lnk); + rc = smc_switch_cursor(smc, pend, wr_buf); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) + goto err_out; + goto again; + } + read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); + return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; +} + +static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link_group *lgr) +{ + struct mutex *lock; /* lock buffer list */ + int rc; + + if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + mutex_lock(&lgr->llc_conf_mutex); + 
smc_llc_do_delete_rkey(lgr, buf_desc); + buf_desc->is_conf_rkey = false; + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } + } + + if (buf_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + lock = is_rmb ? &lgr->rmbs_lock : + &lgr->sndbufs_lock; + mutex_lock(lock); + list_del(&buf_desc->list); + mutex_unlock(lock); + + smc_buf_free(lgr, is_rmb, buf_desc); + } else { + buf_desc->used = 0; + memset(buf_desc->cpu_addr, 0, buf_desc->len); + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { - if (conn->sndbuf_desc) - conn->sndbuf_desc->used = 0; + if (conn->sndbuf_desc) { + if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + smcr_buf_unuse(conn->sndbuf_desc, false, lgr); + } else { + conn->sndbuf_desc->used = 0; + memset(conn->sndbuf_desc->cpu_addr, 0, + conn->sndbuf_desc->len); + } + } if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc); - } - conn->rmb_desc->used = 0; + if (!lgr->is_smcd) { + smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { - /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); + conn->rmb_desc->used = 0; + memset(conn->rmb_desc->cpu_addr, 0, + conn->rmb_desc->len + + sizeof(struct smcd_cdc_msg)); } } } @@ -398,55 +1154,152 @@ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!lgr || conn->freed) + /* Connection has never been registered in a + * link group, or has already been freed. + */ return; + + conn->freed = 1; + if (!smc_conn_lgr_valid(conn)) + /* Connection has already unregistered from + * link group. 
+ */ + goto lgr_put; + if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { - smc_cdc_tx_dismiss_slots(conn); + smc_cdc_wait_pend_tx_wr(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { - smc_lgr_unregister_conn(conn); smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + smc_lgr_unregister_conn(conn); } if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); +lgr_put: + if (!lgr->is_smcd) + smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ + smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } -static void smc_link_clear(struct smc_link *lnk) +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) { + if (is_rmb || buf_desc->is_vm) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + + if ((is_rmb || buf_desc->is_vm) && + buf_desc->mr[lnk->link_idx]) { + smc_ib_put_memory_region(buf_desc->mr[lnk->link_idx]); + buf_desc->mr[lnk->link_idx] = NULL; + } + if (is_rmb) + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + else + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + mutex_lock(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + mutex_unlock(&lgr->rmbs_lock); + mutex_lock(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + mutex_unlock(&lgr->sndbufs_lock); + } +} + +static void 
smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +static void __smcr_link_clear(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_ib_device *smcibdev; + + smc_wr_free_link_mem(lnk); + smc_ibdev_cnt_dec(lnk); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) +{ + if (!lnk->lgr || lnk->clearing || + lnk->state == SMC_LNK_UNUSED) + return; + lnk->clearing = 1; lnk->peer_qpn = 0; - smc_llc_link_clear(lnk); - smc_ib_modify_qp_reset(lnk); + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); + smc_ib_modify_qp_error(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); - smc_wr_free_link_mem(lnk); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + smcr_link_put(lnk); /* theoretically last link_put */ +} + +void smcr_link_hold(struct smc_link *lnk) +{ + refcount_inc(&lnk->refcnt); +} + +void smcr_link_put(struct smc_link *lnk) +{ + if (refcount_dec_and_test(&lnk->refcnt)) + __smcr_link_clear(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + int i; - if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) - smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, 
buf_desc, - DMA_TO_DEVICE); - } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); - if (buf_desc->pages) + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); + + if (!buf_desc->is_vm && buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); + else if (buf_desc->is_vm && buf_desc->cpu_addr) + vfree(buf_desc->cpu_addr); kfree(buf_desc); } @@ -499,51 +1352,53 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) __smc_lgr_free_bufs(lgr, true); } -/* remove a link group */ -static void smc_lgr_free(struct smc_link_group *lgr) +/* won't be freed until no one accesses to lgr anymore */ +static void __smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { - if (!lgr->terminating) { - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); - } if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } -void smc_lgr_forget(struct smc_link_group *lgr) +/* remove a link group */ +static void smc_lgr_free(struct smc_link_group *lgr) { - struct list_head *lgr_list; - spinlock_t *lgr_lock; + int i; - lgr_list = smc_lgr_list_head(lgr, &lgr_lock); - spin_lock_bh(lgr_lock); - /* do not use this link group for new connections */ - if (!list_empty(lgr_list)) - list_del_init(lgr_list); - spin_unlock_bh(lgr_lock); + if (!lgr->is_smcd) { + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i], false); + } + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + + destroy_workqueue(lgr->tx_wq); + if (lgr->is_smcd) { + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + put_device(&lgr->smcd->dev); + } + smc_lgr_put(lgr); /* 
theoretically last lgr_put */ } -static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) +void smc_lgr_hold(struct smc_link_group *lgr) { - int i; - - for (i = 0; i < SMC_RMBE_SIZES; i++) { - struct smc_buf_desc *buf_desc; + refcount_inc(&lgr->refcnt); +} - list_for_each_entry(buf_desc, &lgr->rmbs[i], list) { - buf_desc->len += sizeof(struct smcd_cdc_msg); - smc_ism_unregister_dmb(lgr->smcd, buf_desc); - } - } +void smc_lgr_put(struct smc_link_group *lgr) +{ + if (refcount_dec_and_test(&lgr->refcnt)) + __smc_lgr_free(lgr); } static void smc_sk_wake_ups(struct smc_sock *smc) @@ -572,7 +1427,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) else tasklet_unlock_wait(&conn->rx_tsklet); } else { - smc_cdc_tx_dismiss_slots(conn); + smc_cdc_wait_pend_tx_wr(conn); } smc_lgr_unregister_conn(conn); smc_close_active_abort(smc); @@ -582,21 +1437,20 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) { if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); - smcd_unregister_all_dmbs(lgr); - smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + u32 rsn = lgr->llc_termination_rsn; - wake_up(&lnk->wr_reg_wait); - if (lnk->state != SMC_LNK_INACTIVE) { - smc_link_send_delete(lnk, false); - smc_llc_link_inactive(lnk); - } + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, rsn); + smcr_lgr_link_deactivate_all(lgr); } } -/* terminate link group */ +/* terminate link group + * @soft: true if link group shutdown can take its time + * false if immediate link group shutdown is required + */ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) { struct smc_connection *conn; @@ -605,11 +1459,9 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ - if (!soft) - cancel_delayed_work_sync(&lgr->free_work); + /* cancel free_work sync, will 
terminate when lgr->freeing is set */ + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -628,54 +1480,24 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); - if (soft) - smc_lgr_schedule_free_work_fast(lgr); - else - smc_lgr_free(lgr); + smc_lgr_free(lgr); } -/* unlink and terminate link group - * @soft: true if link group shutdown can take its time - * false if immediate link group shutdown is required - */ -void smc_lgr_terminate(struct smc_link_group *lgr, bool soft) +/* unlink link group and schedule termination */ +void smc_lgr_terminate_sched(struct smc_link_group *lgr) { spinlock_t *lgr_lock; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); - if (lgr->terminating) { + if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) { spin_unlock_bh(lgr_lock); return; /* lgr already terminating */ } - if (!soft) - lgr->freeing = 1; list_del_init(&lgr->list); + lgr->freeing = 1; spin_unlock_bh(lgr_lock); - __smc_lgr_terminate(lgr, soft); -} - -/* Called when IB port is terminated */ -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - LIST_HEAD(lgr_free_list); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - } - } - spin_unlock_bh(&smc_lgr_list.lock); - - list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { - list_del_init(&lgr->list); - __smc_lgr_terminate(lgr, false); - } + schedule_work(&lgr->terminate_work); } /* Called when peer lgr shutdown (regularly or abnormally) is received */ @@ -692,6 +1514,7 @@ void 
smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) if (peer_gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); @@ -732,6 +1555,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -740,9 +1564,9 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -750,6 +1574,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } @@ -763,14 +1588,200 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } -/* Determine vlan of internal TCP socket. 
- * @vlan_id: address to store the determined vlan id into +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + char *lgr_type = ""; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; + lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, lgr_type, lgr->pnet_id); +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_conn_kill(conn, true); + release_sock(&smc->sk); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + struct smc_link *link; + + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER || + 
!rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) + continue; + + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (link) + smc_llc_add_link_local(link); + } +} + +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + to_lnk = smc_switch_conns(lgr, lnk, true); + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk, true); + return; + } + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + mutex_unlock(&lgr->llc_conf_mutex); + wait_event_timeout(lgr->llc_flow_waiter, + (list_empty(&lgr->list) || + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + mutex_lock(&lgr->llc_conf_mutex); + } + if (!list_empty(&lgr->list)) { + smc_llc_send_delete_link(to_lnk, del_link_id, + SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + smcr_link_clear(lnk, true); + } + wake_up(&lgr->llc_flow_waiter); /* wake up next waiter */ + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); + smcr_link_down(lnk); + } +} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); + schedule_work(&lnk->link_down_wrk); + } +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; 
+ int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_all(&lgr->llc_msg_waiter); + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down(link); + mutex_unlock(&lgr->llc_conf_mutex); +} + +static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + unsigned short *vlan_id = (unsigned short *)priv->data; + + if (is_vlan_dev(lower_dev)) { + *vlan_id = vlan_dev_vlan_id(lower_dev); + return 1; + } + + return 0; +} + +/* Determine vlan of internal TCP socket. 
*/ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct netdev_nested_priv priv; struct net_device *ndev; - int i, nest_lvl, rc = 0; + int rc = 0; ini->vlan_id = 0; if (!dst) { @@ -788,20 +1799,9 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) goto out_rel; } + priv.data = (void *)&ini->vlan_id; rtnl_lock(); - nest_lvl = ndev->lower_level; - for (i = 0; i < nest_lvl; i++) { - struct list_head *lower = &ndev->adj_list.lower; - - if (list_empty(lower)) - break; - lower = lower->next; - ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); - if (is_vlan_dev(ndev)) { - ini->vlan_id = vlan_dev_vlan_id(ndev); - break; - } - } + netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); rtnl_unlock(); out_rel: @@ -810,19 +1810,35 @@ out: return rc; } -static bool smcr_lgr_match(struct smc_link_group *lgr, - struct smc_clc_msg_local *lcl, - enum smc_lgr_role role, u32 clcqpn) +static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, + u8 peer_systemid[], + u8 peer_gid[], + u8 peer_mac_v1[], + enum smc_lgr_role role, u32 clcqpn, + struct net *net) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + struct smc_link *lnk; + int i; + + if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + + if (!smc_link_active(lnk)) + continue; + /* use verbs API to check netns, instead of lgr->net */ + if (!rdma_dev_access_netns(lnk->smcibdev->ibdev, net)) + return false; + if ((lgr->role == SMC_SERV || lnk->peer_qpn == clcqpn) && + 
!memcmp(lnk->peer_gid, peer_gid, SMC_GID_SIZE) && + (smcr_version == SMC_V2 || + !memcmp(lnk->peer_mac, peer_mac_v1, ETH_ALEN))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -835,17 +1851,20 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; + struct net *net = sock_net(&smc->sk); struct list_head *lgr_list; struct smc_link_group *lgr; enum smc_lgr_role role; spinlock_t *lgr_lock; int rc = 0; - lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list; - lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock; - ini->cln_first_contact = SMC_FIRST_CONTACT; + lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list : + &smc_lgr_list.list; + lgr_lock = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_lock : + &smc_lgr_list.lock; + ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - if (role == SMC_CLNT && ini->srv_first_contact) + if (role == SMC_CLNT && ini->first_contact_peer) /* create new link group as well */ goto create; @@ -854,27 +1873,35 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? 
- smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) : - smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && + smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], + ini->ism_peer_gid[ini->ism_selected]) : + smcr_lgr_match(lgr, ini->smcr_version, + ini->peer_systemid, + ini->peer_gid, ini->peer_mac, role, + ini->ib_clcqpn, net)) && !lgr->sync_err && - lgr->vlan_id == ini->vlan_id && - (role == SMC_CLNT || - lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { + (ini->smcd_version == SMC_V2 || + lgr->vlan_id == ini->vlan_id) && + (role == SMC_CLNT || ini->is_smcd || + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ - ini->cln_first_contact = SMC_REUSE_CONTACT; + ini->first_contact_local = 0; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; - if (role == SMC_CLNT && !ini->srv_first_contact && - ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (role == SMC_CLNT && !ini->first_contact_peer && + ini->first_contact_local) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. 
error @@ -883,21 +1910,33 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) } create: - if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + if (ini->first_contact_local) { rc = smc_lgr_create(smc, ini); if (rc) goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); + if (rc) { + smc_lgr_cleanup_early(lgr); + goto out; + } } + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + if (!conn->lgr->is_smcd) + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ + conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + init_waitqueue_head(&conn->cdc_pend_tx_wq); + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ + } else { + conn->rx_off = 0; } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); @@ -907,21 +1946,30 @@ out: return rc; } -/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. 
*/ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + return compressed; } @@ -938,19 +1986,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -960,69 +2008,259 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { - return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) +/* map an buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) { - struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; + int rc, i, nents, offset, buf_size, size, access_flags; + struct scatterlist *sg; + void *buf; - /* try to alloc a new buffer */ - buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); - if 
(!buf_desc) - return ERR_PTR(-ENOMEM); + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; - buf_desc->order = get_order(bufsize); - buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_COMP | - __GFP_NORETRY | __GFP_ZERO, - buf_desc->order); - if (!buf_desc->pages) { - kfree(buf_desc); - return ERR_PTR(-EAGAIN); - } - buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); - - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (buf_desc->is_vm) { + buf = buf_desc->cpu_addr; + buf_size = buf_desc->len; + offset = offset_in_page(buf_desc->cpu_addr); + nents = PAGE_ALIGN(buf_size + offset) / PAGE_SIZE; + } else { + nents = 1; + } + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], nents, GFP_KERNEL); + if (rc) + return rc; + + if (buf_desc->is_vm) { + /* virtually contiguous buffer */ + for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { + size = min_t(int, PAGE_SIZE - offset, buf_size); + sg_set_page(sg, vmalloc_to_page(buf), size, offset); + buf += size / sizeof(*buf); + buf_size -= size; + offset = 0; + } + } else { + /* physically contiguous buffer */ + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, - buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, + rc = smc_ib_buf_map_sg(lnk, buf_desc, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); + if (rc != nents) { + rc = -EAGAIN; + goto free_table; } - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + buf_desc->is_dma_need_sync |= + smc_ib_is_sg_need_sync(lnk, buf_desc) << lnk->link_idx; + + if (is_rmb || buf_desc->is_vm) { + /* create a new memory region for the RMB or vzalloced sndbuf */ + access_flags = is_rmb ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_LOCAL_WRITE; + + rc = smc_ib_get_memory_region(lnk->roce_pd, access_flags, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + +/* register a new buf on IB device, rmb or vzalloced sndbuf + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!buf_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new buf */ + if (buf_desc->is_vm) + buf_desc->mr[link->link_idx]->iova = + (uintptr_t)buf_desc->cpu_addr; + if (smc_wr_reg_send(link, buf_desc->mr[link->link_idx])) { + buf_desc->is_reg_err = true; + return -EFAULT; } + buf_desc->is_reg_mr[link->link_idx] = true; } + return 0; +} - buf_desc->len = bufsize; +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + mutex_lock(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + mutex_unlock(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + /* reg all RMBs for a new link */ + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if 
(!buf_desc->used) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->rmbs_lock); + return rc; + } + } + } + mutex_unlock(&lgr->rmbs_lock); + + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + return rc; + + /* reg all vzalloced sndbufs for a new link */ + mutex_lock(&lgr->sndbufs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { + if (!buf_desc->used || !buf_desc->is_vm) + continue; + rc = smcr_link_reg_buf(lnk, buf_desc); + if (rc) { + mutex_unlock(&lgr->sndbufs_lock); + return rc; + } + } + } + mutex_unlock(&lgr->sndbufs_lock); + return rc; +} + +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + + /* try to alloc a new buffer */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + + switch (lgr->buf_type) { + case SMCR_PHYS_CONT_BUFS: + case SMCR_MIXED_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (buf_desc->pages) { + buf_desc->cpu_addr = + (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + buf_desc->is_vm = false; + break; + } + if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) + goto out; + fallthrough; // try virtually continguous buf + case SMCR_VIRT_CONT_BUFS: + buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); + if (!buf_desc->cpu_addr) + goto out; + buf_desc->pages = NULL; + buf_desc->len = bufsize; + buf_desc->is_vm = true; + break; + } return buf_desc; + +out: + kfree(buf_desc); + return ERR_PTR(-EAGAIN); } -#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 
6 -> 1MB */ +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0, cnt = 0; + + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (!smc_link_usable(lnk)) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + rc = -ENOMEM; + goto out; + } + cnt++; + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + if (!rc && !cnt) + rc = -EINVAL; + return rc; +} static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) @@ -1030,9 +2268,6 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc; int rc; - if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -1041,7 +2276,11 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return ERR_PTR(-EAGAIN); + if (rc == -ENOMEM) + return ERR_PTR(-EAGAIN); + if (rc == -ENOSPC) + return ERR_PTR(-ENOSPC); + return ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. 
So, pretend it was smaller */ @@ -1066,19 +2305,19 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + bool is_dgraded = false; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + sk_buf_size = smc->sk.sk_rcvbuf; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; + sk_buf_size = smc->sk.sk_sndbuf; - for (bufsize_short = smc_compress_bufsize(sk_buf_size); + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { - if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; @@ -1087,13 +2326,13 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { - memset(buf_desc->cpu_addr, 0, bufsize); + buf_desc->is_dma_need_sync = 0; + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } @@ -1104,23 +2343,37 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (PTR_ERR(buf_desc) == -ENOMEM) break; - if (IS_ERR(buf_desc)) + if (IS_ERR(buf_desc)) { + if (!is_dgraded) { + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); + } continue; + } + SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + 
mutex_unlock(lock); break; /* found */ } if (IS_ERR(buf_desc)) - return -ENOMEM; + return PTR_ERR(buf_desc); + + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, is_rmb, lgr); + return -ENOMEM; + } + } if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -1128,50 +2381,36 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) -{ - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) - return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); -} - void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->sndbuf_desc->is_dma_need_sync) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) + return; + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->rmb_desc->is_dma_need_sync) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); -} - -void smc_rmb_sync_sg_for_device(struct smc_connection *conn) -{ - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || 
conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; @@ -1190,8 +2429,13 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) return rc; /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); - if (rc) + if (rc) { + mutex_lock(&smc->conn.lgr->sndbufs_lock); + list_del(&smc->conn.sndbuf_desc->list); + mutex_unlock(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); + smc->conn.sndbuf_desc = NULL; + } return rc; } @@ -1206,16 +2450,64 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; 
link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1224,23 +2516,25 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } -/* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +/* delete an rtoken from all links */ +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } 
clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1250,10 +2544,11 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, - clc->rmb_rkey); + conn->rtoken_idx = smc_rtoken_add(lnk, clc->r0.rmb_dma_addr, + clc->r0.rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; @@ -1264,20 +2559,20 @@ static void smc_core_going_away(void) struct smc_ib_device *smcibdev; struct smcd_dev *smcd; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { int i; for (i = 0; i < SMC_MAX_PORTS; i++) set_bit(i, smcibdev->ports_going_away); } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { smcd->going_away = 1; } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } /* Clean up all SMC link groups */ @@ -1289,10 +2584,10 @@ static void smc_lgrs_shutdown(void) smc_smcr_terminate_all(NULL); - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) smc_smcd_terminate_all(smcd); - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } static int smc_core_reboot_event(struct notifier_block *this, diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 234ae25f0025..285f9bd8e232 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,7 +13,10 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/smc.h> +#include <linux/pci.h> #include <rdma/ib_verbs.h> +#include <net/genetlink.h> #include "smc.h" #include "smc_ib.h" @@ -32,18 +35,23 @@ enum smc_lgr_role { 
/* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ +#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; +struct smc_wr_v2_buf { + u8 raw[SMC_WR_BUF_V2_SIZE]; +}; + #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { @@ -70,6 +78,8 @@ struct smc_rdma_wr { /* work requests per message struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; +#define SMC_LGR_ID_SIZE 4 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -85,24 +95,34 @@ struct smc_link { struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ + struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ + struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ + struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + atomic_t wr_tx_refcnt; /* tx refs to link */ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; 
/* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ + u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ + wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ + atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ @@ -115,34 +135,30 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ + u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? 
*/ + u8 clearing : 1; /* link is being cleared */ + refcount_t refcnt; /* link reference count */ + struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ + char ibname[IB_DEVICE_NAME_MAX]; /* ib device name */ + int ndev_ifidx; /* network device ifindex */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ - struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ - int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ - struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ + atomic_t conn_cnt; /* connections on this link */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). 
*/ -#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 -#define SMC_FIRST_CONTACT 1 /* first contact to a peer */ -#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/ - /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { struct list_head list; @@ -150,25 +166,37 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr[SMC_LINKS_PER_LGR_MAX]; + /* memory region: for rmb and + * vzalloced sndbuf + * incl. rkey provided to peer + * and lkey provided to local + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_dma_need_sync; + u8 is_reg_err; + /* buffer registration err */ + u8 is_vm; + /* virtually contiguous */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; @@ -178,7 +206,6 @@ struct smc_rtoken { /* address/key of remote RMB */ u32 rkey; }; -#define SMC_LGR_ID_SIZE 4 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, @@ -188,6 +215,35 
@@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + +enum smcr_buf_type { /* types of SMC-R sndbufs and RMBs */ + SMCR_PHYS_CONT_BUFS = 0, + SMCR_VIRT_CONT_BUFS = 1, + SMCR_MIXED_BUFS = 2, +}; + +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_REQ_ADD_LINK = 5, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -196,25 +252,35 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ struct work_struct terminate_work; /* abnormal lgr termination */ + struct workqueue_struct *tx_wq; /* wq for conn. 
tx workers */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ - u8 freefast : 1; /* free worker scheduled fast */ u8 freeing : 1; /* lgr is being freed */ + refcount_t refcnt; /* lgr reference count */ bool is_smcd; /* SMC-R or SMC-D */ + u8 smc_version; + u8 negotiated_eid[SMC_MAX_EID_LEN]; + u8 peer_os; /* peer operating system */ + u8 peer_smc_release; + u8 peer_hostname[SMC_MAX_HOSTNAME_LEN]; union { struct { /* SMC-R */ enum smc_lgr_role role; /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ + struct smc_wr_v2_buf *wr_rx_buf_v2; + /* WR v2 recv payload buffer */ + struct smc_wr_v2_buf *wr_tx_buf_v2; + /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] @@ -222,6 +288,43 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; + enum smc_lgr_type type; + enum smcr_buf_type buf_type; + /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct mutex llc_conf_mutex; + /* protects lgr reconfig. 
*/ + struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; + struct work_struct llc_event_work; + /* llc event worker */ + wait_queue_head_t llc_flow_waiter; + /* w4 next llc event */ + wait_queue_head_t llc_msg_waiter; + /* w4 next llc msg */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ + int llc_testlink_time; + /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ + u8 nexthop_mac[ETH_ALEN]; + u8 uses_gateway; + __be32 saddr; + /* net namespace */ + struct net *net; }; struct { /* SMC-D */ u64 peer_gid; @@ -236,20 +339,58 @@ struct smc_link_group { struct smc_clc_msg_local; +#define GID_LIST_SIZE 2 + +struct smc_gidlist { + u8 len; + u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; +}; + +struct smc_init_info_smcrv2 { + /* Input fields */ + __be32 saddr; + struct sock *clc_sk; + __be32 daddr; + + /* Output fields when saddr is set */ + struct smc_ib_device *ib_dev_v2; + u8 ib_port_v2; + u8 ib_gid_v2[SMC_GID_SIZE]; + + /* Additional output fields when clc_sk and daddr is set as well */ + u8 uses_gateway; + u8 nexthop_mac[ETH_ALEN]; + + struct smc_gidlist gidlist; +}; + struct smc_init_info { u8 is_smcd; + u8 smc_type_v1; + u8 smc_type_v2; + u8 first_contact_peer; + u8 first_contact_local; unsigned short vlan_id; - int srv_first_contact; - int cln_first_contact; + u32 rc; + u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ - struct smc_clc_msg_local *ib_lcl; + u8 smcr_version; + u8 check_smcrv2; + u8 peer_gid[SMC_GID_SIZE]; + u8 peer_mac[ETH_ALEN]; + u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; + struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ - u64 ism_gid; - struct smcd_dev *ism_dev; + u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; + 
struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; + u16 ism_chid[SMC_MAX_ISM_DEVS + 1]; + u8 ism_offered_cnt; /* # of ISM devices offered */ + u8 ism_selected; /* index of selected ISM dev*/ + u8 smcd_version; }; /* Find the connection associated with the given alert token in the link group. @@ -285,34 +426,109 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } -static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr) +static inline bool smc_conn_lgr_valid(struct smc_connection *conn) { - if (!lgr->terminating && !lgr->freeing) - schedule_work(&lgr->terminate_work); + return conn->lgr && conn->alert_token_local; +} + +/* + * Returns true if the specified link is usable. + * + * usable means the link is ready to receive RDMA messages, map memory + * on the link, etc. This doesn't ensure we are able to send RDMA messages + * on this link, if sending RDMA messages is needed, use smc_link_sendable() + */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + +/* + * Returns true if the specified link is ready to receive AND send RDMA + * messages. + * + * For the client side in first contact, the underlying QP may still in + * RESET or RTR when the link state is ACTIVATING, checks in smc_link_usable() + * is not strong enough. 
For those places that need to send any CDC or LLC + * messages, use smc_link_sendable(), otherwise, use smc_link_usable() instead + */ +static inline bool smc_link_sendable(struct smc_link *lnk) +{ + return smc_link_usable(lnk) && + lnk->qp_attr.cur_qp_state == IB_QPS_RTS; +} + +static inline bool smc_link_active(struct smc_link *lnk) +{ + return lnk->state == SMC_LNK_ACTIVE; +} + +static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +struct smc_pci_dev { + __u32 pci_fid; + __u16 pci_pchid; + __u16 pci_vendor; + __u16 pci_device; + __u8 pci_id[SMC_PCI_ID_STR_LEN]; +}; + +static inline void smc_set_pci_values(struct pci_dev *pci_dev, + struct smc_pci_dev *smc_dev) +{ + smc_dev->pci_vendor = pci_dev->vendor; + smc_dev->pci_device = pci_dev->device; + snprintf(smc_dev->pci_id, sizeof(smc_dev->pci_id), "%s", + pci_name(pci_dev)); +#if IS_ENABLED(CONFIG_S390) + { /* Set s390 specific PCI information */ + struct zpci_dev *zdev; + + zdev = to_zpci(pci_dev); + smc_dev->pci_fid = zdev->fid; + smc_dev->pci_pchid = zdev->pchid; + } +#endif } struct smc_sock; struct smc_clc_msg_accept_confirm; -struct smc_clc_msg_local; -void smc_lgr_forget(struct smc_link_group *lgr); -void smc_lgr_cleanup_early(struct smc_connection *conn); -void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); +void smc_lgr_cleanup_early(struct smc_link_group *lgr); +void smc_lgr_terminate_sched(struct smc_link_group *lgr); +void smc_lgr_hold(struct smc_link_group *lgr); +void smc_lgr_put(struct smc_link_group *lgr); +void smcr_port_add(struct 
smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); -void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); -void smc_rmb_sync_sg_for_device(struct smc_connection *conn); int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); @@ -321,8 +537,30 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); +void smcr_link_clear(struct smc_link *lnk, bool log); +void smcr_link_hold(struct smc_link *lnk); +void smcr_link_put(struct smc_link *lnk); +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk); +int smcr_buf_map_lgr(struct 
smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); +int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); +int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); +int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index e1f64f4ba236..80ea7d954ece 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -22,17 +22,13 @@ #include "smc.h" #include "smc_core.h" -static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +struct smc_diag_dump_ctx { + int pos[2]; +}; + +static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) { - sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", - be16_to_cpu(((__be16 *)gid_raw)[0]), - be16_to_cpu(((__be16 *)gid_raw)[1]), - be16_to_cpu(((__be16 *)gid_raw)[2]), - be16_to_cpu(((__be16 *)gid_raw)[3]), - be16_to_cpu(((__be16 *)gid_raw)[4]), - be16_to_cpu(((__be16 *)gid_raw)[5]), - be16_to_cpu(((__be16 *)gid_raw)[6]), - be16_to_cpu(((__be16 *)gid_raw)[7])); + return (struct smc_diag_dump_ctx *)cb->ctx; } static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) @@ -93,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r->diag_state = 
sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; - else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; @@ -146,37 +142,39 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { + struct smc_link *link = smc->conn.lnk; + struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, - .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, - .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + .lnk[0].ibport = link->ibport, + .lnk[0].link_id = link->link_id, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, - sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); - smc_gid_be16_convert(linfo.lnk[0].gid, - smc->conn.lgr->lnk[0].gid); - smc_gid_be16_convert(linfo.lnk[0].peer_gid, - smc->conn.lgr->lnk[0].peer_gid); + sizeof(link->smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } - if (smc->conn.lgr && smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; - struct smcd_diag_dmbinfo dinfo = { - .linkid = *((u32 *)conn->lgr->id), - .peer_gid = conn->lgr->peer_gid, - .my_gid = conn->lgr->smcd->local_gid, - .token = conn->rmb_desc->token, - .peer_token = conn->peer_token - }; + struct smcd_diag_dmbinfo dinfo; + + memset(&dinfo, 0, sizeof(dinfo)); + + dinfo.linkid = *((u32 *)conn->lgr->id); + dinfo.peer_gid = conn->lgr->peer_gid; + 
dinfo.my_gid = conn->lgr->smcd->local_gid; + dinfo.token = conn->rmb_desc->token; + dinfo.peer_token = conn->peer_token; if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) goto errout; @@ -191,13 +189,15 @@ errout: } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, int p_type) { + struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); + int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; + int rc = 0, num = 0; struct sock *sk; - int rc = 0; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; @@ -207,13 +207,18 @@ static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; + if (num < snum) + goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); - if (rc) - break; + if (rc < 0) + goto out; +next: + num++; } out: read_unlock(&prot->h.smc_hash->lock); + cb_ctx->pos[p_type] = num; return rc; } @@ -221,10 +226,10 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; - rc = smc_diag_dump_proto(&smc_proto, skb, cb); + rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) - rc = smc_diag_dump_proto(&smc_proto6, skb, cb); - return rc; + smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); + return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) @@ -263,3 +268,4 @@ module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); +MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 05b825b3cfa4..854772dd52fd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -12,10 +12,14 @@ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ +#include <linux/etherdevice.h> 
+#include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> #include <linux/scatterlist.h> #include <linux/wait.h> +#include <linux/mutex.h> +#include <linux/inetdevice.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -24,6 +28,7 @@ #include "smc_core.h" #include "smc_wr.h" #include "smc.h" +#include "smc_netlink.h" #define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ @@ -33,15 +38,11 @@ #define SMC_QP_RNR_RETRY 7 /* 7: infinite */ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ - .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock), + .mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex), .list = LIST_HEAD_INIT(smc_ib_devices.list), }; -#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%" - -u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system - * identifier - */ +u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ static int smc_ib_modify_qp_init(struct smc_link *lnk) { @@ -64,16 +65,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; struct ib_qp_attr qp_attr; + u8 hop_lim = 1; memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IB_QPS_RTR; qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); - rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + hop_lim = IPV6_DEFAULT_HOPLIMIT; + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0); rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); - memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, - sizeof(lnk->peer_mac)); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac, + sizeof(lnk->lgr->nexthop_mac)); + else + 
memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); qp_attr.dest_qp_num = lnk->peer_qpn; qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming @@ -103,12 +111,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk) IB_QP_MAX_QP_RD_ATOMIC); } -int smc_ib_modify_qp_reset(struct smc_link *lnk) +int smc_ib_modify_qp_error(struct smc_link *lnk) { struct ib_qp_attr qp_attr; memset(&qp_attr, 0, sizeof(qp_attr)); - qp_attr.qp_state = IB_QPS_RESET; + qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); } @@ -168,6 +176,15 @@ static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, { memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], sizeof(smcibdev->mac[ibport - 1])); +} + +bool smc_ib_is_valid_local_systemid(void) +{ + return !is_zero_ether_addr(&local_systemid[2]); +} + +static void smc_ib_init_local_systemid(void) +{ get_random_bytes(&local_systemid[0], 2); } @@ -176,9 +193,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway) +{ + struct neighbour *neigh = NULL; + struct rtable *rt = NULL; + struct flowi4 fl4 = { + .saddr = saddr, + .daddr = daddr + }; + + if (daddr == cpu_to_be32(INADDR_NONE)) + goto out; + rt = ip_route_output_flow(&init_net, &fl4, NULL); + if (IS_ERR(rt)) + goto out; + if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) + goto out; + neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); + if (neigh) { + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + return 0; + } +out: + return -ENOENT; +} + +static int smc_ib_determine_gid_rcu(const struct net_device *ndev, + const struct ib_gid_attr *attr, + u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) +{ + if (!smcrv2 
&& attr->gid_type == IB_GID_TYPE_ROCE) { + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } + if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { + struct in_device *in_dev = __in_dev_get_rcu(ndev); + const struct in_ifaddr *ifa; + bool subnet_match = false; + + if (!in_dev) + goto out; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!inet_ifa_match(smcrv2->saddr, ifa)) + continue; + subnet_match = true; + break; + } + if (!subnet_match) + goto out; + if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr, + smcrv2->daddr, + smcrv2->nexthop_mac, + &smcrv2->uses_gateway)) + goto out; + + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } +out: + return -ENODEV; +} + /* determine the gid for an ib-device port and vlan id */ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index) + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) { const struct ib_gid_attr *attr; const struct net_device *ndev; @@ -192,17 +281,15 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (!IS_ERR(ndev) && - ((!vlan_id && !is_vlan_dev(attr->ndev)) || - (vlan_id && is_vlan_dev(attr->ndev) && - vlan_dev_vlan_id(attr->ndev) == vlan_id)) && - attr->gid_type == IB_GID_TYPE_ROCE) { - rcu_read_unlock(); - if (gid) - memcpy(gid, &attr->gid, SMC_GID_SIZE); - if (sgid_index) - *sgid_index = attr->index; - rdma_put_gid_attr(attr); - return 0; + ((!vlan_id && !is_vlan_dev(ndev)) || + (vlan_id && is_vlan_dev(ndev) && + vlan_dev_vlan_id(ndev) == vlan_id))) { + if (!smc_ib_determine_gid_rcu(ndev, attr, gid, + sgid_index, smcrv2)) { + rcu_read_unlock(); + rdma_put_gid_attr(attr); + return 0; + } } rcu_read_unlock(); 
rdma_put_gid_attr(attr); @@ -210,6 +297,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, return -ENODEV; } +/* check if gid is still defined on smcibdev */ +static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2, + struct smc_ib_device *smcibdev, u8 ibport) +{ + const struct ib_gid_attr *attr; + bool rc = false; + int i; + + for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); + if (IS_ERR(attr)) + continue; + + rcu_read_lock(); + if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) || + (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + !(ipv6_addr_type((const struct in6_addr *)&attr->gid) + & IPV6_ADDR_LINKLOCAL))) + if (!memcmp(gid, &attr->gid, SMC_GID_SIZE)) + rc = true; + rcu_read_unlock(); + rdma_put_gid_attr(attr); + } + return rc; +} + +/* check all links if the gid is still defined on smcibdev */ +static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr; + int i; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (!smc_ib_check_link_gid(lgr->lnk[i].gid, + lgr->smc_version == SMC_V2, + smcibdev, ibport)) + smcr_port_err(smcibdev, ibport); + } + } + spin_unlock_bh(&smc_lgr_list.lock); +} + static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) { int rc; @@ -224,8 +363,7 @@ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) rc = smc_ib_fill_mac(smcibdev, ibport); if (rc) goto out; - if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, - sizeof(local_systemid)) && + if 
(!smc_ib_is_valid_local_systemid() && smc_ib_port_active(smcibdev, ibport)) /* create unique system identifier */ smc_ib_define_local_systemid(smcibdev, ibport); @@ -245,9 +383,11 @@ static void smc_ib_port_event_work(struct work_struct *work) clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) { set_bit(port_idx, smcibdev->ports_going_away); - smc_port_terminate(smcibdev, port_idx + 1); + smcr_port_err(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); + smc_ib_gid_check(smcibdev, port_idx + 1); } } } @@ -257,6 +397,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, struct ib_event *ibevent) { struct smc_ib_device *smcibdev; + bool schedule = false; u8 port_idx; smcibdev = container_of(handler, struct smc_ib_device, event_handler); @@ -266,22 +407,35 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, /* terminate all ports on device */ for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) { set_bit(port_idx, &smcibdev->port_event_mask); - set_bit(port_idx, smcibdev->ports_going_away); + if (!test_and_set_bit(port_idx, + smcibdev->ports_going_away)) + schedule = true; } - schedule_work(&smcibdev->port_event_work); + if (schedule) + schedule_work(&smcibdev->port_event_work); break; - case IB_EVENT_PORT_ERR: case IB_EVENT_PORT_ACTIVE: - case IB_EVENT_GID_CHANGE: port_idx = ibevent->element.port_num - 1; - if (port_idx < SMC_MAX_PORTS) { - set_bit(port_idx, &smcibdev->port_event_mask); - if (ibevent->event == IB_EVENT_PORT_ERR) - set_bit(port_idx, smcibdev->ports_going_away); - else if (ibevent->event == IB_EVENT_PORT_ACTIVE) - clear_bit(port_idx, smcibdev->ports_going_away); + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (test_and_clear_bit(port_idx, smcibdev->ports_going_away)) schedule_work(&smcibdev->port_event_work); - } + break; + case 
IB_EVENT_PORT_ERR: + port_idx = ibevent->element.port_num - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) + schedule_work(&smcibdev->port_event_work); + break; + case IB_EVENT_GID_CHANGE: + port_idx = ibevent->element.port_num - 1; + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); break; default: break; @@ -306,6 +460,171 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) return rc; } +static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr, + struct smc_ib_device *smcibdev) +{ + struct smc_link_group *lgr; + bool rc = false; + int i; + + spin_lock_bh(&smc_lgr->lock); + list_for_each_entry(lgr, &smc_lgr->list, list) { + if (lgr->is_smcd) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (lgr->type == SMC_LGR_SINGLE || + lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) { + rc = true; + goto out; + } + } + } +out: + spin_unlock_bh(&smc_lgr->lock); + return rc; +} + +static int smc_nl_handle_dev_port(struct sk_buff *skb, + struct ib_device *ibdev, + struct smc_ib_device *smcibdev, + int port) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct nlattr *port_attrs; + unsigned char port_state; + int lnk_count = 0; + + port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port); + if (!port_attrs) + goto errout; + + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, + smcibdev->pnetid_by_user[port])) + goto errattr; + memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errattr; + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV, + smcibdev->ndev_ifidx[port])) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1)) + goto errattr; + port_state = 
smc_ib_port_active(smcibdev, port + 1); + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state)) + goto errattr; + lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]); + if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count)) + goto errattr; + nla_nest_end(skb, port_attrs); + return 0; +errattr: + nla_nest_cancel(skb, port_attrs); +errout: + return -EMSGSIZE; +} + +static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor)) + return false; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device)) + return false; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id)) + return false; + return true; +} + +static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_ibname[IB_DEVICE_NAME_MAX]; + struct smc_pci_dev smc_pci_dev; + struct pci_dev *pci_dev; + unsigned char is_crit; + struct nlattr *attrs; + void *nlh; + int i; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCR); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR); + if (!attrs) + goto errout; + is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev); + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit)) + goto errattr; + if (smcibdev->ibdev->dev.parent) { + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent); + smc_set_pci_values(pci_dev, &smc_pci_dev); + if (!smc_nl_handle_pci_values(&smc_pci_dev, skb)) + goto errattr; + } + snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name); + if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname)) + goto 
errattr; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(smcibdev->ibdev, i)) + continue; + if (smc_nl_handle_dev_port(skb, smcibdev->ibdev, + smcibdev, i - 1)) + goto errattr; + } + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct smc_ib_device *smcibdev; + int snum = cb_ctx->pos[0]; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcibdev, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcr_dev(smcibdev, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb); + return skb->len; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -316,11 +635,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) case IB_EVENT_QP_FATAL: case IB_EVENT_QP_ACCESS_ERR: port_idx = ibevent->element.qp->port - 1; - if (port_idx < SMC_MAX_PORTS) { - set_bit(port_idx, &smcibdev->port_event_mask); - set_bit(port_idx, smcibdev->ports_going_away); + if (port_idx >= SMC_MAX_PORTS) + break; + set_bit(port_idx, &smcibdev->port_event_mask); + if (!test_and_set_bit(port_idx, smcibdev->ports_going_away)) schedule_work(&smcibdev->port_event_work); - } break; default: break; @@ -337,6 +656,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -350,7 +670,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = 1, + .max_recv_sge = sges_per_buf, + .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, @@ -371,15 +692,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -387,41 +708,68 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr[link_idx]); + buf_slot->mr[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != + buf_slot->sgt[link_idx].orig_nents) return -EINVAL; return 0; } +bool 
smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot) +{ + struct scatterlist *sg; + unsigned int i; + bool ret = false; + + /* for now there is just one DMA address */ + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { + if (!sg_dma_len(sg)) + break; + if (dma_need_sync(lnk->smcibdev->ibdev->dma_device, + sg_dma_address(sg))) { + ret = true; + goto out; + } + } + +out: + return ret; +} + /* synchronize buffer usage for cpu access */ -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -429,19 +777,22 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { struct scatterlist *sg; unsigned int i; + if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx))) + return; + /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + 
ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -449,15 +800,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -465,18 +816,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -486,6 +837,10 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) int cqe_size_order, smc_order; long rc; + mutex_lock(&smcibdev->mutex); + rc = 0; + if (smcibdev->initialized) + goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 
7 : 6; smc_order = MAX_ORDER - cqe_size_order - 1; @@ -497,7 +852,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); if (IS_ERR(smcibdev->roce_cq_send)) { smcibdev->roce_cq_send = NULL; - return rc; + goto out; } smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, smc_wr_rx_cq_handler, NULL, @@ -509,46 +864,94 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; - return rc; + goto out; err: ib_destroy_cq(smcibdev->roce_cq_send); +out: + mutex_unlock(&smcibdev->mutex); return rc; } static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) { + mutex_lock(&smcibdev->mutex); if (!smcibdev->initialized) - return; + goto out; smcibdev->initialized = 0; ib_destroy_cq(smcibdev->roce_cq_recv); ib_destroy_cq(smcibdev->roce_cq_send); smc_wr_remove_dev(smcibdev); +out: + mutex_unlock(&smcibdev->mutex); } static struct ib_client smc_ib_client; +static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) +{ + struct ib_device *ibdev = smcibdev->ibdev; + struct net_device *ndev; + + if (!ibdev->ops.get_netdev) + return; + ndev = ibdev->ops.get_netdev(ibdev, port + 1); + if (ndev) { + smcibdev->ndev_ifidx[port] = ndev->ifindex; + dev_put(ndev); + } +} + +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) +{ + struct smc_ib_device *smcibdev; + struct ib_device *libdev; + struct net_device *lndev; + u8 port_cnt; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(smcibdev, &smc_ib_devices.list, list) { + port_cnt = smcibdev->ibdev->phys_port_cnt; + for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { + libdev = smcibdev->ibdev; + if (!libdev->ops.get_netdev) + continue; + lndev = libdev->ops.get_netdev(libdev, i + 1); + dev_put(lndev); + if (lndev != ndev) + continue; + if (event == NETDEV_REGISTER) + smcibdev->ndev_ifidx[i] = ndev->ifindex; + if (event == NETDEV_UNREGISTER) + 
smcibdev->ndev_ifidx[i] = 0; + } + } + mutex_unlock(&smc_ib_devices.mutex); +} + /* callback function for ib_register_client() */ -static void smc_ib_add_dev(struct ib_device *ibdev) +static int smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; u8 port_cnt; int i; if (ibdev->node_type != RDMA_NODE_IB_CA) - return; + return -EOPNOTSUPP; smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); if (!smcibdev) - return; + return -ENOMEM; smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); atomic_set(&smcibdev->lnk_cnt, 0); init_waitqueue_head(&smcibdev->lnks_deleted); - spin_lock(&smc_ib_devices.lock); + mutex_init(&smcibdev->mutex); + mutex_lock(&smc_ib_devices.mutex); list_add_tail(&smcibdev->list, &smc_ib_devices.list); - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); ib_set_client_data(ibdev, &smc_ib_client, smcibdev); INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, smc_ib_global_event_handler); @@ -556,29 +959,39 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* trigger reading of the port attributes */ port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); + smc_copy_netdev_ifindex(smcibdev, i); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? 
+ " (user defined)" : + ""); } schedule_work(&smcibdev->port_event_work); + return 0; } /* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { - struct smc_ib_device *smcibdev; + struct smc_ib_device *smcibdev = client_data; - smcibdev = ib_get_client_data(ibdev, &smc_ib_client); - if (!smcibdev || smcibdev->ibdev != ibdev) - return; - ib_set_client_data(ibdev, &smc_ib_client, NULL); - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); @@ -594,6 +1007,7 @@ static struct ib_client smc_ib_client = { int __init smc_ib_register_client(void) { + smc_ib_init_local_systemid(); return ib_register_client(&smc_ib_client); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 255db87547d3..034295676e88 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/mutex.h> #include <linux/wait.h> #include <rdma/ib_verbs.h> #include <net/smc.h> @@ -25,10 +26,11 @@ struct smc_ib_devices { /* list of smc ib devices definition */ struct list_head list; - spinlock_t lock; /* protects list of smc ib devices */ + struct mutex mutex; /* protects list of smc ib devices */ }; extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ +extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; @@ -51,18 +53,41 @@ struct smc_ib_device { /* ib-device infos for smc */ DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS); atomic_t lnk_cnt; /* number of links on ibdev */ wait_queue_head_t 
lnks_deleted; /* wait 4 removal of all links*/ + struct mutex mutex; /* protect dev setup+cleanup */ + atomic_t lnk_cnt_by_port[SMC_MAX_PORTS]; + /* number of links per port */ + int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ }; +static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) +{ + struct in6_addr *addr6 = (struct in6_addr *)gid; + + if (ipv6_addr_v4mapped(addr6) || + !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2])) + return addr6->s6_addr32[3]; + return cpu_to_be32(INADDR_NONE); +} + +static inline struct net *smc_ib_net(struct smc_ib_device *smcibdev) +{ + if (smcibdev && smcibdev->ibdev) + return read_pnet(&smcibdev->ibdev->coredev.rdma_net); + return NULL; +} + +struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; +void smc_ib_ndev_change(struct net_device *ndev, unsigned long event); int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -72,16 +97,24 @@ int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); +int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +bool 
smc_ib_is_sg_need_sync(struct smc_link *lnk, + struct smc_buf_desc *buf_slot); +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index); + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2); +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway); +bool smc_ib_is_valid_local_systemid(void); +int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..911fe08bc54b 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -6,7 +6,9 @@ * Copyright IBM Corp. 2018 */ +#include <linux/if_vlan.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/slab.h> #include <asm/page.h> @@ -14,28 +16,40 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" +#include "smc_netlink.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), - .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) + .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; -/* Test if an ISM communication is possible. */ +static bool smc_ism_v2_capable; +static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; + +/* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 
1 : 0, vlan_id); } -int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, - void *data, size_t len) +void smc_ism_get_system_eid(u8 **eid) { - int rc; + if (!smc_ism_v2_capable) + *eid = NULL; + else + *eid = smc_ism_v2_system_eid; +} - rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, - pos->offset, data, len); +u16 smc_ism_get_chid(struct smcd_dev *smcd) +{ + return smcd->ops->get_chid(smcd); +} - return rc < 0 ? rc : 0; +/* HW supports ISM V2 and thus System EID is defined */ +bool smc_ism_is_v2_capable(void) +{ + return smc_ism_v2_capable; } /* Set a connection using this DMBE. */ @@ -188,6 +202,97 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, return rc; } +static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smc_pci_dev smc_pci_dev; + struct nlattr *port_attrs; + struct nlattr *attrs; + int use_cnt = 0; + void *nlh; + + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_DEV_SMCD); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); + if (!attrs) + goto errout; + use_cnt = atomic_read(&smcd->lgr_cnt); + if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) + goto errattr; + memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); + smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) + goto errattr; + if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) + goto errattr; + + port_attrs = 
nla_nest_start(skb, SMC_NLA_DEV_PORT); + if (!port_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) + goto errportattr; + memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); + smc_pnet[SMC_MAX_PNETID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) + goto errportattr; + + nla_nest_end(skb, port_attrs); + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return 0; + +errportattr: + nla_nest_cancel(skb, port_attrs); +errattr: + nla_nest_cancel(skb, attrs); +errout: + nlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int snum = cb_ctx->pos[0]; + struct smcd_dev *smcd; + int num = 0; + + mutex_lock(&dev_list->mutex); + list_for_each_entry(smcd, &dev_list->list, list) { + if (num < snum) + goto next; + if (smc_nl_handle_smcd_dev(smcd, skb, cb)) + goto errout; +next: + num++; + } +errout: + mutex_unlock(&dev_list->mutex); + cb_ctx->pos[0] = num; +} + +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) +{ + smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); + return skb->len; +} + struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; @@ -291,47 +396,77 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, return NULL; } + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", + WQ_MEM_RECLAIM, name); + if (!smcd->event_wq) { + kfree(smcd->conn); + kfree(smcd); + return NULL; + } + smcd->dev.parent = parent; smcd->dev.release = smcd_release; device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); 
INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); - smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", - WQ_MEM_RECLAIM, name); - if (!smcd->event_wq) { - kfree(smcd->conn); - kfree(smcd); - return NULL; - } return smcd; } EXPORT_SYMBOL_GPL(smcd_alloc_dev); int smcd_register_dev(struct smcd_dev *smcd) { - spin_lock(&smcd_dev_list.lock); - list_add_tail(&smcd->list, &smcd_dev_list.list); - spin_unlock(&smcd_dev_list.lock); + int rc; + + mutex_lock(&smcd_dev_list.mutex); + if (list_empty(&smcd_dev_list.list)) { + u8 *system_eid = NULL; + + system_eid = smcd->ops->get_system_eid(); + if (system_eid[24] != '0' || system_eid[28] != '0') { + smc_ism_v2_capable = true; + memcpy(smc_ism_v2_system_eid, system_eid, + SMC_MAX_EID_LEN); + } + } + /* sort list: devices without pnetid before devices with pnetid */ + if (smcd->pnetid[0]) + list_add_tail(&smcd->list, &smcd_dev_list.list); + else + list_add(&smcd->list, &smcd_dev_list.list); + mutex_unlock(&smcd_dev_list.mutex); + + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&smcd->dev), smcd->pnetid, + smcd->pnetid_by_user ? 
" (user defined)" : ""); + + rc = device_add(&smcd->dev); + if (rc) { + mutex_lock(&smcd_dev_list.mutex); + list_del(&smcd->list); + mutex_unlock(&smcd_dev_list.mutex); + } - return device_add(&smcd->dev); + return rc; } EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { - spin_lock(&smcd_dev_list.lock); + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&smcd->dev)); + mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); smcd->going_away = 1; smc_smcd_terminate_all(smcd); - flush_workqueue(smcd->event_wq); destroy_workqueue(smcd->event_wq); device_del(&smcd->dev); @@ -373,13 +508,13 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer and DMB number. Find the connection and - * schedule the tasklet for this connection. + * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. 
*/ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) { struct smc_connection *conn = NULL; unsigned long flags; @@ -391,3 +526,9 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) spin_unlock_irqrestore(&smcd->lock, flags); } EXPORT_SYMBOL_GPL(smcd_handle_irq); + +void __init smc_ism_init(void) +{ + smc_ism_v2_capable = false; + memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); +} diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 4da946cbfa29..d6b2db604fe8 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -10,15 +10,17 @@ #define SMCD_ISM_H #include <linux/uio.h> +#include <linux/types.h> +#include <linux/mutex.h> #include "smc.h" struct smcd_dev_list { /* List of SMCD devices */ struct list_head list; - spinlock_t lock; /* Protects list of devices */ + struct mutex mutex; /* Protects list of devices */ }; -extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ struct smc_ism_vlanid { /* VLAN id set on ISM device */ struct list_head list; @@ -26,13 +28,6 @@ struct smc_ism_vlanid { /* VLAN id set on ISM device */ refcount_t refcnt; /* Reference count */ }; -struct smc_ism_position { /* ISM device position to write to */ - u64 token; /* Token of DMB */ - u32 offset; /* Offset into DMBE */ - u8 index; /* Index of DMBE */ - u8 signal; /* Generate interrupt on owner side */ -}; - struct smcd_dev; int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); @@ -43,7 +38,21 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); -int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, - void *data, size_t len); int 
smc_ism_signal_shutdown(struct smc_link_group *lgr); +void smc_ism_get_system_eid(u8 **eid); +u16 smc_ism_get_chid(struct smcd_dev *dev); +bool smc_ism_is_v2_capable(void); +void smc_ism_init(void); +int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); + +static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + return rc < 0 ? rc : 0; +} + #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index a9f6431dd69a..524649d0ab65 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -17,21 +17,30 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_llc.h" +#include "smc_pnet.h" #define SMC_LLC_DATA_LEN 40 struct smc_llc_hdr { struct smc_wr_rx_hdr common; - u8 length; /* 44 */ -#if defined(__BIG_ENDIAN_BITFIELD) - u8 reserved:4, - add_link_rej_rsn:4; + union { + struct { + u8 length; /* 44 */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 add_link_rej_rsn:4, - reserved:4; + u8 add_link_rej_rsn:4, + reserved:4; #endif + }; + u16 length_v2; /* 44 - 8192*/ + }; u8 flags; -}; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ #define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 @@ -58,11 +67,60 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 sender_gid[SMC_GID_SIZE]; u8 sender_qp_num[3]; u8 link_num; - u8 flags2; /* QP mtu */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif u8 initial_psn[3]; u8 reserved[8]; }; +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +struct smc_llc_msg_add_link_v2_ext { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + 
reserved : 7; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; +#endif + u8 reserved2; + u8 client_target_gid[SMC_GID_SIZE]; + u8 reserved3[8]; + u16 num_rkeys; + struct smc_llc_msg_add_link_cont_rt rt[]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_llc_msg_req_add_link_v2 { + struct smc_llc_hdr hd; + u8 reserved[20]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + #define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 #define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 @@ -90,7 +148,8 @@ struct smc_rmb_rtoken { __be64 rmb_vaddr; } __packed; /* format defined in RFC7609 */ -#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG_V2 255 struct smc_llc_msg_confirm_rkey { /* type 0x06 */ struct smc_llc_hdr hd; @@ -98,13 +157,8 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -116,13 +170,22 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 num_inval_rkeys; + u8 reserved[2]; + __be32 rkey[]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_req_add_link_v2 req_add_link; + struct 
smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -134,6 +197,181 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, + struct smc_llc_qentry *qentry) +{ + u8 msg_type = qentry->msg.raw.hdr.common.llc_type; + + if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) && + flow_type != msg_type && !lgr->delayed_event) { + lgr->delayed_event = qentry; + return; + } + /* drop parallel or already-in-progress llc requests */ + if (flow_type != msg_type) + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped parallel " + "LLC msg: msg %d flow %d role %d\n", + SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, + qentry->msg.raw.hdr.common.type, + flow_type, lgr->role); + kfree(qentry); +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + 
smc_llc_flow_parallel(lgr, flow->type, qentry); + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.llc_type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + smc_llc_flow_qentry_set(flow, qentry); + spin_unlock_bh(&lgr->llc_flow_lock); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote))), + SMC_LLC_WAIT_TIME * 10); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == 
&lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up(&lgr->llc_flow_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + u8 rcv_msg; + + wait_event_timeout(lgr->llc_msg_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type; + if (exp_msg && rcv_msg != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + rcv_msg == SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + pr_warn_once("smc: SMC-R lg %*phN net %llu dropped unexpected LLC msg: " + "msg %d exp %d flow %d role %d flags %x\n", + SMC_LGR_ID_SIZE, &lgr->id, lgr->net->net_cookie, + rcv_msg, exp_msg, + flow->type, lgr->role, + flow->qentry->msg.raw.hdr.flags); + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -182,23 +420,48 @@ static int smc_llc_add_pending_send(struct smc_link *link, return 0; } +static int smc_llc_add_pending_send_v2(struct smc_link *link, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + return 0; +} + +static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr, + struct smc_link_group *lgr, size_t len) +{ + if (lgr->smc_version == SMC_V2) { + hdr->common.llc_version = SMC_V2; + 
hdr->length_v2 = len; + } else { + hdr->common.llc_version = 0; + hdr->length = len; + } +} + /* high-level API to send LLC confirm link */ int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; confllc = (struct smc_llc_msg_confirm_link *)wr_buf; memset(confllc, 0, sizeof(*confllc)); - confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; - confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK; + smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc)); confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; @@ -207,35 +470,61 @@ int smc_llc_send_confirm_link(struct smc_link *link, memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; - memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. 
*/ + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; - rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc)); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (smc_link_active(link) && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + htonl(rmb_desc->mr[send_link->link_idx]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = rmb_desc->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); return rc; } @@ -248,90 +537,195 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; - rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } -/* prepare an add link message */ -static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], u8 gid[], - enum smc_llc_reqresp reqresp) +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) { - memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) { - addllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* always reject more links for now */ - addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; - addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if 
(buf_pos) + return buf_pos; + (*buf_lst)++; } - memcpy(addllc->sender_mac, mac, ETH_ALEN); - memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, + struct smc_link *link, struct smc_link *link_new) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_buf_desc *buf_pos; + int prim_lnk_idx, lnk_idx, i; + struct smc_buf_desc *rmb; + int len = sizeof(*ext); + int buf_lst; + + ext->v2_direct = !lgr->uses_gateway; + memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + mutex_lock(&lgr->rmbs_lock); + ext->num_rkeys = lgr->conns_num; + if (!ext->num_rkeys) + goto out; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + for (i = 0; i < ext->num_rkeys; i++) { + if (!buf_pos) + break; + rmb = buf_pos; + ext->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + ext->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + ext->rt[i].rmb_vaddr_new = rmb->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) : + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + } + len += i * sizeof(ext->rt[0]); +out: + mutex_unlock(&lgr->rmbs_lock); + return len; } /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp) { + struct smc_llc_msg_add_link_v2_ext *ext = NULL; struct smc_llc_msg_add_link *addllc; struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; + int len = sizeof(*addllc); int rc; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) - return rc; - addllc = (struct smc_llc_msg_add_link *)wr_buf; - smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - return rc; -} + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + if (link->lgr->smc_version == SMC_V2) { + struct smc_wr_v2_buf *wr_buf; -/* prepare a delete link message */ -static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, - struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) -{ - memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_add_link); + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + ext = (struct smc_llc_msg_add_link_v2_ext *) + &wr_buf->raw[sizeof(*addllc)]; + memset(ext, 0, SMC_WR_TX_SIZE); + } else { + struct smc_wr_buf *wr_buf; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + } + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.llc_type = SMC_LLC_ADD_LINK; if (reqresp == SMC_LLC_RESP) - delllc->hd.flags |= 
SMC_LLC_FLAG_RESP; - /* DEL_LINK_ALL because only 1 link supported */ - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; - if (orderly) - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; - delllc->link_num = link->link_id; + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } + if (ext && link_new) + len += smc_llc_fill_ext_v2(ext, link, link_new); + smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len); + /* send llc message */ + if (link->lgr->smc_version == SMC_V2) + rc = smc_wr_tx_v2_send(link, pend, len); + else + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } /* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp, orderly); + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc)); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + 
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -343,251 +737,1347 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; testllc = (struct smc_llc_msg_test_link *)wr_buf; memset(testllc, 0, sizeof(*testllc)); - testllc->hd.common.type = SMC_LLC_TEST_LINK; - testllc->hd.length = sizeof(struct smc_llc_msg_test_link); + testllc->hd.common.llc_type = SMC_LLC_TEST_LINK; + smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc)); memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); + goto put_out; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } 
-/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. + * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; } /********************************* receive ***********************************/ -static void smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) { - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} - /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else - conf_rc = 
ENOTSUPP; +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = rmb->is_vm ? 
+ cpu_to_be64((uintptr_t)rmb->cpu_addr) : + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + 
qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_save_peer_uid(qentry); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + return 0; +} + +static void smc_llc_save_add_link_rkeys(struct 
smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_v2_ext *ext; + struct smc_link_group *lgr = link->lgr; + int max, i; + + ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + SMC_WR_TX_SIZE); + max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + ext->rt[i].rmb_key, + ext->rt[i].rmb_vaddr_new, + ext->rt[i].rmb_key_new); } + mutex_unlock(&lgr->rmbs_lock); } -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) { + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; + struct smc_link *lnk_new = NULL; + int lnk_idx, rc = 0; + + if (!llc->qp_mtu) + goto out_reject; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out_reject; + } + + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid); + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + (lgr->smc_version == SMC_V2 || + 
!memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) { + if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); + + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; + + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[lnk_new->ibport - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, lnk_new); } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; + rc = smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; } + } + rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t); + if (!rc) + goto out; +out_clear_lnk: + lnk_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(lnk_new, false); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(ini); + kfree(qentry); + return rc; +} - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); +static void smc_llc_send_request_add_link(struct smc_link *link) +{ + struct smc_llc_msg_req_add_link_v2 *llc; + struct smc_wr_tx_pend_priv *pend; + struct 
smc_wr_v2_buf *wr_buf; + struct smc_gidlist gidlist; + int rc, len, i; + + if (!smc_wr_tx_link_hold(link)) + return; + if (link->lgr->type == SMC_LGR_SYMMETRIC || + link->lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto put_out; + + smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid); + if (gidlist.len <= 1) + goto put_out; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf; + memset(llc, 0, SMC_WR_TX_SIZE); + + llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK; + for (i = 0; i < gidlist.len; i++) + memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0])); + llc->gid_cnt = gidlist.len; + len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0])); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, len); + rc = smc_wr_tx_v2_send(link, pend, len); + if (!rc) + /* set REQ_ADD_LINK flow and wait for response from peer */ + link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK; +put_out: + smc_wr_tx_link_put(link); +} + +/* as an SMC client, invite server to start the add_link processing */ +static void smc_llc_cli_add_link_invite(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; + + if (lgr->smc_version == SMC_V2) { + smc_llc_send_request_add_link(link); + goto out; + } + + if (lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto out; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) + goto out; - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); + ini->vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!ini->ib_dev) + goto out; + + smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1], + ini->ib_gid, NULL, SMC_LLC_REQ); +out: + kfree(ini); + kfree(qentry); +} + +static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc) +{ + int i; + + 
for (i = 0; i < ARRAY_SIZE(llc->raw.data); i++) + if (llc->raw.data[i]) + return false; + return true; +} + +static bool smc_llc_is_local_add_link(union smc_llc_msg *llc) +{ + if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK && + smc_llc_is_empty_llc_message(llc)) + return true; + return false; +} + +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + if (smc_llc_is_local_add_link(&qentry->msg)) + smc_llc_cli_add_link_invite(qentry->link, qentry); + else + smc_llc_cli_add_link(qentry->link, qentry); + mutex_unlock(&lgr->llc_conf_mutex); +} + +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } + } + if (found) + break; + } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; } - smc_llc_send_message(link, llc, sizeof(*llc)); } +out: + return (asym_idx < 0) ? 
NULL : &lgr->lnk[asym_idx]; } -static void smc_llc_rx_delete_link(struct smc_link *link, - struct smc_llc_msg_del_link *llc) +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) { - struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); - } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + lnk_new = smc_switch_conns(lgr, lnk_asym, false); + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym, true); +} + +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = 
smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; } - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate_sched(lgr); + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry || + qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { + /* send DELETE LINK */ + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; } + smc_llc_save_peer_uid(qentry); + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; } -static void smc_llc_rx_test_link(struct smc_link *link, - struct 
smc_llc_msg_test_link *llc) +static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data)); + smc_llc_send_message(qentry->link, &qentry->msg); +} + +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry) +{ + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + bool send_req_add_link_resp = false; + struct smc_link *link_new = NULL; + struct smc_init_info *ini = NULL; + int lnk_idx, rc = 0; + + if (req_qentry && + req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK) + send_req_add_link_resp = true; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out; + } + + /* ignore client add link recommendation, start new flow */ + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + if (send_req_add_link_resp) { + struct smc_llc_msg_req_add_link_v2 *req_add = + &req_qentry->msg.req_add_link; + + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]); + } + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) { + rc = 0; + goto out; + } + + rc = 
smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); + if (rc) + goto out; + link_new = &lgr->lnk[lnk_idx]; + + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[link_new->ibport-1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + send_req_add_link_resp = false; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; + } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + (lgr->smc_version == SMC_V2 || + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, link_new); } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + } + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); + if (rc) + goto out_err; + kfree(ini); + return 0; +out_err: + if (link_new) { + link_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(link_new, false); } +out: + kfree(ini); + if (send_req_add_link_resp) + smc_llc_send_req_add_link_response(req_qentry); + return rc; } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) { + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + 
struct smc_llc_qentry *qentry; int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(smc_get_lgr(link), - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - /* ignore rtokens for other links, we have only one link */ + mutex_lock(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link, qentry); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + smc_llc_delete_asym_link(lgr); + } + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); +} - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); +/* enqueue a local add_link req to trigger a new add_link flow */ +void smc_llc_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {}; + + add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK; + smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc)); + /* no dev and port needed */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); + else + smc_llc_process_srv_add_link(lgr); +out: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void 
smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + struct smc_llc_msg_del_link del_llc = {}; + + del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc)); + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + mutex_lock(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + if (smc_link_downing(&lnk_del->state)) + smc_switch_conns(lgr, lnk_del, false); + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if 
(active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); } +out_unlock: + mutex_unlock(&lgr->llc_conf_mutex); +out: + kfree(qentry); } -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { + struct smc_llc_msg_del_link delllc = {}; + int i; + + delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc)); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_sendable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} + +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + mutex_lock(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; + } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ + + if (smc_link_downing(&lnk_del->state)) { + if (smc_switch_conns(lgr, lnk_del, false)) + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if 
(!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (qentry2) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } + } + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); + } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + smc_llc_add_link_local(lnk); + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); +} + +static void smc_llc_delete_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); + + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + if (num_entries > SMC_LLC_RKEYS_PER_MSG) + goto out_err; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + 
+ for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; u8 err_mask = 0; int i, max; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + + if (lgr->smc_version == SMC_V2) { + struct smc_llc_msg_delete_rkey_v2 *llcv2; + + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + llcv2->num_inval_rkeys = 0; + + max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + if (smc_rtoken_delete(link, llcv2->rkey[i])) + llcv2->num_inval_rkeys++; } - - if (err_mask) { + memset(&llc->rkey[0], 0, sizeof(llc->rkey)); + memset(&llc->reserved2, 0, sizeof(llc->reserved2)); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + if (llcv2->num_inval_rkeys) { llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; + llc->err_mask = llcv2->num_inval_rkeys; } + goto finish; + } - llc->hd.flags |= 
SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); } + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } +finish: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr->net->net_cookie, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) - return; /* link not active, drop msg */ +/* flush the llc event queue */ +static void smc_llc_event_flush(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; + + if (!smc_link_usable(link)) + goto out; - switch (llc->raw.hdr.common.type) { + switch (llc->raw.hdr.common.llc_type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); - break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, 
&llc->confirm_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (smc_llc_is_local_add_link(llc)) { + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_ADD_LINK) + break; /* add_link in progress */ + if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + return; + } + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up(&lgr->llc_msg_waiter); + return; + } + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_REQ_ADD_LINK) { + /* server started add_link processing */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + schedule_work(&lgr->llc_add_link_work); + return; + } + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + schedule_work(&lgr->llc_add_link_work); + } + return; + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up(&lgr->llc_msg_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: - smc_llc_rx_delete_link(link, &llc->delete_link); - break; + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up(&lgr->llc_msg_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + return; case 
SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; + case SMC_LLC_REQ_ADD_LINK: + /* handle response here, smc_llc_flow_stop() cannot be called + * in tasklet context + */ + if (lgr->role == SMC_CLNT && + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK && + (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) { + smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl); + } else if (lgr->role == SMC_SERV) { + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + schedule_work(&lgr->llc_add_link_work); + } + return; + } break; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); + break; + } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + if (smc_link_usable(qentry->link)) + smc_llc_event_handler(qentry); + else + 
kfree(qentry); + } + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type; + struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl; + u8 llc_type = qentry->msg.raw.hdr.common.llc_type; + + switch (llc_type) { + case SMC_LLC_TEST_LINK: + if (smc_link_active(link)) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_ADD_LINK: + case SMC_LLC_ADD_LINK_CONT: + case SMC_LLC_CONFIRM_LINK: + if (flowtype != SMC_LLC_FLOW_ADD_LINK || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_DELETE_LINK: + if (flowtype != SMC_LLC_FLOW_DEL_LINK || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + if (flowtype != SMC_LLC_FLOW_RKEY || flow->qentry) + break; /* drop out-of-flow response */ + goto assign; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* not used because max links is 3 */ + break; + default: + smc_llc_protocol_violation(link->lgr, + qentry->msg.raw.hdr.common.type); + break; + } + kfree(qentry); + return; +assign: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up(&link->lgr->llc_msg_waiter); +} + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + unsigned long flags; + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); 
+ memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if ((llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) && + llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + queue_work(system_highpri_wq, &lgr->llc_event_work); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (!llc->raw.hdr.common.llc_version) { + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + } else { + if (llc->raw.hdr.length_v2 < sizeof(*llc)) + return; /* invalid message */ } + + smc_llc_enqueue(link, llc); } /***************************** worker, utils *********************************/ @@ -601,7 +2091,7 @@ static void smc_llc_testlink_work(struct work_struct *work) u8 user_data[16] = { 0 }; int rc; - if (link->state != SMC_LNK_ACTIVE) + if (!smc_link_active(link)) return; /* don't reschedule worker */ expire_time = link->wr_rx_tstamp + link->llc_testlink_time; if (time_is_after_jiffies(expire_time)) { @@ -613,112 +2103,164 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (!smc_link_active(link)) + return; /* link state changed */ if (rc <= 0) { - smc_lgr_terminate(smc_get_lgr(link), true); + smcr_link_down_cond_sched(link); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + 
schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -int smc_llc_link_init(struct smc_link *link) +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; - init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); - init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); - mutex_init(&link->llc_delete_rkey_mutex); - init_completion(&link->llc_testlink_resp); - INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - return 0; + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_flow_waiter); + init_waitqueue_head(&lgr->llc_msg_waiter); + mutex_init(&lgr->llc_conf_mutex); + lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) { - link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + smc_llc_event_flush(lgr); + wake_up_all(&lgr->llc_flow_waiter); + wake_up_all(&lgr->llc_msg_waiter); + cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); + if (lgr->delayed_event) { + 
kfree(lgr->delayed_event); + lgr->delayed_event = NULL; } } -void smc_llc_link_deleting(struct smc_link *link) +int smc_llc_link_init(struct smc_link *link) { - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) +void smc_llc_link_active(struct smc_link *link) { - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + link->state = SMC_LNK_ACTIVE; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time; + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); + } } /* called in worker context */ -void smc_llc_link_clear(struct smc_link *link) +void smc_llc_link_clear(struct smc_link *link, bool log) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN net %llu link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + link->lgr->net->net_cookie, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct 
smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); if (rc) - return rc; + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); 
return rc; } +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { @@ -736,6 +2278,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { }, { .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, + { + .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK }, { @@ -750,6 +2296,35 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + /* V2 types */ + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_REQ_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY_V2 + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..7e7a3162c68b 100644 --- 
a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -19,6 +19,7 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ) enum smc_llc_reqresp { SMC_LLC_REQ, @@ -28,29 +29,92 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, + SMC_LLC_REQ_ADD_LINK = 0x05, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + /* V2 types */ + SMC_LLC_CONFIRM_LINK_V2 = 0x21, + SMC_LLC_ADD_LINK_V2 = 0x22, + SMC_LLC_DELETE_LINK_V2 = 0x24, + SMC_LLC_REQ_ADD_LINK_V2 = 0x25, + SMC_LLC_CONFIRM_RKEY_V2 = 0x26, + SMC_LLC_TEST_LINK_V2 = 0x27, + SMC_LLC_DELETE_RKEY_V2 = 0x29, }; +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, 
u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp); -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); -void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); -void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +void smc_llc_link_active(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link 
*link, + struct smc_llc_qentry *req_qentry); +void smc_llc_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c new file mode 100644 index 000000000000..621c46c70073 --- /dev/null +++ b/net/smc/smc_netlink.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to interact with SMC module + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/if.h> +#include <linux/smc.h> + +#include "smc_core.h" +#include "smc_ism.h" +#include "smc_ib.h" +#include "smc_clc.h" +#include "smc_stats.h" +#include "smc_netlink.h" + +const struct nla_policy +smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { + [SMC_NLA_EID_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [SMC_NLA_EID_TABLE_ENTRY] = { .type = NLA_STRING, + .len = SMC_MAX_EID_LEN, + }, +}; + +#define SMC_CMD_MAX_ATTR 1 +/* SMC_GENL generic netlink operation definition */ +static const struct genl_ops smc_gen_nl_ops[] = { + { + .cmd = SMC_NETLINK_GET_SYS_INFO, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_sys_info, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_LINK_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_link, + }, + { + .cmd = SMC_NETLINK_GET_LGR_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_lgr, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCD, + /* can be retrieved by unprivileged users */ + .dumpit = smcd_nl_get_device, + }, + { + .cmd = SMC_NETLINK_GET_DEV_SMCR, + /* can be retrieved by unprivileged users */ + .dumpit = smcr_nl_get_device, + }, + { + .cmd = 
SMC_NETLINK_GET_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_stats, + }, + { + .cmd = SMC_NETLINK_GET_FBACK_STATS, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_get_fback_stats, + }, + { + .cmd = SMC_NETLINK_DUMP_UEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_ueid, + }, + { + .cmd = SMC_NETLINK_ADD_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_add_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_REMOVE_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_remove_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_FLUSH_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_flush_ueid, + }, + { + .cmd = SMC_NETLINK_DUMP_SEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_seid, + }, + { + .cmd = SMC_NETLINK_ENABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_seid, + }, + { + .cmd = SMC_NETLINK_DISABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_seid, + }, + { + .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_hs_limitation, + }, + { + .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_hs_limitation, + }, + { + .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_hs_limitation, + }, +}; + +static const struct nla_policy smc_gen_nl_policy[2] = { + [SMC_CMD_MAX_ATTR] = { .type = NLA_REJECT, }, +}; + +/* SMC_GENL family definition */ +struct genl_family smc_gen_nl_family __ro_after_init = { + .hdrsize = 0, + .name = SMC_GENL_FAMILY_NAME, + .version = SMC_GENL_FAMILY_VERSION, + .maxattr = SMC_CMD_MAX_ATTR, + .policy = smc_gen_nl_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_gen_nl_ops, + .n_ops = ARRAY_SIZE(smc_gen_nl_ops), + .resv_start_op = SMC_NETLINK_DISABLE_HS_LIMITATION + 1, +}; + +int __init smc_nl_init(void) +{ + return 
genl_register_family(&smc_gen_nl_family); +} + +void smc_nl_exit(void) +{ + genl_unregister_family(&smc_gen_nl_family); +} diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h new file mode 100644 index 000000000000..e8c6c3f0e98c --- /dev/null +++ b/net/smc/smc_netlink.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC Generic netlink operations + * + * Copyright IBM Corp. 2020 + * + * Author(s): Guvenc Gulce <guvenc@linux.ibm.com> + */ + +#ifndef _SMC_NETLINK_H +#define _SMC_NETLINK_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +extern struct genl_family smc_gen_nl_family; + +extern const struct nla_policy smc_gen_ueid_policy[]; + +struct smc_nl_dmp_ctx { + int pos[3]; +}; + +static inline struct smc_nl_dmp_ctx *smc_nl_dmp_ctx(struct netlink_callback *c) +{ + return (struct smc_nl_dmp_ctx *)c->ctx; +} + +int smc_nl_init(void) __init; +void smc_nl_exit(void); + +#endif diff --git a/net/smc/smc_netns.h b/net/smc/smc_netns.h index e7a8fc4ae02f..0f4f35aa43ad 100644 --- a/net/smc/smc_netns.h +++ b/net/smc/smc_netns.h @@ -16,5 +16,6 @@ extern unsigned int smc_net_id; /* per-network namespace private data */ struct smc_net { struct smc_pnettable pnettable; + struct smc_pnetids_ndev pnetids_ndev; }; #endif diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..25fb2fd186e2 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/list.h> #include <linux/ctype.h> +#include <linux/mutex.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -28,11 +29,10 @@ #include "smc_ism.h" #include "smc_core.h" -#define SMC_ASCII_BLANK 32 - +static struct net_device *__pnet_find_base_ndev(struct net_device *ndev); static struct net_device *pnet_find_base_ndev(struct net_device *ndev); -static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { +static const struct nla_policy 
smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN @@ -50,39 +50,45 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + netdevice_tracker dev_tracker; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; +/* Check if the pnetid is set */ +bool smc_pnet_is_pnetid_set(u8 *pnetid) +{ + if (pnetid[0] == 0 || pnetid[0] == _S) + return false; + return true; +} + /* Check if two given pnetids match */ static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2) { int i; for (i = 0; i < SMC_MAX_PNETID_LEN; i++) { - if ((pnetid1[i] == 0 || pnetid1[i] == SMC_ASCII_BLANK) && - (pnetid2[i] == 0 || pnetid2[i] == SMC_ASCII_BLANK)) + if ((pnetid1[i] == 0 || pnetid1[i] == _S) && + (pnetid2[i] == 0 || pnetid2[i] == _S)) break; if (pnetid1[i] != pnetid2[i]) return false; @@ -106,32 +112,46 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ - write_lock(&pnettable->lock); + /* remove table entry */ + 
mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { + netdev_put(pnetelem->ndev, + &pnetelem->dev_tracker); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } kfree(pnetelem); rc = 0; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); /* if this is not the initial namespace, stop here */ if (net != &init_net) return rc; /* remove ib devices */ - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { if (ibdev->pnetid_by_user[ibport] && (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; @@ -139,25 +159,29 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) } } } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (smcd_dev->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = false; rc = 0; } } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); return rc; } -/* Remove a pnet entry 
mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. */ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -169,94 +193,87 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - write_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + netdev_hold(ndev, &pnetelem->dev_tracker, GFP_ATOMIC); + pnetelem->ndev = ndev; rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } - write_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. 
*/ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + mutex_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + netdev_put(pnetelem->ndev, &pnetelem->dev_tracker); + pnetelem->ndev = NULL; + rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + break; } - spin_unlock(&smcd_dev_list.lock); } + mutex_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 
0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; + mutex_lock(&smc_ib_devices.mutex); + if (!smc_pnet_is_pnetid_set(ib_dev->pnetid[ib_port - 1])) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + mutex_unlock(&smc_ib_devices.mutex); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; + mutex_lock(&smcd_dev_list.mutex); + if (!smc_pnet_is_pnetid_set(smcd_dev->pnetid)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); - } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + mutex_unlock(&smcd_dev_list.mutex); + return applied; } /* The limit for pnetid is 16 characters. 
@@ -291,18 +308,19 @@ static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (!strncmp(ibdev->ibdev->name, ib_name, sizeof(ibdev->ibdev->name)) || - !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, - IB_DEVICE_NAME_MAX - 1)) { + (ibdev->ibdev->dev.parent && + !strncmp(dev_name(ibdev->ibdev->dev.parent), ib_name, + IB_DEVICE_NAME_MAX - 1))) { goto out; } } ibdev = NULL; out: - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); return ibdev; } @@ -311,7 +329,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) { struct smcd_dev *smcd_dev; - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) @@ -319,61 +337,190 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) } smcd_dev = NULL; out: - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. 
If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + rc = -EEXIST; + new_netdev = true; + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + if (ndev) { + new_pe->ndev = ndev; + netdev_tracker_alloc(ndev, &new_pe->dev_tracker, + GFP_ATOMIC); + } + list_add_tail(&new_pe->list, &pnettable->pnetlist); + mutex_unlock(&pnettable->lock); + } else { + mutex_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); + return 0; + +out_put: + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) { + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", ib_dev->ibdev->name, + ib_port, + 
ib_dev->pnetid[ib_port - 1]); + } + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) { + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + if (smcddev_applied) + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); + } + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + mutex_unlock(&pnettable->lock); + } else { + mutex_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. 
*/ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 
0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; @@ -381,28 +528,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || 
nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +556,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +578,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +597,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ - read_lock(&pnettable->lock); + /* dump pnettable entries */ + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type 
!= SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } - read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - !smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - + mutex_unlock(&pnettable->lock); return idx; } @@ -646,18 +715,140 @@ static struct genl_family smc_pnet_nl_family __ro_after_init = { .netnsok = true, .module = THIS_MODULE, .ops = smc_pnet_ops, - .n_ops = ARRAY_SIZE(smc_pnet_ops) + .n_ops = ARRAY_SIZE(smc_pnet_ops), + .resv_start_op = SMC_PNETID_FLUSH + 1, }; +bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe; + bool rc = false; + + 
read_lock(&sn->pnetids_ndev.lock); + list_for_each_entry(pe, &sn->pnetids_ndev.list, list) { + if (smc_pnet_match(pnetid, pe->pnetid)) { + rc = true; + goto unlock; + } + } + +unlock: + read_unlock(&sn->pnetids_ndev.lock); + return rc; +} + +/* add a pnetid to the list of netdevice pnetids, or take another + * reference on the entry if the pnetid is already listed + */ +static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *pi; + + pe = kzalloc(sizeof(*pe), GFP_KERNEL); + if (!pe) + return -ENOMEM; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { + /* compare with the listed entry pi; pe is still zeroed here */ + if (smc_pnet_match(pnetid, pi->pnetid)) { + refcount_inc(&pi->refcnt); + kfree(pe); + goto unlock; + } + } + refcount_set(&pe->refcnt, 1); + memcpy(pe->pnetid, pnetid, SMC_MAX_PNETID_LEN); + list_add_tail(&pe->list, &sn->pnetids_ndev.list); + +unlock: + write_unlock(&sn->pnetids_ndev.lock); + return 0; +} + +static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *pe2; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry_safe(pe, pe2, &sn->pnetids_ndev.list, list) { + if (smc_pnet_match(pnetid, pe->pnetid)) { + if (refcount_dec_and_test(&pe->refcnt)) { + list_del(&pe->list); + kfree(pe); + } + break; + } + } + write_unlock(&sn->pnetids_ndev.lock); +} + +static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev, + u8 *ndev_pnetid) +{ + struct net_device *base_dev; + + base_dev = __pnet_find_base_ndev(dev); + if (base_dev->flags & IFF_UP && + !smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port, + ndev_pnetid)) { + /* add to PNETIDs list */ + smc_pnet_add_pnetid(net, ndev_pnetid); + } +} + +/* create initial list of netdevice pnetids */ +static void smc_pnet_create_pnetids_list(struct net *net) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *dev; + + rtnl_lock(); + for_each_netdev(net, dev) + smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); +
rtnl_unlock(); +} + +/* clean up list of netdevice pnetids */ +static void smc_pnet_destroy_pnetids_list(struct net *net) +{ + struct smc_net *sn = net_generic(net, smc_net_id); + struct smc_pnetids_ndev_entry *pe, *temp_pe; + + write_lock(&sn->pnetids_ndev.lock); + list_for_each_entry_safe(pe, temp_pe, &sn->pnetids_ndev.list, list) { + list_del(&pe->list); + kfree(pe); + } + write_unlock(&sn->pnetids_ndev.lock); +} + static int smc_pnet_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(event_dev); + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; switch (event) { case NETDEV_REBOOT: case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); + return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + smc_ib_ndev_change(event_dev, event); + return NOTIFY_OK; + case NETDEV_UP: + smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid); + return NOTIFY_OK; + case NETDEV_DOWN: + event_dev = __pnet_find_base_ndev(event_dev); + if (!smc_pnetid_by_dev_port(event_dev->dev.parent, + event_dev->dev_port, ndev_pnetid)) { + /* remove from PNETIDs list */ + smc_pnet_remove_pnetid(net, ndev_pnetid); + } return NOTIFY_OK; default: return NOTIFY_DONE; @@ -673,9 +864,17 @@ int smc_pnet_net_init(struct net *net) { struct smc_net *sn = net_generic(net, smc_net_id); struct smc_pnettable *pnettable = &sn->pnettable; + struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev; INIT_LIST_HEAD(&pnettable->pnetlist); - rwlock_init(&pnettable->lock); + mutex_init(&pnettable->lock); + INIT_LIST_HEAD(&pnetids_ndev->list); + rwlock_init(&pnetids_ndev->lock); + + smc_pnet_create_pnetids_list(net); + + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; return 0; } @@ -690,6 +889,7 @@ int __init smc_pnet_init(void) rc = register_netdevice_notifier(&smc_netdev_notifier); if (rc) 
genl_unregister_family(&smc_pnet_nl_family); + return rc; } @@ -698,6 +898,7 @@ void smc_pnet_net_exit(struct net *net) { /* flush pnet table */ smc_pnet_remove_by_pnetid(net, NULL); + smc_pnet_destroy_pnetids_list(net); } void smc_pnet_exit(void) @@ -706,16 +907,11 @@ void smc_pnet_exit(void) genl_unregister_family(&smc_pnet_nl_family); } -/* Determine one base device for stacked net devices. - * If the lower device level contains more than one devices - * (for instance with bonding slaves), just the first device - * is used to reach a base device. - */ -static struct net_device *pnet_find_base_ndev(struct net_device *ndev) +static struct net_device *__pnet_find_base_ndev(struct net_device *ndev) { int i, nest_lvl; - rtnl_lock(); + ASSERT_RTNL(); nest_lvl = ndev->lower_level; for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; @@ -725,6 +921,18 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev) lower = lower->next; ndev = netdev_lower_get_next(ndev, &lower); } + return ndev; +} + +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. 
+ */ +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) +{ + rtnl_lock(); + ndev = __pnet_find_base_ndev(ndev); rtnl_unlock(); return ndev; } @@ -742,32 +950,96 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - read_lock(&pnettable->lock); + mutex_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; break; } } - read_unlock(&pnettable->lock); + mutex_unlock(&pnettable->lock); return rc; } +static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, + struct smc_init_info *ini) +{ + if (!ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, + NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + return 0; + } + if (ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, + NULL, &ini->smcrv2)) { + ini->smcrv2.ib_dev_v2 = ibdev; + ini->smcrv2.ib_port_v2 = i; + return 0; + } + return -ENODEV; +} + +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini, + struct smc_ib_device *known_dev, + struct net *net) +{ + struct smc_ib_device *ibdev; + int i; + + mutex_lock(&smc_ib_devices.mutex); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev || + !rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + goto out; + } + } + } +out: + mutex_unlock(&smc_ib_devices.mutex); +} + +/* find 
alternate roce device with same pnet_id, vlan_id and net namespace */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + struct net *net = lgr->net; + + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, net); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ static void smc_pnet_find_rdma_dev(struct net_device *netdev, struct smc_init_info *ini) { + struct net *net = dev_net(netdev); struct smc_ib_device *ibdev; - spin_lock(&smc_ib_devices.lock); + mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { struct net_device *ndev; int i; + /* check rdma net namespace */ + if (!rdma_dev_access_netns(ibdev->ibdev, net)) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; @@ -779,16 +1051,13 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - break; + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + break; } } } - spin_unlock(&smc_ib_devices.lock); + mutex_unlock(&smc_ib_devices.mutex); } /* Determine the corresponding IB device port based on the hardware PNETID. 
@@ -801,35 +1070,17 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; + struct net *net; ndev = pnet_find_base_ndev(ndev); + net = dev_net(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -844,15 +1095,18 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) return; /* pnetid could not be determined */ - spin_lock(&smcd_dev_list.lock); + mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && - !ismdev->going_away) { - ini->ism_dev = ismdev; + !ismdev->going_away && + (!ini->ism_peer_gid[0] || + !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id, + ismdev))) { + ini->ism_dev[0] = ismdev; break; } } - spin_unlock(&smcd_dev_list.lock); + mutex_unlock(&smcd_dev_list.mutex); } /* PNET table analysis for a given sock: @@ -863,8 +1117,6 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - ini->ib_dev = NULL; - 
ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) @@ -882,7 +1134,7 @@ void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - ini->ism_dev = NULL; + ini->ism_dev[0] = NULL; if (!dst) goto out; if (!dst->dev) @@ -895,3 +1147,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + mutex_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. 
+ */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + mutex_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + mutex_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..80a88eea4949 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -12,6 +12,8 @@ #ifndef _SMC_PNET_H #define _SMC_PNET_H +#include <net/smc.h> + #if IS_ENABLED(CONFIG_HAVE_PNETID) #include <asm/pnet.h> #endif @@ -19,6 +21,7 @@ struct smc_ib_device; struct smcd_dev; struct smc_init_info; +struct smc_link_group; /** * struct smc_pnettable - SMC PNET table anchor @@ -26,10 +29,21 @@ struct smc_init_info; * @pnetlist: List of PNETIDs */ struct smc_pnettable { - rwlock_t lock; + struct mutex lock; struct list_head pnetlist; }; +struct smc_pnetids_ndev { /* list of pnetids for net devices in UP state*/ + struct list_head list; + rwlock_t lock; +}; + +struct smc_pnetids_ndev_entry { + struct list_head list; + u8 pnetid[SMC_MAX_PNETID_LEN]; + refcount_t refcnt; +}; + static inline int smc_pnetid_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid) { @@ -46,5 +60,11 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); - +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); +void 
smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); +bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid); +bool smc_pnet_is_pnetid_set(u8 *pnetid); #endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 39d7b34d06d2..17c5aee7ee4f 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -21,6 +21,8 @@ #include "smc_cdc.h" #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" /* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). @@ -129,16 +131,8 @@ out: sock_put(sk); } -static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - static const struct pipe_buf_operations smc_pipe_ops = { - .confirm = generic_pipe_buf_confirm, .release = smc_rx_pipe_buf_release, - .steal = smc_rx_pipe_buf_nosteal, .get = generic_pipe_buf_get }; @@ -151,35 +145,93 @@ static void smc_rx_spd_release(struct splice_pipe_desc *spd, static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, struct smc_sock *smc) { + struct smc_link_group *lgr = smc->conn.lgr; + int offset = offset_in_page(src); + struct partial_page *partial; struct splice_pipe_desc spd; - struct partial_page partial; - struct smc_spd_priv *priv; - int bytes; + struct smc_spd_priv **priv; + struct page **pages; + int bytes, nr_pages; + int i; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? 
+ PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out; + partial = kcalloc(nr_pages, sizeof(*partial), GFP_KERNEL); + if (!partial) + goto out_page; + priv = kcalloc(nr_pages, sizeof(*priv), GFP_KERNEL); if (!priv) - return -ENOMEM; - priv->len = len; - priv->smc = smc; - partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; - partial.len = len; - partial.private = (unsigned long)priv; - - spd.nr_pages_max = 1; - spd.nr_pages = 1; - spd.pages = &smc->conn.rmb_desc->pages; - spd.partial = &partial; + goto out_part; + for (i = 0; i < nr_pages; i++) { + priv[i] = kzalloc(sizeof(**priv), GFP_KERNEL); + if (!priv[i]) + goto out_priv; + } + + if (lgr->is_smcd || + (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { + /* smcd or smcr that uses physically contiguous RMBs */ + priv[0]->len = len; + priv[0]->smc = smc; + partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial[0].len = len; + partial[0].private = (unsigned long)priv[0]; + pages[0] = smc->conn.rmb_desc->pages; + } else { + int size, left = len; + void *buf = src; + /* smcr that uses virtually contiguous RMBs*/ + for (i = 0; i < nr_pages; i++) { + size = min_t(int, PAGE_SIZE - offset, left); + priv[i]->len = size; + priv[i]->smc = smc; + pages[i] = vmalloc_to_page(buf); + partial[i].offset = offset; + partial[i].len = size; + partial[i].private = (unsigned long)priv[i]; + buf += size / sizeof(*buf); + left -= size; + offset = 0; + } + } + spd.nr_pages_max = nr_pages; + spd.nr_pages = nr_pages; + spd.pages = pages; + spd.partial = partial; spd.ops = &smc_pipe_ops; spd.spd_release = smc_rx_spd_release; bytes = splice_to_pipe(pipe, &spd); if (bytes > 0) { sock_hold(&smc->sk); - get_page(smc->conn.rmb_desc->pages); + if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { + for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) + get_page(pages[i]); + } else { + get_page(smc->conn.rmb_desc->pages); + } atomic_add(bytes, 
&smc->conn.splice_pending); } + kfree(priv); + kfree(partial); + kfree(pages); return bytes; + +out_priv: + for (i = (i - 1); i >= 0; i--) + kfree(priv[i]); + kfree(priv); +out_part: + kfree(partial); +out_page: + kfree(pages); +out: + return -ENOMEM; } static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) @@ -235,6 +287,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, conn->urg_state == SMC_URG_READ) return -EINVAL; + SMC_STAT_INC(smc, urg_data_cnt); if (conn->urg_state == SMC_URG_VALID) { if (!(flags & MSG_PEEK)) smc->conn.urg_state = SMC_URG_READ; @@ -311,6 +364,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + readable = atomic_read(&conn->bytes_to_rcv); + if (readable >= conn->rmb_desc->len) + SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); + + if (len < readable) + SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; @@ -354,12 +413,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } break; } + if (!timeo) + return -EAGAIN; if (signal_pending(current)) { read_done = sock_intr_errno(timeo); break; } - if (!timeo) - return -EAGAIN; } if (!smc_rx_data_available(conn)) { @@ -412,7 +471,6 @@ copy: if (rc < 0) { if (!read_done) read_done = -EFAULT; - smc_rmb_sync_sg_for_device(conn); goto out; } } @@ -426,7 +484,6 @@ copy: chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in recv ring buffer */ } - smc_rmb_sync_sg_for_device(conn); /* update cursors */ if (!(flags & MSG_PEEK)) { @@ -438,6 +495,8 @@ copy: if (msg && smc_rx_update_consumer(smc, cons, copylen)) goto out; } + + trace_smc_rx_recvmsg(smc, copylen); } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c new file mode 100644 index 
000000000000..e80e34f7ac15 --- /dev/null +++ b/net/smc/smc_stats.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * SMC statistics netlink routines + * + * Copyright IBM Corp. 2021 + * + * Author(s): Guvenc Gulce + */ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> +#include <net/genetlink.h> +#include <net/sock.h> +#include "smc_netlink.h" +#include "smc_stats.h" + +int smc_stats_init(struct net *net) +{ + net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); + if (!net->smc.fback_rsn) + goto err_fback; + net->smc.smc_stats = alloc_percpu(struct smc_stats); + if (!net->smc.smc_stats) + goto err_stats; + mutex_init(&net->smc.mutex_fback_rsn); + return 0; + +err_stats: + kfree(net->smc.fback_rsn); +err_fback: + return -ENOMEM; +} + +void smc_stats_exit(struct net *net) +{ + kfree(net->smc.fback_rsn); + if (net->smc.smc_stats) + free_percpu(net->smc.smc_stats); +} + +static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_rmbcnt *stats_rmb_cnt; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TX_RMB_STATS) + stats_rmb_cnt = &stats->smc[tech].rmb_tx; + else + stats_rmb_cnt = &stats->smc[tech].rmb_rx; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, + stats_rmb_cnt->reuse_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, + stats_rmb_cnt->buf_size_small_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, + stats_rmb_cnt->buf_size_small_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, + stats_rmb_cnt->buf_full_peer_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; 
+ if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, + stats_rmb_cnt->buf_full_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_ALLOC_CNT, + stats_rmb_cnt->alloc_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, + stats_rmb_cnt->dgrade_cnt, + SMC_NLA_STATS_RMB_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, + struct smc_stats *stats, int tech, + int type) +{ + struct smc_stats_memsize *stats_pload; + struct nlattr *attrs; + + if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) + stats_pload = &stats->smc[tech].tx_pd; + else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) + stats_pload = &stats->smc[tech].rx_pd; + else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) + stats_pload = &stats->smc[tech].tx_rmbsize; + else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) + stats_pload = &stats->smc[tech].rx_rmbsize; + else + goto errout; + + attrs = nla_nest_start(skb, type); + if (!attrs) + goto errout; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, + stats_pload->buf[SMC_BUF_8K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, + stats_pload->buf[SMC_BUF_16K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, + stats_pload->buf[SMC_BUF_32K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, + stats_pload->buf[SMC_BUF_64K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, + stats_pload->buf[SMC_BUF_128K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, + stats_pload->buf[SMC_BUF_256K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, + 
stats_pload->buf[SMC_BUF_512K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, + stats_pload->buf[SMC_BUF_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, + stats_pload->buf[SMC_BUF_G_1024K], + SMC_NLA_STATS_PLOAD_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, + struct smc_stats *stats, int tech) +{ + struct smc_stats_tech *smc_tech; + struct nlattr *attrs; + + smc_tech = &stats->smc[tech]; + if (tech == SMC_TYPE_D) + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); + else + attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); + + if (!attrs) + goto errout; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_rmb_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_STATS)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RXPLOAD_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_TX_RMB_SIZE)) + goto errattr; + if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, + SMC_NLA_STATS_T_RX_RMB_SIZE)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, + smc_tech->clnt_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, + smc_tech->clnt_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, + smc_tech->srv_v1_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, + smc_tech->srv_v2_succ_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, 
SMC_NLA_STATS_T_RX_BYTES, + smc_tech->rx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, + smc_tech->tx_bytes, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, + smc_tech->rx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, + smc_tech->tx_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, + smc_tech->sendpage_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, + smc_tech->cork_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, + smc_tech->ndly_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, + smc_tech->splice_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, + smc_tech->urg_data_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + return 0; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + return -EMSGSIZE; +} + +int smc_nl_get_stats(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + struct smc_stats *stats; + struct nlattr *attrs; + int cpu, i, size; + void *nlh; + u64 *src; + u64 *sum; + + if (cb_ctx->pos[0]) + goto errmsg; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_STATS); + if (!nlh) + goto errmsg; + + attrs = nla_nest_start(skb, SMC_GEN_STATS); + if (!attrs) + goto errnest; + stats = kzalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + goto erralloc; + size = sizeof(*stats) / sizeof(u64); + for_each_possible_cpu(cpu) { + src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); + sum = (u64 *)stats; + for (i = 0; i < size; i++) + *(sum++) += *(src++); + } + if 
(smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) + goto errattr; + if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, + stats->clnt_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, + stats->srv_hshake_err_cnt, + SMC_NLA_STATS_PAD)) + goto errattr; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + cb_ctx->pos[0] = 1; + kfree(stats); + return skb->len; + +errattr: + kfree(stats); +erralloc: + nla_nest_cancel(skb, attrs); +errnest: + genlmsg_cancel(skb, nlh); +errmsg: + return skb->len; +} + +static int smc_nl_get_fback_details(struct sk_buff *skb, + struct netlink_callback *cb, int pos, + bool is_srv) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int cnt_reported = cb_ctx->pos[2]; + struct smc_stats_fback *trgt_arr; + struct nlattr *attrs; + int rc = 0; + void *nlh; + + if (is_srv) + trgt_arr = &net->smc.fback_rsn->srv[0]; + else + trgt_arr = &net->smc.fback_rsn->clnt[0]; + if (!trgt_arr[pos].fback_code) + return -ENODATA; + nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_GET_FBACK_STATS); + if (!nlh) + goto errmsg; + attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); + if (!attrs) + goto errout; + if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) + goto errattr; + if (!cnt_reported) { + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, + net->smc.fback_rsn->srv_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, + net->smc.fback_rsn->clnt_fback_cnt, + SMC_NLA_FBACK_STATS_PAD)) + goto errattr; + cnt_reported = 1; + } + + if (nla_put_u32(skb, SMC_NLA_FBACK_STATS_RSN_CODE, + trgt_arr[pos].fback_code)) + goto errattr; + if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, + trgt_arr[pos].count)) + goto errattr; + + 
cb_ctx->pos[2] = cnt_reported; + nla_nest_end(skb, attrs); + genlmsg_end(skb, nlh); + return rc; + +errattr: + nla_nest_cancel(skb, attrs); +errout: + genlmsg_cancel(skb, nlh); +errmsg: + return -EMSGSIZE; +} + +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + struct net *net = sock_net(skb->sk); + int rc_srv = 0, rc_clnt = 0, k; + int skip_serv = cb_ctx->pos[1]; + int snum = cb_ctx->pos[0]; + bool is_srv = true; + + mutex_lock(&net->smc.mutex_fback_rsn); + for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { + if (k < snum) + continue; + if (!skip_serv) { + rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); + if (rc_srv && rc_srv != -ENODATA) + break; + } else { + skip_serv = 0; + } + rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); + if (rc_clnt && rc_clnt != -ENODATA) { + skip_serv = 1; + break; + } + if (rc_clnt == -ENODATA && rc_srv == -ENODATA) + break; + } + mutex_unlock(&net->smc.mutex_fback_rsn); + cb_ctx->pos[1] = skip_serv; + cb_ctx->pos[0] = k; + return skb->len; +} diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h new file mode 100644 index 000000000000..84b7ecd8c05c --- /dev/null +++ b/net/smc/smc_stats.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Macros for SMC statistics + * + * Copyright IBM Corp. 
2021 + * + * Author(s): Guvenc Gulce + */ + +#ifndef NET_SMC_SMC_STATS_H_ +#define NET_SMC_SMC_STATS_H_ +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/ctype.h> +#include <linux/smc.h> + +#include "smc_clc.h" + +#define SMC_MAX_FBACK_RSN_CNT 30 + +enum { + SMC_BUF_8K, + SMC_BUF_16K, + SMC_BUF_32K, + SMC_BUF_64K, + SMC_BUF_128K, + SMC_BUF_256K, + SMC_BUF_512K, + SMC_BUF_1024K, + SMC_BUF_G_1024K, + SMC_BUF_MAX, +}; + +struct smc_stats_fback { + int fback_code; + u16 count; +}; + +struct smc_stats_rsn { + struct smc_stats_fback srv[SMC_MAX_FBACK_RSN_CNT]; + struct smc_stats_fback clnt[SMC_MAX_FBACK_RSN_CNT]; + u64 srv_fback_cnt; + u64 clnt_fback_cnt; +}; + +struct smc_stats_rmbcnt { + u64 buf_size_small_peer_cnt; + u64 buf_size_small_cnt; + u64 buf_full_peer_cnt; + u64 buf_full_cnt; + u64 reuse_cnt; + u64 alloc_cnt; + u64 dgrade_cnt; +}; + +struct smc_stats_memsize { + u64 buf[SMC_BUF_MAX]; +}; + +struct smc_stats_tech { + struct smc_stats_memsize tx_rmbsize; + struct smc_stats_memsize rx_rmbsize; + struct smc_stats_memsize tx_pd; + struct smc_stats_memsize rx_pd; + struct smc_stats_rmbcnt rmb_tx; + struct smc_stats_rmbcnt rmb_rx; + u64 clnt_v1_succ_cnt; + u64 clnt_v2_succ_cnt; + u64 srv_v1_succ_cnt; + u64 srv_v2_succ_cnt; + u64 sendpage_cnt; + u64 urg_data_cnt; + u64 splice_cnt; + u64 cork_cnt; + u64 ndly_cnt; + u64 rx_bytes; + u64 tx_bytes; + u64 rx_cnt; + u64 tx_cnt; +}; + +struct smc_stats { + struct smc_stats_tech smc[2]; + u64 clnt_hshake_err_cnt; + u64 srv_hshake_err_cnt; +}; + +#define SMC_STAT_PAYLOAD_SUB(_smc_stats, _tech, key, _len, _rc) \ +do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_tech) t = (_tech); \ + typeof(_len) l = (_len); \ + int _pos = fls64((l) >> 13); \ + typeof(_rc) r = (_rc); \ + int m = SMC_BUF_MAX - 1; \ + this_cpu_inc((*stats).smc[t].key ## _cnt); \ + if (r <= 0) \ + break; \ + _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? 
_pos - 1 : _pos) : m; \ + this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ + this_cpu_add((*stats).smc[t].key ## _bytes, r); \ +} \ +while (0) + +#define SMC_STAT_TX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, tx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, tx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RX_PAYLOAD(_smc, length, rcode) \ +do { \ + typeof(_smc) __smc = _smc; \ + struct net *_net = sock_net(&__smc->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(length) _len = (length); \ + typeof(rcode) _rc = (rcode); \ + bool is_smcd = !__smc->conn.lnk; \ + if (is_smcd) \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_D, rx, _len, _rc); \ + else \ + SMC_STAT_PAYLOAD_SUB(_smc_stats, SMC_TYPE_R, rx, _len, _rc); \ +} \ +while (0) + +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +do { \ + typeof(_len) _l = (_len); \ + typeof(_tech) t = (_tech); \ + int _pos = fls((_l) >> 13); \ + int m = SMC_BUF_MAX - 1; \ + _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? 
_pos - 1 : _pos) : m; \ + this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ +} \ +while (0) + +#define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ + this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) + +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +do { \ + struct net *_net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + typeof(_len) l = (_len); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ +} \ +while (0) + +#define SMC_STAT_RMB(_smc, type, _is_smcd, _is_rx) \ +do { \ + struct net *net = sock_net(&(_smc)->sk); \ + struct smc_stats __percpu *_smc_stats = net->smc.smc_stats; \ + typeof(_is_smcd) is_d = (_is_smcd); \ + typeof(_is_rx) is_r = (_is_rx); \ + if ((is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, rx); \ + if ((is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_D, tx); \ + if (!(is_d) && (is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, rx); \ + if (!(is_d) && !(is_r)) \ + SMC_STAT_RMB_SUB(_smc_stats, type, SMC_TYPE_R, tx); \ +} \ +while (0) + +#define SMC_STAT_BUF_REUSE(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, reuse, is_smcd, is_rx) + +#define SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, alloc, is_smcd, is_rx) + +#define SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rx) \ + SMC_STAT_RMB(smc, dgrade, is_smcd, is_rx) + +#define SMC_STAT_RMB_TX_PEER_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, false) + +#define 
SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small_peer, is_smcd, false) + +#define SMC_STAT_RMB_TX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, false) + +#define SMC_STAT_RMB_RX_SIZE_SMALL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_size_small, is_smcd, true) + +#define SMC_STAT_RMB_RX_FULL(smc, is_smcd) \ + SMC_STAT_RMB(smc, buf_full, is_smcd, true) + +#define SMC_STAT_INC(_smc, type) \ +do { \ + typeof(_smc) __smc = _smc; \ + bool is_smcd = !(__smc)->conn.lnk; \ + struct net *net = sock_net(&(__smc)->sk); \ + struct smc_stats __percpu *smc_stats = net->smc.smc_stats; \ + if ((is_smcd)) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].type); \ + else \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].type); \ +} \ +while (0) + +#define SMC_STAT_CLNT_SUCC_INC(net, _aclc) \ +do { \ + typeof(_aclc) acl = (_aclc); \ + bool is_v2 = (acl->hdr.version == SMC_V2); \ + bool is_smcd = (acl->hdr.typev1 == SMC_TYPE_D); \ + struct smc_stats __percpu *smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].clnt_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].clnt_v1_succ_cnt); \ +} \ +while (0) + +#define SMC_STAT_SERV_SUCC_INC(net, _ini) \ +do { \ + typeof(_ini) i = (_ini); \ + bool is_v2 = (i->smcd_version & SMC_V2); \ + bool is_smcd = (i->is_smcd); \ + typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ + if (is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ + else if (is_v2 && !is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v2_succ_cnt); \ + else if (!is_v2 && is_smcd) \ + this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v1_succ_cnt); \ + else if (!is_v2 && !is_smcd) \ + 
this_cpu_inc(smc_stats->smc[SMC_TYPE_R].srv_v1_succ_cnt); \ +} \ +while (0) + +int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb); +int smc_stats_init(struct net *net); +void smc_stats_exit(struct net *net); + +#endif /* NET_SMC_SMC_STATS_H_ */ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..b6f79fabb9d3 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/init.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_llc.h" +#include "smc_sysctl.h" + +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; + +static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { + .procname = "smcr_buf_type", + .data = &init_net.smc.sysctl_smcr_buf_type, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "smcr_testlink_time", + .data = &init_net.smc.sysctl_smcr_testlink_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = 
&min_rcvbuf, + }, + { } +}; + +int __net_init smc_sysctl_net_init(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +void __net_exit smc_sysctl_net_exit(struct net *net) +{ + struct ctl_table *table; + + table = net->smc.smc_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->smc.smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..0becc11bd2f4 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.h: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); + +#else + +static inline int smc_sysctl_net_init(struct net *net) +{ + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; +} + +static inline void smc_sysctl_net_exit(struct net *net) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c new file mode 100644 index 000000000000..8d47ced5a492 --- /dev/null +++ b/net/smc/smc_tracepoint.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define CREATE_TRACE_POINTS +#include "smc_tracepoint.h" + +EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback); +EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg); +EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg); +EXPORT_TRACEPOINT_SYMBOL(smcr_link_down); diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h new file mode 100644 index 000000000000..9fc5e586d24a --- /dev/null +++ b/net/smc/smc_tracepoint.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM smc + +#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SMC_H + +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/tracepoint.h> +#include <net/ipv6.h> +#include "smc.h" +#include "smc_core.h" + +TRACE_EVENT(smc_switch_to_fallback, + + TP_PROTO(const struct smc_sock *smc, int fallback_rsn), + + TP_ARGS(smc, fallback_rsn), + + TP_STRUCT__entry( + __field(const void *, sk) + __field(const void *, clcsk) + __field(u64, net_cookie) + __field(int, fallback_rsn) + ), + + TP_fast_assign( + const struct sock *sk = &smc->sk; + const struct sock *clcsk = smc->clcsock->sk; + + __entry->sk = sk; + __entry->clcsk = clcsk; + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->fallback_rsn = 
fallback_rsn; + ), + + TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", + __entry->sk, __entry->clcsk, + __entry->net_cookie, __entry->fallback_rsn) +); + +DECLARE_EVENT_CLASS(smc_msg_event, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len), + + TP_STRUCT__entry( + __field(const void *, smc) + __field(u64, net_cookie) + __field(size_t, len) + __string(name, smc->conn.lnk->ibname) + ), + + TP_fast_assign( + const struct sock *sk = &smc->sk; + + __entry->smc = smc; + __entry->net_cookie = sock_net(sk)->net_cookie; + __entry->len = len; + __assign_str(name, smc->conn.lnk->ibname); + ), + + TP_printk("smc=%p net=%llu len=%zu dev=%s", + __entry->smc, __entry->net_cookie, + __entry->len, __get_str(name)) +); + +DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +TRACE_EVENT(smcr_link_down, + + TP_PROTO(const struct smc_link *lnk, void *location), + + TP_ARGS(lnk, location), + + TP_STRUCT__entry( + __field(const void *, lnk) + __field(const void *, lgr) + __field(u64, net_cookie) + __field(int, state) + __string(name, lnk->ibname) + __field(void *, location) + ), + + TP_fast_assign( + const struct smc_link_group *lgr = lnk->lgr; + + __entry->lnk = lnk; + __entry->lgr = lgr; + __entry->net_cookie = lgr->net->net_cookie; + __entry->state = lnk->state; + __assign_str(name, lnk->ibname); + __entry->location = location; + ), + + TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", + __entry->lnk, __entry->lgr, __entry->net_cookie, + __entry->state, __get_str(name), + __entry->location) +); + +#endif /* _TRACE_SMC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE smc_tracepoint + +#include <trace/define_trace.h> diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 0d42e7716b91..64dedffe9d26 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -27,9 +27,10 @@ #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" +#include "smc_stats.h" +#include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 -#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk) /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) + SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); @@ -128,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc) return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload into one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. 
Don't cork too much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, + sock_net(&smc->sk)->smc.sysctl_autocorking_size); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should know how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. 
*/ @@ -151,9 +199,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) goto out_err; } + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + + if (len > conn->sndbuf_desc->len) + SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); + + if (len > conn->peer_rmbe_size) + SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); + + if (msg->msg_flags & MSG_OOB) + SMC_STAT_INC(smc, urg_data_cnt); + while (msg_data_left(msg)) { - if (sk->sk_state == SMC_INIT) - return -ENOTCONN; if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) @@ -188,7 +246,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; - smc_sndbuf_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); @@ -222,16 +279,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && - (atomic_read(&conn->sndbuf_space) > - (conn->sndbuf_desc->len >> 1))) - /* for a corked socket defer the RDMA writes if there - * is still sufficient sndbuf_space available - */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_CORK_DELAY); - else + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion + */ + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); + + trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ return send_done; @@ -244,21 +298,33 @@ out_err: return rc; } +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags) +{ + struct msghdr msg = {.msg_flags = flags}; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); + rc = 
smc_tx_sendmsg(smc, &msg, size); + kunmap(page); + return rc; +} + /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal) { - struct smc_ism_position pos; int rc; - memset(&pos, 0, sizeof(pos)); - pos.token = conn->peer_token; - pos.index = conn->peer_rmbe_idx; - pos.offset = conn->tx_off + offset; - pos.signal = signal; - rc = smc_ism_write(conn->lgr->smcd, &pos, data, len); + rc = smc_ism_write(conn->lgr->smcd, conn->peer_token, + conn->peer_rmbe_idx, signal, conn->tx_off + offset, + data, len); if (rc) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return rc; @@ -269,22 +335,21 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate(lgr, true); + smcr_link_down_cond_sched(link); return rc; } @@ -310,8 +375,11 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + 
sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); + u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -319,13 +387,25 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { - struct ib_sge *sge = - wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; + struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; + struct ib_sge *sge = wr->wr.sg_list; + u64 base_addr = dma_addr; + + if (dst_len < link->qp_attr.cap.max_inline_data) { + base_addr = virt_addr; + wr->wr.send_flags |= IB_SEND_INLINE; + } else { + wr->wr.send_flags &= ~IB_SEND_INLINE; + } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? + (virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; + if (conn->sndbuf_desc->is_vm) + sge[srcchunk].lkey = + conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; @@ -338,8 +418,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, - &wr_rdma_buf->wr_tx_rdma[dstchunk]); + rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); if (rc) return rc; if (dst_len_sum == len) @@ -418,8 +497,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* destination: RMBE */ /* cf. 
snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); - if (rmbespace <= 0) + if (rmbespace <= 0) { + struct smc_sock *smc = container_of(conn, struct smc_sock, + conn); + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; + } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); @@ -481,13 +564,17 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { + smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -497,17 +584,24 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (conn->killed) return -EPIPE; rc = 0; - mod_delayed_work(system_wq, &conn->tx_work, + mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } return rc; } spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } @@ -521,6 +615,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) out_unlock: spin_unlock_bh(&conn->send_lock); + smc_wr_tx_link_put(link); return rc; } @@ -543,13 +638,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) 
return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -557,34 +665,72 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } /* Wakeup sndbuf consumers from process context - * since there is more data to transmit + * since there is more data to transmit. 
The caller + * must hold sock lock. */ -void smc_tx_work(struct work_struct *work) +void smc_tx_pending(struct smc_connection *conn) { - struct smc_connection *conn = container_of(to_delayed_work(work), - struct smc_connection, - tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; - lock_sock(&smc->sk); if (smc->sk.sk_err) - goto out; + return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} -out: +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit in locked + * sock. + */ +void smc_tx_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(to_delayed_work(work), + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_tx_pending(conn); release_sock(&smc->sk); } @@ -614,8 +760,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && !conn->killed) { - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); return; } } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 07e6ad76224a..34b578498b1f 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void 
smc_tx_consumer_update(struct smc_connection *conn, bool force); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..b0678a417e09 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ struct smc_link *link; u32 idx; struct smc_wr_tx_pend_priv priv; + u8 compl_requested; }; /******************************** send queue *********************************/ @@ -53,21 +54,13 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ /* returns true if at least one tx work request is pending on the given link */ static inline bool smc_wr_is_tx_pend(struct smc_link *link) { - if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) != - link->wr_tx_cnt) { - return true; - } - return false; + return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt); } /* wait till all pending tx work requests on the given link are completed */ -static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { - if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), - SMC_WR_TX_WAIT_PENDING_TIME)) - return 0; - else /* timeout */ - return -EPIPE; + wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); } static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) @@ -86,7 +79,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) struct smc_wr_tx_pend pnd_snd; struct smc_link *link; u32 pnd_snd_idx; - int i; link = wc->qp->qp_context; @@ -100,37 +92,50 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); - if (pnd_snd_idx == link->wr_tx_cnt) - return; - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); - /* clear the full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[pnd_snd_idx], 0, - 
sizeof(link->wr_tx_pends[pnd_snd_idx])); - memset(&link->wr_tx_bufs[pnd_snd_idx], 0, - sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) - return; + if (pnd_snd_idx == link->wr_tx_cnt) { + if (link->lgr->smc_version != SMC_V2 || + link->wr_tx_v2_pend->wr_id != wc->wr_id) + return; + link->wr_tx_v2_pend->wc_status = wc->status; + memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } else { + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], + sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + } + if (wc->status) { - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { - /* clear full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[i], 0, - sizeof(link->wr_tx_pends[i])); - memset(&link->wr_tx_bufs[i], 0, - sizeof(link->wr_tx_bufs[i])); - clear_bit(i, link->wr_tx_mask); + if (link->lgr->smc_version == SMC_V2) { + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); } - /* terminate connections of this link group abnormally */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* terminate link */ + smcr_link_down_cond_sched(link); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); wake_up(&link->wr_tx_wait); } -static void smc_wr_tx_tasklet_fn(unsigned long data) +static 
void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int i = 0, rc; int polled = 0; @@ -166,6 +171,8 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) { *idx = link->wr_tx_cnt; + if (!smc_link_sendable(link)) + return -ENOLINK; for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; @@ -207,13 +214,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_sendable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(lgr); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -234,6 +241,33 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, return 0; } +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + + if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) + return -EBUSY; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = link->wr_tx_v2_pend; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = link->wr_tx_cnt; + wr_ib = link->wr_tx_v2_ib; + wr_ib->wr_id = wr_id; + *wr_buf = link->lgr->wr_tx_buf_v2; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv) { @@ -251,6 +285,14 @@ int 
smc_wr_tx_put_slot(struct smc_link *link, test_and_clear_bit(idx, link->wr_tx_mask); wake_up(&link->wr_tx_wait); return 1; + } else if (link->lgr->smc_version == SMC_V2 && + pend->idx == link->wr_tx_cnt) { + /* Large v2 buffer */ + memset(&link->wr_tx_v2_pend, 0, + sizeof(link->wr_tx_v2_pend)); + memset(&link->lgr->wr_tx_buf_v2, 0, + sizeof(link->lgr->wr_tx_buf_v2)); + return 1; } return 0; @@ -270,11 +312,56 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); + } + return rc; +} + +int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + int len) +{ + int rc; + + link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); + if (rc) { + smc_wr_tx_put_slot(link, priv); + smcr_link_down_cond_sched(link); } return rc; } +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + u32 pnd_idx; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + pnd_idx = pend->idx; + init_completion(&link->wr_tx_compl[pnd_idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pnd_idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + /* Register a memory region and wait for result. 
*/ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { @@ -290,12 +377,15 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (rc) return rc; + atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); + if (atomic_dec_and_test(&link->wr_reg_refcnt)) + wake_up_all(&link->wr_reg_wait); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -314,25 +404,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) return rc; } -void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data) -{ - struct smc_wr_tx_pend_priv *tx_pend; - struct smc_wr_rx_hdr *wr_tx; - int i; - - for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { - wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i]; - if (wr_tx->type != wr_tx_hdr_type) - continue; - tx_pend = &link->wr_tx_pends[i].priv; - if (filter(tx_pend, data)) - dismisser(tx_pend); - } -} - /****************************** receive queue ********************************/ int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) @@ -383,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; + link->wr_rx_id_compl = wc[i].wr_id; if (wc[i].status == IB_WC_SUCCESS) { link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); @@ -393,10 +465,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - /* terminate connections of this link group - * abnormally - */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); + if (link->wr_rx_id_compl == link->wr_rx_id) 
+ wake_up(&link->wr_rx_empty_wait); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -406,9 +477,9 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) } } -static void smc_wr_rx_tasklet_fn(unsigned long data) +static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; int polled = 0; int rc; @@ -485,10 +556,12 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { - lnk->wr_tx_sges[i].addr = + lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) : lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; @@ -506,6 +579,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send_inline) + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = @@ -513,14 +588,44 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; } + + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; + lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; + lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; + + lnk->wr_tx_v2_ib->next = NULL; + lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; + lnk->wr_tx_v2_ib->num_sge = 1; + lnk->wr_tx_v2_ib->opcode = 
IB_WR_SEND; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. + * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer + * and the same buffer for all sges. When a larger message arrived then + * the content of the first small sge is copied to the beginning of + * the larger spillover buffer, allowing easy data mapping. + */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - lnk->wr_rx_sges[i].addr = + int x = i * sges_per_buf; + + lnk->wr_rx_sges[x].addr = lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_sges[x + 1].addr = + lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].length = + SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].lkey = + lnk->roce_pd->local_dma_lkey; + } lnk->wr_rx_ibs[i].next = NULL; - lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; - lnk->wr_rx_ibs[i].num_sge = 1; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; + lnk->wr_rx_ibs[i].num_sge = sges_per_buf; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -533,34 +638,68 @@ void smc_wr_free_link(struct smc_link *lnk) { struct ib_device *ibdev; - if (smc_wr_tx_wait_no_pending_sends(lnk)) - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * - sizeof(*lnk->wr_tx_mask)); - if (!lnk->smcibdev) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_drain_cq(lnk); + smc_wr_wakeup_reg_wait(lnk); + smc_wr_wakeup_tx_wait(lnk); + + smc_wr_tx_wait_no_pending_sends(lnk); + wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); + wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); + if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, SMC_WR_BUF_SIZE * 
lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } if (lnk->wr_tx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); lnk->wr_tx_dma_addr = 0; } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } +} + +void smc_wr_free_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return; + + kfree(lgr->wr_rx_buf_v2); + lgr->wr_rx_buf_v2 = NULL; + kfree(lgr->wr_tx_buf_v2); + lgr->wr_tx_buf_v2 = NULL; } void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_v2_ib); + lnk->wr_tx_v2_ib = NULL; + kfree(lnk->wr_tx_v2_sge); + lnk->wr_tx_v2_sge = NULL; + kfree(lnk->wr_tx_v2_pend); + lnk->wr_tx_v2_pend = NULL; + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; - kfree(lnk->wr_tx_mask); + bitmap_free(lnk->wr_tx_mask); lnk->wr_tx_mask = NULL; kfree(lnk->wr_tx_sges); lnk->wr_tx_sges = NULL; @@ -580,8 +719,26 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_rx_bufs = NULL; } +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return 0; + + lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_rx_buf_v2) + return -ENOMEM; + lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_tx_buf_v2) { + kfree(lgr->wr_rx_buf_v2); + return -ENOMEM; + } + return 0; +} + int smc_wr_alloc_link_mem(struct smc_link *link) { + int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 
2 : 1; + /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) @@ -614,13 +771,11 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]), + sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT), - sizeof(*link->wr_tx_mask), - GFP_KERNEL); + link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, @@ -628,8 +783,36 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; + + if (link->lgr->smc_version == SMC_V2) { + link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), + GFP_KERNEL); + if (!link->wr_tx_v2_ib) + goto no_mem_tx_compl; + link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), + GFP_KERNEL); + if (!link->wr_tx_v2_sge) + goto no_mem_v2_ib; + link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), + GFP_KERNEL); + if (!link->wr_tx_v2_pend) + goto no_mem_v2_sge; + } return 0; +no_mem_v2_sge: + kfree(link->wr_tx_v2_sge); +no_mem_v2_ib: + kfree(link->wr_tx_v2_ib); +no_mem_tx_compl: + kfree(link->wr_tx_compl); +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); no_mem_wr_tx_mask: kfree(link->wr_tx_mask); no_mem_wr_rx_sges: @@ -660,10 +843,8 @@ void smc_wr_remove_dev(struct smc_ib_device *smcibdev) void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn, - (unsigned long)smcibdev); - tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn, - (unsigned long)smcibdev); + 
tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); + tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } int smc_wr_create_link(struct smc_link *lnk) @@ -681,6 +862,24 @@ int smc_wr_create_link(struct smc_link *lnk) rc = -EIO; goto out; } + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) { + lnk->wr_tx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + } lnk->wr_tx_dma_addr = ib_dma_map_single( ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); @@ -689,13 +888,27 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - memset(lnk->wr_tx_mask, 0, - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); init_waitqueue_head(&lnk->wr_tx_wait); + atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); + atomic_set(&lnk->wr_reg_refcnt, 0); + init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; dma_unmap: + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 3ac99c898418..45e9b894d3f8 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -22,7 +22,6 @@ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per 
link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) -#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ @@ -60,6 +59,25 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_sendable(link)) + return false; + atomic_inc(&link->wr_tx_refcnt); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); +} + +static inline void smc_wr_drain_cq(struct smc_link *lnk) +{ + wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); @@ -87,8 +105,10 @@ static inline int smc_wr_rx_post(struct smc_link *link) int smc_wr_create_link(struct smc_link *lnk); int smc_wr_alloc_link_mem(struct smc_link *lnk); +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); void smc_wr_free_link(struct smc_link *lnk); void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_free_lgr_mem(struct smc_link_group *lgr); void smc_wr_remember_qp_attr(struct smc_link *lnk); void smc_wr_remove_dev(struct smc_ib_device *smcibdev); void smc_wr_add_dev(struct smc_ib_device *smcibdev); @@ -97,15 +117,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wrs, struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_v2_send(struct smc_link *link, + struct 
smc_wr_tx_pend_priv *priv, int len); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); -void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data); +void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); |