aboutsummaryrefslogtreecommitdiffstats
path: root/net/rds
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig6
-rw-r--r--net/rds/Makefile2
-rw-r--r--net/rds/af_rds.c32
-rw-r--r--net/rds/bind.c2
-rw-r--r--net/rds/cong.c2
-rw-r--r--net/rds/connection.c35
-rw-r--r--net/rds/ib.c51
-rw-r--r--net/rds/ib.h9
-rw-r--r--net/rds/ib_cm.c178
-rw-r--r--net/rds/ib_fmr.c269
-rw-r--r--net/rds/ib_frmr.c8
-rw-r--r--net/rds/ib_mr.h14
-rw-r--r--net/rds/ib_rdma.c28
-rw-r--r--net/rds/ib_recv.c25
-rw-r--r--net/rds/ib_ring.c2
-rw-r--r--net/rds/ib_send.c9
-rw-r--r--net/rds/info.c7
-rw-r--r--net/rds/message.c31
-rw-r--r--net/rds/rdma.c86
-rw-r--r--net/rds/rdma_transport.c5
-rw-r--r--net/rds/rdma_transport.h2
-rw-r--r--net/rds/rds.h33
-rw-r--r--net/rds/recv.c9
-rw-r--r--net/rds/send.c15
-rw-r--r--net/rds/tcp.c45
-rw-r--r--net/rds/tcp.h9
-rw-r--r--net/rds/tcp_connect.c8
-rw-r--r--net/rds/tcp_listen.c56
-rw-r--r--net/rds/tcp_recv.c2
-rw-r--r--net/rds/tcp_send.c9
-rw-r--r--net/rds/threads.c2
-rw-r--r--net/rds/transport.c26
32 files changed, 362 insertions, 655 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index c64e154bc18f..75cd696963b2 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -3,14 +3,14 @@
config RDS
tristate "The Reliable Datagram Sockets Protocol"
depends on INET
- ---help---
+ help
The RDS (Reliable Datagram Sockets) protocol provides reliable,
sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
- ---help---
+ help
Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
@@ -18,7 +18,7 @@ config RDS_TCP
tristate "RDS over TCP"
depends on RDS
depends on IPV6 || !IPV6
- ---help---
+ help
Allow RDS to use TCP as a transport.
This transport does not support RDMA operations.
diff --git a/net/rds/Makefile b/net/rds/Makefile
index e647f9de104a..8fdc118e2927 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -7,7 +7,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
- ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
+ ib_sysctl.o ib_rdma.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 1a5bf3fa4578..3ff6995244e5 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -290,8 +290,7 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return 0;
}
-static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
- int len)
+static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
{
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
@@ -308,14 +307,15 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
goto out;
} else if (len < sizeof(struct sockaddr_in6)) {
/* Assume IPv4 */
- if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+ if (copy_from_sockptr(&sin, optval,
+ sizeof(struct sockaddr_in))) {
ret = -EFAULT;
goto out;
}
ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
sin6.sin6_port = sin.sin_port;
} else {
- if (copy_from_user(&sin6, optval,
+ if (copy_from_sockptr(&sin6, optval,
sizeof(struct sockaddr_in6))) {
ret = -EFAULT;
goto out;
@@ -327,21 +327,20 @@ out:
return ret;
}
-static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
int optlen)
{
int value;
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(value, (int __user *) optval))
+ if (copy_from_sockptr(&value, optval, sizeof(int)))
return -EFAULT;
*optvar = !!value;
return 0;
}
-static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
- int optlen)
+static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
{
int ret;
@@ -358,8 +357,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
return ret;
}
-static int rds_set_transport(struct rds_sock *rs, char __user *optval,
- int optlen)
+static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
{
int t_type;
@@ -369,7 +367,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
if (optlen != sizeof(int))
return -EINVAL;
- if (copy_from_user(&t_type, (int __user *)optval, sizeof(t_type)))
+ if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
return -EFAULT;
if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
@@ -380,7 +378,7 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
-static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
int optlen, int optname)
{
int val, valbool;
@@ -388,7 +386,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
if (optlen != sizeof(int))
return -EFAULT;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
valbool = val ? 1 : 0;
@@ -404,7 +402,7 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
return 0;
}
-static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
+static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
int optlen)
{
struct rds_rx_trace_so trace;
@@ -413,7 +411,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
if (optlen != sizeof(struct rds_rx_trace_so))
return -EFAULT;
- if (copy_from_user(&trace, optval, sizeof(trace)))
+ if (copy_from_sockptr(&trace, optval, sizeof(trace)))
return -EFAULT;
if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
@@ -432,7 +430,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret;
@@ -896,7 +894,7 @@ module_exit(rds_exit);
u32 rds_gen_num;
-static int rds_init(void)
+static int __init rds_init(void)
{
int ret;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5b5fb4ca8d3e..97a29172a8ee 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -104,7 +104,7 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
return -EINVAL;
last = rover;
} else {
- rover = max_t(u16, prandom_u32(), 2);
+ rover = max_t(u16, get_random_u16(), 2);
last = rover - 1;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index ccdff09a79c8..8b689ebbd5b5 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -236,7 +236,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* tcp_setsockopt and/or tcp_sendmsg will deadlock
* when it tries to get the sock_lock())
* 2. Interrupts are masked so that we can mark the
- * the port congested from both send and recv paths.
+ * port congested from both send and recv paths.
* (See comment around declaration of rdc_cong_lock).
* An attempt to get the sock_lock() here will
* therefore trigger warnings.
diff --git a/net/rds/connection.c b/net/rds/connection.c
index ed7f2133acc2..b4cc699c5fad 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -240,12 +240,24 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
- if (is_outgoing && trans->t_prefer_loopback) {
- /* "outgoing" connection - and the transport
- * says it wants the connection handled by the
- * loopback transport. This is what TCP does.
- */
- trans = &rds_loop_transport;
+ if (trans->t_prefer_loopback) {
+ if (likely(is_outgoing)) {
+ /* "outgoing" connection to local address.
+ * Protocol says it wants the connection
+ * handled by the loopback transport.
+ * This is what TCP does.
+ */
+ trans = &rds_loop_transport;
+ } else {
+ /* No transport currently in use
+ * should end up here, but if it
+ * does, reset/destroy the connection.
+ */
+ kfree(conn->c_path);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = ERR_PTR(-EOPNOTSUPP);
+ goto out;
+ }
}
}
@@ -905,6 +917,17 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
+/* Check connectivity of all paths
+ */
+void rds_check_all_paths(struct rds_connection *conn)
+{
+ int i = 0;
+
+ do {
+ rds_conn_path_connect_if_down(&conn->c_path[i]);
+ } while (++i < conn->c_npaths);
+}
+
void rds_conn_connect_if_down(struct rds_connection *conn)
{
WARN_ON(conn->c_trans->t_mp_capable);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index a792d8a3872a..9826fe7f9d00 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
@@ -108,7 +107,6 @@ static void rds_ib_dev_free(struct work_struct *work)
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
- dma_pool_destroy(rds_ibdev->rid_hdrs_pool);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
@@ -127,19 +125,23 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
queue_work(rds_wq, &rds_ibdev->free_work);
}
-static void rds_ib_add_one(struct ib_device *device)
+static int rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- bool has_fr, has_fmr;
+ int ret;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
- return;
+ return -EOPNOTSUPP;
+
+ /* Device must support FRWR */
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ return -EOPNOTSUPP;
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
- return;
+ return -ENOMEM;
spin_lock_init(&rds_ibdev->spinlock);
refcount_set(&rds_ibdev->refcount, 1);
@@ -151,20 +153,14 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
- has_fr = (device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS);
- has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
- device->ops.map_phys_fmr && device->ops.unmap_fmr);
- rds_ibdev->use_fastreg = (has_fr && !has_fmr);
rds_ibdev->odp_capable =
- !!(device->attrs.device_cap_flags &
- IB_DEVICE_ON_DEMAND_PAGING) &&
+ !!(device->attrs.kernel_cap_flags &
+ IBK_ON_DEMAND_PAGING) &&
!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_WRITE) &&
!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
IB_ODP_SUPPORT_READ);
- rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
min_t(unsigned int, (device->attrs.max_mr / 2),
rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
@@ -182,25 +178,22 @@ static void rds_ib_add_one(struct ib_device *device)
if (!rds_ibdev->vector_load) {
pr_err("RDS/IB: %s failed to allocate vector memory\n",
__func__);
+ ret = -ENOMEM;
goto put_dev;
}
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device, 0);
if (IS_ERR(rds_ibdev->pd)) {
+ ret = PTR_ERR(rds_ibdev->pd);
rds_ibdev->pd = NULL;
goto put_dev;
}
- rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name,
- device->dma_device,
- sizeof(struct rds_header),
- L1_CACHE_BYTES, 0);
- if (!rds_ibdev->rid_hdrs_pool)
- goto put_dev;
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
if (IS_ERR(rds_ibdev->mr_1m_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_1m_pool);
rds_ibdev->mr_1m_pool = NULL;
goto put_dev;
}
@@ -208,18 +201,16 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->mr_8k_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
if (IS_ERR(rds_ibdev->mr_8k_pool)) {
+ ret = PTR_ERR(rds_ibdev->mr_8k_pool);
rds_ibdev->mr_8k_pool = NULL;
goto put_dev;
}
- rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
- device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
- rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
- rds_ibdev->max_8k_mrs);
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
+ device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs);
- pr_info("RDS/IB: %s: %s supported and preferred\n",
- device->name,
- rds_ibdev->use_fastreg ? "FRMR" : "FMR");
+ pr_info("RDS/IB: %s: added\n", device->name);
down_write(&rds_ib_devices_lock);
list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
@@ -227,12 +218,13 @@ static void rds_ib_add_one(struct ib_device *device)
refcount_inc(&rds_ibdev->refcount);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
- refcount_inc(&rds_ibdev->refcount);
rds_ib_nodev_connect();
+ return 0;
put_dev:
rds_ib_dev_put(rds_ibdev);
+ return ret;
}
/*
@@ -274,9 +266,6 @@ static void rds_ib_remove_one(struct ib_device *device, void *client_data)
{
struct rds_ib_device *rds_ibdev = client_data;
- if (!rds_ibdev)
- return;
-
rds_ib_dev_shutdown(rds_ibdev);
/* stop connection attempts from getting a reference to this device. */
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 0296f1f7acda..2ba71102b1f1 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -246,14 +246,11 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */
- u8 use_fastreg:1;
u8 odp_capable:1;
unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
- unsigned int fmr_max_remaps;
unsigned int max_8k_mrs;
unsigned int max_1m_mrs;
int max_sge;
@@ -266,7 +263,6 @@ struct rds_ib_device {
int *vector_load;
};
-#define ibdev_to_node(ibdev) dev_to_node((ibdev)->dev.parent)
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
@@ -383,11 +379,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs);
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs);
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index c71f4328d138..26b069e1999d 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -30,13 +30,13 @@
* SOFTWARE.
*
*/
-#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include <net/addrconf.h>
+#include <rdma/ib_cm.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -68,31 +68,6 @@ static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
}
/*
- * Tune RNR behavior. Without flow control, we use a rather
- * low timeout, but not the absolute minimum - this should
- * be tunable.
- *
- * We already set the RNR retry count to 7 (which is the
- * smallest infinite number :-) above.
- * If flow control is off, we want to change this back to 0
- * so that we learn quickly when our credit accounting is
- * buggy.
- *
- * Caller passes in a qp_attr pointer - don't waste stack spacv
- * by allocation this twice.
- */
-static void
-rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
-{
- int ret;
-
- attr->min_rnr_timer = IB_RNR_TIMER_000_32;
- ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
- if (ret)
- printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
-}
-
-/*
* Connection established.
* We get here for both outgoing and incoming connection.
*/
@@ -100,7 +75,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
{
struct rds_ib_connection *ic = conn->c_transport_data;
const union rds_ib_conn_priv *dp = NULL;
- struct ib_qp_attr qp_attr;
__be64 ack_seq = 0;
__be32 credit = 0;
u8 major = 0;
@@ -168,14 +142,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
* the posted credit count. */
rds_ib_recv_refill(conn, 1, GFP_KERNEL);
- /* Tune RNR behavior */
- rds_ib_tune_rnr(ic, &qp_attr);
-
- qp_attr.qp_state = IB_QPS_RTS;
- err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
- if (err)
- printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
-
/* update ib_device with this local ipaddr */
err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
if (err)
@@ -440,42 +406,87 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
rds_ibdev->vector_load[index]--;
}
+static void rds_dma_hdr_free(struct ib_device *dev, struct rds_header *hdr,
+ dma_addr_t dma_addr, enum dma_data_direction dir)
+{
+ ib_dma_unmap_single(dev, dma_addr, sizeof(*hdr), dir);
+ kfree(hdr);
+}
+
+static struct rds_header *rds_dma_hdr_alloc(struct ib_device *dev,
+ dma_addr_t *dma_addr, enum dma_data_direction dir)
+{
+ struct rds_header *hdr;
+
+ hdr = kzalloc_node(sizeof(*hdr), GFP_KERNEL, ibdev_to_node(dev));
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = ib_dma_map_single(dev, hdr, sizeof(*hdr),
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ kfree(hdr);
+ return NULL;
+ }
+
+ return hdr;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @dev: the RDS IB device
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdars: number of headers to free.
+ */
+static void rds_dma_hdrs_free(struct rds_ib_device *dev,
+ struct rds_header **hdrs, dma_addr_t *dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ rds_dma_hdr_free(dev->dev, hdrs[i], dma_addrs[i], dir);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
+
/* Allocate DMA coherent memory to be used to store struct rds_header for
* sending/receiving packets. The pointers to the DMA memory and the
* associated DMA addresses are stored in two arrays.
*
- * @ibdev: the IB device
- * @pool: the DMA memory pool
+ * @dev: the RDS IB device
* @dma_addrs: pointer to the array for storing DMA addresses
* @num_hdrs: number of headers to allocate
*
* It returns the pointer to the array storing the DMA memory pointers. On
* error, NULL pointer is returned.
*/
-struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
- struct dma_pool *pool,
- dma_addr_t **dma_addrs, u32 num_hdrs)
+static struct rds_header **rds_dma_hdrs_alloc(struct rds_ib_device *dev,
+ dma_addr_t **dma_addrs, u32 num_hdrs,
+ enum dma_data_direction dir)
{
struct rds_header **hdrs;
dma_addr_t *hdr_daddrs;
u32 i;
hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdrs)
return NULL;
hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
- ibdev_to_node(ibdev));
+ ibdev_to_node(dev->dev));
if (!hdr_daddrs) {
kvfree(hdrs);
return NULL;
}
for (i = 0; i < num_hdrs; i++) {
- hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
+ hdrs[i] = rds_dma_hdr_alloc(dev->dev, &hdr_daddrs[i], dir);
if (!hdrs[i]) {
- rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
+ rds_dma_hdrs_free(dev, hdrs, hdr_daddrs, i, dir);
return NULL;
}
}
@@ -484,24 +495,6 @@ struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
return hdrs;
}
-/* Free the DMA memory used to store struct rds_header.
- *
- * @pool: the DMA memory pool
- * @hdrs: pointer to the array storing DMA memory pointers
- * @dma_addrs: pointer to the array storing DMA addresses
- * @num_hdars: number of headers to free.
- */
-void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
- dma_addr_t *dma_addrs, u32 num_hdrs)
-{
- u32 i;
-
- for (i = 0; i < num_hdrs; i++)
- dma_pool_free(pool, hdrs[i], dma_addrs[i]);
- kvfree(hdrs);
- kvfree(dma_addrs);
-}
-
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -515,7 +508,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev;
unsigned long max_wrs;
int ret, fr_queue_space;
- struct dma_pool *pool;
/*
* It's normal to see a null device if an incoming connection races
@@ -526,10 +518,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
return -EOPNOTSUPP;
/* The fr_queue_space is currently set to 512, to add extra space on
- * completion queue and send queue. This extra space is used for FRMR
+ * completion queue and send queue. This extra space is used for FRWR
* registration and invalidation work requests
*/
- fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+ fr_queue_space = RDS_IB_DEFAULT_FR_WR;
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -611,25 +603,26 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
goto recv_cq_out;
}
- pool = rds_ibdev->rid_hdrs_pool;
- ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA send hdrs alloc failed\n");
goto qp_out;
}
- ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(rds_ibdev, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("DMA recv hdrs alloc failed\n");
goto send_hdrs_dma_out;
}
- ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
- &ic->i_ack_dma);
+ ic->i_ack = rds_dma_hdr_alloc(rds_ibdev->dev, &ic->i_ack_dma,
+ DMA_TO_DEVICE);
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("DMA ack header alloc failed\n");
@@ -665,18 +658,19 @@ sends_out:
vfree(ic->i_sends);
ack_dma_out:
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(rds_ibdev->dev, ic->i_ack, ic->i_ack_dma,
+ DMA_TO_DEVICE);
ic->i_ack = NULL;
recv_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr, DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
send_hdrs_dma_out:
- rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ rds_dma_hdrs_free(rds_ibdev, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr, DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
@@ -710,7 +704,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+ * from an older version. This could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-(
*/
@@ -919,6 +913,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
event->param.conn.responder_resources,
event->param.conn.initiator_depth, isv6);
+ rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
/* rdma_accept() calls rdma_reject() internally if it fails */
if (rdma_accept(cm_id, &conn_param))
rds_ib_conn_error(conn, "rdma_accept failed\n");
@@ -927,7 +922,8 @@ out:
if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
- rdma_reject(cm_id, &err, sizeof(int));
+ rdma_reject(cm_id, &err, sizeof(int),
+ IB_CM_REJ_CONSUMER_DEFINED);
return destroy;
}
@@ -954,9 +950,10 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
conn->c_proposed_version,
UINT_MAX, UINT_MAX, isv6);
- ret = rdma_connect(cm_id, &conn_param);
+ ret = rdma_connect_locked(cm_id, &conn_param);
if (ret)
- rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+ rds_ib_conn_error(conn, "rdma_connect_locked failed (%d)\n",
+ ret);
out:
/* Beware - returning non-zero tells the rdma_cm to destroy
@@ -1107,29 +1104,30 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
}
if (ic->rds_ibdev) {
- struct dma_pool *pool;
-
- pool = ic->rds_ibdev->rid_hdrs_pool;
-
/* then free the resources that ib callbacks use */
if (ic->i_send_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_send_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_send_hdrs,
ic->i_send_hdrs_dma,
- ic->i_send_ring.w_nr);
+ ic->i_send_ring.w_nr,
+ DMA_TO_DEVICE);
ic->i_send_hdrs = NULL;
ic->i_send_hdrs_dma = NULL;
}
if (ic->i_recv_hdrs) {
- rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
+ rds_dma_hdrs_free(ic->rds_ibdev,
+ ic->i_recv_hdrs,
ic->i_recv_hdrs_dma,
- ic->i_recv_ring.w_nr);
+ ic->i_recv_ring.w_nr,
+ DMA_FROM_DEVICE);
ic->i_recv_hdrs = NULL;
ic->i_recv_hdrs_dma = NULL;
}
if (ic->i_ack) {
- dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ rds_dma_hdr_free(ic->rds_ibdev->dev, ic->i_ack,
+ ic->i_ack_dma, DMA_TO_DEVICE);
ic->i_ack = NULL;
}
} else {
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
deleted file mode 100644
index 93c0437e6a5f..000000000000
--- a/net/rds/ib_fmr.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2016 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ib_mr.h"
-
-struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
-{
- struct rds_ib_mr_pool *pool;
- struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_fmr *fmr;
- int err = 0;
-
- if (npages <= RDS_MR_8K_MSG_SIZE)
- pool = rds_ibdev->mr_8k_pool;
- else
- pool = rds_ibdev->mr_1m_pool;
-
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
-
- /* Switch pools if one of the pool is reaching upper limit */
- if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- pool = rds_ibdev->mr_1m_pool;
- else
- pool = rds_ibdev->mr_8k_pool;
- }
-
- ibmr = rds_ib_try_reuse_ibmr(pool);
- if (ibmr)
- return ibmr;
-
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
- rdsibdev_to_node(rds_ibdev));
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- fmr = &ibmr->u.fmr;
- fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_ATOMIC),
- &pool->fmr_attr);
- if (IS_ERR(fmr->fmr)) {
- err = PTR_ERR(fmr->fmr);
- fmr->fmr = NULL;
- pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
- goto out_no_cigar;
- }
-
- ibmr->pool = pool;
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
-
- return ibmr;
-
-out_no_cigar:
- kfree(ibmr);
- atomic_dec(&pool->item_count);
-
- return ERR_PTR(err);
-}
-
-static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev,
- struct rds_ib_mr *ibmr, struct scatterlist *sg,
- unsigned int nents)
-{
- struct ib_device *dev = rds_ibdev->dev;
- struct rds_ib_fmr *fmr = &ibmr->u.fmr;
- struct scatterlist *scat = sg;
- u64 io_addr = 0;
- u64 *dma_pages;
- u32 len;
- int page_cnt, sg_dma_len;
- int i, j;
- int ret;
-
- sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- if (unlikely(!sg_dma_len)) {
- pr_warn("RDS/IB: %s failed!\n", __func__);
- return -EBUSY;
- }
-
- len = 0;
- page_cnt = 0;
-
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = sg_dma_len(&scat[i]);
- u64 dma_addr = sg_dma_address(&scat[i]);
-
- if (dma_addr & ~PAGE_MASK) {
- if (i > 0) {
- ib_dma_unmap_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- return -EINVAL;
- } else {
- ++page_cnt;
- }
- }
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
- if (i < sg_dma_len - 1) {
- ib_dma_unmap_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- return -EINVAL;
- } else {
- ++page_cnt;
- }
- }
-
- len += dma_len;
- }
-
- page_cnt += len >> PAGE_SHIFT;
- if (page_cnt > ibmr->pool->fmr_attr.max_pages) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- return -EINVAL;
- }
-
- dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
- if (!dma_pages) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- return -ENOMEM;
- }
-
- page_cnt = 0;
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = sg_dma_len(&scat[i]);
- u64 dma_addr = sg_dma_address(&scat[i]);
-
- for (j = 0; j < dma_len; j += PAGE_SIZE)
- dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
- }
-
- ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
- if (ret) {
- ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
- goto out;
- }
-
- /* Success - we successfully remapped the MR, so we can
- * safely tear down the old mapping.
- */
- rds_ib_teardown_mr(ibmr);
-
- ibmr->sg = scat;
- ibmr->sg_len = nents;
- ibmr->sg_dma_len = sg_dma_len;
- ibmr->remap_count++;
-
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
- ret = 0;
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
-struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
- struct scatterlist *sg,
- unsigned long nents,
- u32 *key)
-{
- struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_fmr *fmr;
- int ret;
-
- ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
- if (IS_ERR(ibmr))
- return ibmr;
-
- ibmr->device = rds_ibdev;
- fmr = &ibmr->u.fmr;
- ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
- if (ret == 0)
- *key = fmr->fmr->rkey;
- else
- rds_ib_free_mr(ibmr, 0);
-
- return ibmr;
-}
-
-void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
- unsigned long *unpinned, unsigned int goal)
-{
- struct rds_ib_mr *ibmr, *next;
- struct rds_ib_fmr *fmr;
- LIST_HEAD(fmr_list);
- int ret = 0;
- unsigned int freed = *nfreed;
-
- /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, list, unmap_list) {
- fmr = &ibmr->u.fmr;
- list_add(&fmr->fmr->list, &fmr_list);
- }
-
- ret = ib_unmap_fmr(&fmr_list);
- if (ret)
- pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
-
- /* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, list, unmap_list) {
- fmr = &ibmr->u.fmr;
- *unpinned += ibmr->sg_len;
- __rds_ib_teardown_mr(ibmr);
- if (freed < goal ||
- ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
- list_del(&ibmr->unmap_list);
- ib_dealloc_fmr(fmr->fmr);
- kfree(ibmr);
- freed++;
- }
- }
- *nfreed = freed;
-}
-
-void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
-{
- struct rds_ib_mr_pool *pool = ibmr->pool;
-
- if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
- else
- llist_add(&ibmr->llnode, &pool->free_list);
-}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 06ecf9d2d4bf..28c1b0022178 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -76,7 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
frmr = &ibmr->u.frmr;
frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
- pool->fmr_attr.max_pages);
+ pool->max_pages);
if (IS_ERR(frmr->mr)) {
pr_warn("RDS/IB: %s failed to allocate MR", __func__);
err = PTR_ERR(frmr->mr);
@@ -131,9 +131,9 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
cpu_relax();
}
- ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len,
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len,
&off, PAGE_SIZE);
- if (unlikely(ret != ibmr->sg_len))
+ if (unlikely(ret != ibmr->sg_dma_len))
return ret < 0 ? ret : -EINVAL;
if (cmpxchg(&frmr->fr_state,
@@ -240,7 +240,7 @@ static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
}
frmr->dma_npages += len >> PAGE_SHIFT;
- if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) {
+ if (frmr->dma_npages > ibmr->pool->max_pages) {
ret = -EMSGSIZE;
goto out_unmap;
}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 0c8252d7fe2b..ea5e9aee4959 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -43,10 +43,6 @@
#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
-struct rds_ib_fmr {
- struct ib_fmr *fmr;
-};
-
enum rds_ib_fr_state {
FRMR_IS_FREE, /* mr invalidated & ready for use */
FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
@@ -84,7 +80,6 @@ struct rds_ib_mr {
u8 odp:1;
union {
- struct rds_ib_fmr fmr;
struct rds_ib_frmr frmr;
struct ib_mr *mr;
} u;
@@ -109,8 +104,7 @@ struct rds_ib_mr_pool {
unsigned long max_items;
unsigned long max_items_soft;
unsigned long max_free_pinned;
- struct ib_fmr_attr fmr_attr;
- bool use_fastreg;
+ unsigned int max_pages;
};
extern struct workqueue_struct *rds_ib_mr_wq;
@@ -136,15 +130,9 @@ u32 rds_ib_get_lkey(void *trans_private);
void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);
-struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
-struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
- unsigned long, u32 *);
struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
-void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
- unsigned long *, unsigned int);
-void rds_ib_free_fmr_list(struct rds_ib_mr *);
struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
struct rds_ib_connection *ic,
struct scatterlist *sg,
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index b34b24e237f8..8f070ee7e742 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -181,7 +181,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
iinfo->rdma_mr_max = pool_1m->max_items;
- iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+ iinfo->rdma_mr_size = pool_1m->max_pages;
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -191,7 +191,7 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
iinfo6->rdma_mr_max = pool_1m->max_items;
- iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+ iinfo6->rdma_mr_size = pool_1m->max_pages;
}
#endif
@@ -406,10 +406,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (list_empty(&unmap_list))
goto out;
- if (pool->use_fastreg)
- rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
- else
- rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
+ rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
if (!list_empty(&unmap_list)) {
unsigned long flags;
@@ -503,10 +500,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
}
/* Return it to the pool's free list */
- if (rds_ibdev->use_fastreg)
- rds_ib_free_frmr_list(ibmr);
- else
- rds_ib_free_fmr_list(ibmr);
+ rds_ib_free_frmr_list(ibmr);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
@@ -622,10 +616,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
- if (rds_ibdev->use_fastreg)
- ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
- else
- ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+ ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
if (IS_ERR(ibmr)) {
ret = PTR_ERR(ibmr);
pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
@@ -669,19 +660,16 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
if (pool_type == RDS_IB_MR_1M_POOL) {
/* +1 allows for unaligned MRs */
- pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
pool->max_items = rds_ibdev->max_1m_mrs;
} else {
/* pool_type == RDS_IB_MR_8K_POOL */
- pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
pool->max_items = rds_ibdev->max_8k_mrs;
}
- pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
- pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
- pool->use_fastreg = rds_ibdev->use_fastreg;
return pool;
}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 694d411dc72f..cfbf0e129cba 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -310,8 +310,8 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
int ret = -ENOMEM;
- gfp_t slab_mask = GFP_NOWAIT;
- gfp_t page_mask = GFP_NOWAIT;
+ gfp_t slab_mask = gfp;
+ gfp_t page_mask = gfp;
if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
@@ -363,6 +363,7 @@ static int acquire_refill(struct rds_connection *conn)
static void release_refill(struct rds_connection *conn)
{
clear_bit(RDS_RECV_REFILL, &conn->c_flags);
+ smp_mb__after_atomic();
/* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
* hot path and finding waiters is very rare. We don't want to walk
@@ -662,10 +663,16 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
seq = rds_ib_get_ack(ic);
rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
rds_message_populate_header(hdr, 0, 0, 0);
hdr->h_ack = cpu_to_be64(seq);
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, ic->i_ack_dma,
+ sizeof(*hdr), DMA_TO_DEVICE);
+
ic->i_ack_queued = jiffies;
ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
@@ -845,6 +852,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_incoming *ibinc = ic->i_ibinc;
struct rds_header *ihdr, *hdr;
+ dma_addr_t dma_addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
/* XXX shut down the connection if port 0,0 are seen? */
@@ -863,6 +871,8 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
@@ -870,7 +880,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
- return;
+ goto done;
}
/* Process the ACK sequence which comes with every packet */
@@ -899,7 +909,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
*/
rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
- return;
+ goto done;
}
/*
@@ -933,7 +943,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
- return;
+ goto done;
}
}
@@ -965,6 +975,9 @@ static void rds_ib_process_recv(struct rds_connection *conn,
rds_inc_put(&ibinc->ii_inc);
}
+done:
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev, dma_addr,
+ sizeof(*ihdr), DMA_FROM_DEVICE);
}
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
@@ -1020,7 +1033,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring)) {
- rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
rds_ib_stats_inc(s_ib_rx_refill_from_cq);
}
}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
index ff97e8eda858..006b2e441418 100644
--- a/net/rds/ib_ring.c
+++ b/net/rds/ib_ring.c
@@ -141,7 +141,7 @@ int rds_ib_ring_low(struct rds_ib_work_ring *ring)
}
/*
- * returns the oldest alloced ring entry. This will be the next one
+ * returns the oldest allocated ring entry. This will be the next one
* freed. This can't be called if there are none allocated.
*/
u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index dfe778220657..4190b90ff3b1 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -294,7 +294,6 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
rds_ib_ring_free(&ic->i_send_ring, completed);
rds_ib_sub_signaled(ic, nr_sig);
- nr_sig = 0;
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))
@@ -638,6 +637,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[0].length = sizeof(struct rds_header);
send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+ ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
sizeof(struct rds_header));
@@ -688,6 +691,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
adv_credits = 0;
rds_ib_stats_inc(s_ib_tx_credit_updates);
}
+ ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
+ ic->i_send_hdrs_dma[pos],
+ sizeof(struct rds_header),
+ DMA_TO_DEVICE);
if (prev)
prev->s_wr.next = &send->s_wr;
diff --git a/net/rds/info.c b/net/rds/info.c
index 03f6fd56d237..b6b46a8214a0 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -162,7 +162,6 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
struct rds_info_lengths lens;
unsigned long nr_pages = 0;
unsigned long start;
- unsigned long i;
rds_info_func func;
struct page **pages = NULL;
int ret;
@@ -193,7 +192,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
+ ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
@@ -235,8 +234,8 @@ call_func:
ret = -EFAULT;
out:
- for (i = 0; pages && i < nr_pages; i++)
- put_page(pages[i]);
+ if (pages)
+ unpin_user_pages(pages, nr_pages);
kfree(pages);
return ret;
diff --git a/net/rds/message.c b/net/rds/message.c
index 50f13f1d4ae0..44dbc612ef54 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2020 Oracle and/or its affiliates.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -162,12 +162,12 @@ static void rds_message_purge(struct rds_message *rm)
if (rm->rdma.op_active)
rds_rdma_free_op(&rm->rdma);
if (rm->rdma.op_rdma_mr)
- rds_mr_put(rm->rdma.op_rdma_mr);
+ kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final);
if (rm->atomic.op_active)
rds_atomic_free_op(&rm->atomic);
if (rm->atomic.op_rdma_mr)
- rds_mr_put(rm->atomic.op_rdma_mr);
+ kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final);
}
void rds_message_put(struct rds_message *rm)
@@ -308,26 +308,20 @@ out:
/*
* RDS ops use this to grab SG entries from the rm's sg pool.
*/
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents,
- int *ret)
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
{
struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
struct scatterlist *sg_ret;
- if (WARN_ON(!ret))
- return NULL;
-
if (nents <= 0) {
pr_warn("rds: alloc sgs failed! nents <= 0\n");
- *ret = -EINVAL;
- return NULL;
+ return ERR_PTR(-EINVAL);
}
if (rm->m_used_sgs + nents > rm->m_total_sgs) {
pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n",
rm->m_total_sgs, rm->m_used_sgs, nents);
- *ret = -ENOMEM;
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
sg_ret = &sg_first[rm->m_used_sgs];
@@ -343,7 +337,6 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
unsigned int i;
int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE);
int extra_bytes = num_sgs * sizeof(struct scatterlist);
- int ret;
rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
if (!rm)
@@ -352,15 +345,16 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE);
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret);
- if (!rm->data.op_sg) {
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ if (IS_ERR(rm->data.op_sg)) {
+ void *err = ERR_CAST(rm->data.op_sg);
rds_message_put(rm);
- return ERR_PTR(ret);
+ return err;
}
for (i = 0; i < rm->data.op_nents; ++i) {
sg_set_page(&rm->data.op_sg[i],
- virt_to_page(page_addrs[i]),
+ virt_to_page((void *)page_addrs[i]),
PAGE_SIZE, 0);
}
@@ -397,7 +391,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
size_t start;
ssize_t copied;
- copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+ copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE,
1, &start);
if (copied < 0) {
struct mmpin *mmp;
@@ -411,7 +405,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
goto err;
}
total_copied += copied;
- iov_iter_advance(from, copied);
length -= copied;
sg_set_page(sg, pages, copied, start);
rm->data.op_nents++;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 585e6b3b69ce..fba82d36593a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -84,7 +84,7 @@ static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
if (insert) {
rb_link_node(&insert->r_rb_node, parent, p);
rb_insert_color(&insert->r_rb_node, root);
- refcount_inc(&insert->r_refcount);
+ kref_get(&insert->r_kref);
}
return NULL;
}
@@ -99,10 +99,7 @@ static void rds_destroy_mr(struct rds_mr *mr)
unsigned long flags;
rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
- mr->r_key, refcount_read(&mr->r_refcount));
-
- if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
- return;
+ mr->r_key, kref_read(&mr->r_kref));
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
if (!RB_EMPTY_NODE(&mr->r_rb_node))
@@ -115,8 +112,10 @@ static void rds_destroy_mr(struct rds_mr *mr)
mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}
-void __rds_put_mr_final(struct rds_mr *mr)
+void __rds_put_mr_final(struct kref *kref)
{
+ struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
+
rds_destroy_mr(mr);
kfree(mr);
}
@@ -140,8 +139,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
@@ -242,7 +240,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
- refcount_set(&mr->r_refcount, 1);
+ kref_init(&mr->r_kref);
RB_CLEAR_NODE(&mr->r_rb_node);
mr->r_trans = rs->rs_transport;
mr->r_sock = rs;
@@ -271,7 +269,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
} else {
nents = ret;
- sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
@@ -343,7 +341,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
if (mr_ret) {
- refcount_inc(&mr->r_refcount);
+ kref_get(&mr->r_kref);
*mr_ret = mr;
}
@@ -351,25 +349,24 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
out:
kfree(pages);
if (mr)
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
return ret;
}
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_get_mr_args args;
if (optlen != sizeof(struct rds_get_mr_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
- sizeof(struct rds_get_mr_args)))
+ if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args)))
return -EFAULT;
return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
}
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_get_mr_for_dest_args args;
struct rds_get_mr_args new_args;
@@ -377,7 +374,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
if (optlen != sizeof(struct rds_get_mr_for_dest_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+ if (copy_from_sockptr(&args, optval,
sizeof(struct rds_get_mr_for_dest_args)))
return -EFAULT;
@@ -396,7 +393,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
/*
* Free the MR indicated by the given R_Key
*/
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
{
struct rds_free_mr_args args;
struct rds_mr *mr;
@@ -405,8 +402,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
if (optlen != sizeof(struct rds_free_mr_args))
return -EINVAL;
- if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
- sizeof(struct rds_free_mr_args)))
+ if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args)))
return -EFAULT;
/* Special case - a null cookie means flush all unused MRs */
@@ -434,13 +430,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
if (!mr)
return -EINVAL;
- /*
- * call rds_destroy_mr() ourselves so that we're sure it's done by the time
- * we return. If we let rds_mr_put() do it it might not happen until
- * someone else drops their ref.
- */
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ kref_put(&mr->r_kref, __rds_put_mr_final);
return 0;
}
@@ -464,6 +454,14 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
return;
}
+ /* Get a reference so that the MR won't go away before calling
+ * sync_mr() below.
+ */
+ kref_get(&mr->r_kref);
+
+ /* If it is going to be freed, remove it from the tree now so
+ * that no other thread can find it and free it.
+ */
if (mr->r_use_once || force) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
@@ -477,12 +475,13 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
if (mr->r_trans->sync_mr)
mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+ /* Release the reference held above. */
+ kref_put(&mr->r_kref, __rds_put_mr_final);
+
/* If the MR was marked as invalidate, this will
* trigger an async flush. */
- if (zot_me) {
- rds_destroy_mr(mr);
- rds_mr_put(mr);
- }
+ if (zot_me)
+ kref_put(&mr->r_kref, __rds_put_mr_final);
}
void rds_rdma_free_op(struct rm_rdma_op *ro)
@@ -490,7 +489,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
unsigned int i;
if (ro->op_odp_mr) {
- rds_mr_put(ro->op_odp_mr);
+ kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);
} else {
for (i = 0; i < ro->op_nents; i++) {
struct page *page = sg_page(&ro->op_sg[i]);
@@ -566,6 +565,9 @@ int rds_rdma_extra_size(struct rds_rdma_args *args,
if (args->nr_local == 0)
return -EINVAL;
+ if (args->nr_local > UIO_MAXIOV)
+ return -EMSGSIZE;
+
iov->iov = kcalloc(args->nr_local,
sizeof(struct rds_iovec),
GFP_KERNEL);
@@ -664,9 +666,11 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
op->op_odp_mr = NULL;
WARN_ON(!nr_pages);
- op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
- if (!op->op_sg)
+ op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+ if (IS_ERR(op->op_sg)) {
+ ret = PTR_ERR(op->op_sg);
goto out_pages;
+ }
if (op->op_notify || op->op_recverr) {
/* We allocate an uninitialized notifier here, because
@@ -730,7 +734,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
goto out_pages;
}
RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
- refcount_set(&local_odp_mr->r_refcount, 1);
+ kref_init(&local_odp_mr->r_kref);
local_odp_mr->r_trans = rs->rs_transport;
local_odp_mr->r_sock = rs;
local_odp_mr->r_trans_private =
@@ -738,7 +742,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
NULL, 0, rs, &local_odp_mr->r_key, NULL,
iov->addr, iov->bytes, ODP_VIRTUAL);
if (IS_ERR(local_odp_mr->r_trans_private)) {
- ret = IS_ERR(local_odp_mr->r_trans_private);
+ ret = PTR_ERR(local_odp_mr->r_trans_private);
rdsdebug("get_mr ret %d %p\"", ret,
local_odp_mr->r_trans_private);
kfree(local_odp_mr);
@@ -827,7 +831,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
if (!mr)
err = -EINVAL; /* invalid r_key */
else
- refcount_inc(&mr->r_refcount);
+ kref_get(&mr->r_kref);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (mr) {
@@ -905,9 +909,11 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
rm->atomic.op_active = 1;
rm->atomic.op_recverr = rs->rs_recverr;
- rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1, &ret);
- if (!rm->atomic.op_sg)
+ rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+ if (IS_ERR(rm->atomic.op_sg)) {
+ ret = PTR_ERR(rm->atomic.op_sg);
goto err;
+ }
/* verify 8 byte-aligned */
if (args->local_addr & 0x7) {
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 5f741e51b4ba..d36f3f6b4351 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -87,6 +87,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_ADDR_RESOLVED:
rdma_set_service_type(cm_id, conn->c_tos);
+ rdma_set_min_rnr_timer(cm_id, IB_RNR_TIMER_000_32);
/* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id,
RDS_RDMA_RESOLVE_TIMEOUT_MS);
@@ -290,7 +291,7 @@ static void rds_rdma_listen_stop(void)
#endif
}
-static int rds_rdma_init(void)
+static int __init rds_rdma_init(void)
{
int ret;
@@ -306,7 +307,7 @@ out:
}
module_init(rds_rdma_init);
-static void rds_rdma_exit(void)
+static void __exit rds_rdma_exit(void)
{
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index bfafd4a6d827..ca4c3a667091 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -13,7 +13,7 @@
/* Below reject reason is for legacy interoperability issue with non-linux
* RDS endpoints where older version incompatibility is conveyed via value 1.
- * For future version(s), proper encoded reject reason should be be used.
+ * For future version(s), proper encoded reject reason should be used.
*/
#define RDS_RDMA_REJ_INCOMPAT 1
diff --git a/net/rds/rds.h b/net/rds/rds.h
index e4a603523083..d35d1fc39807 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -291,7 +291,7 @@ struct rds_incoming {
struct rds_mr {
struct rb_node r_rb_node;
- refcount_t r_refcount;
+ struct kref r_kref;
u32 r_key;
/* A copy of the creation flags */
@@ -299,19 +299,11 @@ struct rds_mr {
unsigned int r_invalidate:1;
unsigned int r_write:1;
- /* This is for RDS_MR_DEAD.
- * It would be nice & consistent to make this part of the above
- * bit field here, but we need to use test_and_set_bit.
- */
- unsigned long r_state;
struct rds_sock *r_sock; /* back pointer to the socket that owns us */
struct rds_transport *r_trans;
void *r_trans_private;
};
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD 0
-
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
return r_key | (((u64) offset) << 32);
@@ -786,6 +778,7 @@ void rds_conn_drop(struct rds_connection *conn);
void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy);
void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
+void rds_check_all_paths(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -831,6 +824,12 @@ rds_conn_path_up(struct rds_conn_path *cp)
}
static inline int
+rds_conn_path_down(struct rds_conn_path *cp)
+{
+ return atomic_read(&cp->cp_state) == RDS_CONN_DOWN;
+}
+
+static inline int
rds_conn_up(struct rds_connection *conn)
{
WARN_ON(conn->c_trans->t_mp_capable);
@@ -852,8 +851,7 @@ rds_conn_connecting(struct rds_connection *conn)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents,
- int *ret);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
bool zcopy);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
@@ -926,9 +924,9 @@ int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_rdma_extra_size(struct rds_rdma_args *args,
struct rds_iov_vector *iov);
@@ -946,12 +944,7 @@ void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
-void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
- if (refcount_dec_and_test(&mr->r_refcount))
- __rds_put_mr_final(mr);
-}
+void __rds_put_mr_final(struct kref *kref);
static inline bool rds_destroy_pending(struct rds_connection *conn)
{
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c8404971d5ab..5b426dc3634d 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -450,12 +450,13 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
struct rds_notifier *notifier;
- struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+ struct rds_rdma_notify cmsg;
unsigned int count = 0, max_messages = ~0U;
unsigned long flags;
LIST_HEAD(copy);
int err = 0;
+ memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */
/* put_cmsg copies to user space and thus may sleep. We can't do this
* with rs_lock held, so first grab as many notifications as we can stuff
@@ -713,7 +714,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (rds_cmsg_recv(inc, msg, rs)) {
ret = -EFAULT;
- goto out;
+ break;
}
rds_recvmsg_zcookie(rs, msg);
@@ -721,8 +722,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (msg->msg_name) {
if (ipv6_addr_v4mapped(&inc->i_saddr)) {
- sin = (struct sockaddr_in *)msg->msg_name;
-
sin->sin_family = AF_INET;
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr =
@@ -730,8 +729,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
msg->msg_namelen = sizeof(*sin);
} else {
- sin6 = (struct sockaddr_in6 *)msg->msg_name;
-
sin6->sin6_family = AF_INET6;
sin6->sin6_port = inc->i_hdr.h_sport;
sin6->sin6_addr = inc->i_saddr;
diff --git a/net/rds/send.c b/net/rds/send.c
index 82dcd8b84fe7..0c5504068e3c 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -272,7 +272,7 @@ restart:
/* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire
- * queue pair to error state. We cold possibly
+ * queue pair to error state. We could possibly
* recover from that, but right now we drop the
* connection.
* Therefore, we never retransmit messages with RDMA ops.
@@ -934,7 +934,7 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs,
case RDS_CMSG_ZCOPY_COOKIE:
zcopy_cookie = true;
- /* fall through */
+ fallthrough;
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
@@ -1225,7 +1225,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
/* If the socket is already bound to a link local address,
* it can only send to peers on the same link. But allow
- * communicating beween link local and non-link local address.
+ * communicating between link local and non-link local address.
*/
if (scope_id != rs->rs_bound_scope_id) {
if (!scope_id) {
@@ -1274,9 +1274,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* Attach data to the rm */
if (payload_len) {
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs, &ret);
- if (!rm->data.op_sg)
+ rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+ if (IS_ERR(rm->data.op_sg)) {
+ ret = PTR_ERR(rm->data.op_sg);
goto out;
+ }
ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
if (ret)
goto out;
@@ -1338,7 +1340,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- rds_conn_path_connect_if_down(cpath);
+ if (rds_conn_path_down(cpath))
+ rds_check_all_paths(conn);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 66121bc6f34e..4444fd82b66d 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
static struct kmem_cache *rds_tcp_conn_slab;
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos);
+ void *buffer, size_t *lenp, loff_t *fpos);
static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
@@ -90,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = {
{ }
};
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
- int val = 1;
-
- kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val,
- sizeof(val));
-}
-
u32 rds_tcp_write_seq(struct rds_tcp_connection *tc)
{
/* seq# of the last byte of data in tcp send buffer */
@@ -176,10 +166,10 @@ void rds_tcp_reset_callbacks(struct socket *sock,
*/
atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
- lock_sock(osock->sk);
/* reset receive side state for rds_tcp_data_recv() for osock */
cancel_delayed_work_sync(&cp->cp_send_w);
cancel_delayed_work_sync(&cp->cp_recv_w);
+ lock_sock(osock->sk);
if (tc->t_tinc) {
rds_inc_put(&tc->t_tinc->ti_inc);
tc->t_tinc = NULL;
@@ -323,8 +313,8 @@ out:
}
#endif
-static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
- __u32 scope_id)
+int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
{
struct net_device *dev = NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -497,23 +487,37 @@ struct rds_tcp_net {
/* All module specific customizations to the RDS-TCP socket should be done in
* rds_tcp_tune() and applied after socket creation.
*/
-void rds_tcp_tune(struct socket *sock)
+bool rds_tcp_tune(struct socket *sock)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
- struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ struct rds_tcp_net *rtn;
- rds_tcp_nonagle(sock);
+ tcp_sock_set_nodelay(sock->sk);
lock_sock(sk);
+ /* TCP timer functions might access net namespace even after
+ * a process which created this net namespace terminated.
+ */
+ if (!sk->sk_net_refcnt) {
+ if (!maybe_get_net(net)) {
+ release_sock(sk);
+ return false;
+ }
+ sk->sk_net_refcnt = 1;
+ netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+ }
+ rtn = net_generic(net, rds_tcp_netid);
if (rtn->sndbuf_size > 0) {
sk->sk_sndbuf = rtn->sndbuf_size;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
}
if (rtn->rcvbuf_size > 0) {
- sk->sk_sndbuf = rtn->rcvbuf_size;
+ sk->sk_rcvbuf = rtn->rcvbuf_size;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
release_sock(sk);
+ return true;
}
static void rds_tcp_accept_worker(struct work_struct *work)
@@ -676,8 +680,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
}
static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *fpos)
+ void *buffer, size_t *lenp, loff_t *fpos)
{
struct net *net = current->nsproxy->net_ns;
int err;
@@ -709,7 +712,7 @@ static void rds_tcp_exit(void)
}
module_exit(rds_tcp_exit);
-static int rds_tcp_init(void)
+static int __init rds_tcp_init(void)
{
int ret;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 3c69361d21c7..f8b5930d7b34 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -49,8 +49,7 @@ struct rds_tcp_statistics {
};
/* tcp.c */
-void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
+bool rds_tcp_tune(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_restore_callbacks(struct socket *sock,
@@ -60,7 +59,8 @@ u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
extern struct rds_transport rds_tcp_transport;
void rds_tcp_accept_work(struct sock *sk);
-
+int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id);
/* tcp_connect.c */
int rds_tcp_conn_path_connect(struct rds_conn_path *cp);
void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
@@ -71,9 +71,8 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
-int rds_tcp_keepalive(struct socket *sock);
+void rds_tcp_keepalive(struct socket *sock);
void *rds_tcp_listen_sock_def_readable(struct net *net);
-void rds_tcp_set_linger(struct socket *sock);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 008f50fb25dd..f0c477c5d1db 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -78,6 +78,7 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
rds_conn_path_drop(cp, false);
+ break;
default:
break;
}
@@ -123,7 +124,10 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
if (ret < 0)
goto out;
- rds_tcp_tune(sock);
+ if (!rds_tcp_tune(sock)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (isv6) {
sin6.sin6_family = AF_INET6;
@@ -207,7 +211,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
if (sock) {
if (rds_destroy_pending(cp->cp_conn))
- rds_tcp_set_linger(sock);
+ sock_no_linger(sock->sk);
sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
lock_sock(sock->sk);
rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 810a3a49e947..7edf2e69d3fe 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -38,36 +38,19 @@
#include "rds.h"
#include "tcp.h"
-int rds_tcp_keepalive(struct socket *sock)
+void rds_tcp_keepalive(struct socket *sock)
{
/* values below based on xs_udp_default_timeout */
int keepidle = 5; /* send a probe 'keepidle' secs after last data */
int keepcnt = 5; /* number of unack'ed probes before declaring dead */
- int keepalive = 1;
- int ret = 0;
-
- ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&keepalive, sizeof(keepalive));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- if (ret < 0)
- goto bail;
+ sock_set_keepalive(sock->sk);
+ tcp_sock_set_keepcnt(sock->sk, keepcnt);
+ tcp_sock_set_keepidle(sock->sk, keepidle);
/* KEEPINTVL is the interval between successive probes. We follow
* the model in xs_tcp_finish_connecting() and re-use keepidle.
*/
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
-bail:
- return ret;
+ tcp_sock_set_keepintvl(sock->sk, keepidle);
}
/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
@@ -111,17 +94,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
return NULL;
}
-void rds_tcp_set_linger(struct socket *sock)
-{
- struct linger no_linger = {
- .l_onoff = 1,
- .l_linger = 0,
- };
-
- kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&no_linger, sizeof(no_linger));
-}
-
int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
@@ -160,11 +132,11 @@ int rds_tcp_accept_one(struct socket *sock)
new_sock->ops = sock->ops;
__module_get(new_sock->ops->owner);
- ret = rds_tcp_keepalive(new_sock);
- if (ret < 0)
+ rds_tcp_keepalive(new_sock);
+ if (!rds_tcp_tune(new_sock)) {
+ ret = -EINVAL;
goto out;
-
- rds_tcp_tune(new_sock);
+ }
inet = inet_sk(new_sock->sk);
@@ -198,6 +170,12 @@ int rds_tcp_accept_one(struct socket *sock)
}
#endif
+ if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) {
+ /* local address connection is only allowed via loopback */
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
conn = rds_conn_create(sock_net(sock->sk),
my_addr, peer_addr,
&rds_tcp_transport, 0, GFP_KERNEL, dev_if);
@@ -241,7 +219,7 @@ rst_nsk:
* be pending on it. By setting linger, we achieve the side-effect
* of avoiding TIME_WAIT state on new_sock.
*/
- rds_tcp_set_linger(new_sock);
+ sock_no_linger(new_sock->sk);
kernel_sock_shutdown(new_sock, SHUT_RDWR);
ret = 0;
out:
@@ -303,7 +281,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
}
sock->sk->sk_reuse = SK_CAN_REUSE;
- rds_tcp_nonagle(sock);
+ tcp_sock_set_nodelay(sock->sk);
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = sock->sk->sk_data_ready;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 42c5ff1eda95..f4ee13da90c7 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -177,7 +177,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
goto out;
}
tc->t_tinc = tinc;
- rdsdebug("alloced tinc %p\n", tinc);
+ rdsdebug("allocated tinc %p\n", tinc);
rds_inc_path_init(&tinc->ti_inc, cp,
&cp->cp_conn->c_faddr);
tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 78a2554a4497..8c4d1d6e9249 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -38,23 +38,18 @@
#include "rds.h"
#include "tcp.h"
-static void rds_tcp_cork(struct socket *sock, int val)
-{
- kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val));
-}
-
void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp)
{
struct rds_tcp_connection *tc = cp->cp_transport_data;
- rds_tcp_cork(tc->t_sock, 1);
+ tcp_sock_set_cork(tc->t_sock->sk, true);
}
void rds_tcp_xmit_path_complete(struct rds_conn_path *cp)
{
struct rds_tcp_connection *tc = cp->cp_transport_data;
- rds_tcp_cork(tc->t_sock, 0);
+ tcp_sock_set_cork(tc->t_sock->sk, false);
}
/* the core send_sem serializes this with other xmit and shutdown */
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 32dc50f0a303..1f424cbfcbb4 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -208,6 +208,7 @@ void rds_send_worker(struct work_struct *work)
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
+ break;
default:
break;
}
@@ -232,6 +233,7 @@ void rds_recv_worker(struct work_struct *work)
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
+ break;
default:
break;
}
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 46f709a4b577..f8001ec80867 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -38,6 +38,12 @@
#include "rds.h"
#include "loop.h"
+static char * const rds_trans_modules[] = {
+ [RDS_TRANS_IB] = "rds_rdma",
+ [RDS_TRANS_GAP] = NULL,
+ [RDS_TRANS_TCP] = "rds_tcp",
+};
+
static struct rds_transport *transports[RDS_TRANS_COUNT];
static DECLARE_RWSEM(rds_trans_sem);
@@ -110,18 +116,20 @@ struct rds_transport *rds_trans_get(int t_type)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
- unsigned int i;
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++) {
- trans = transports[i];
-
- if (trans && trans->t_type == t_type &&
- (!trans->t_owner || try_module_get(trans->t_owner))) {
- ret = trans;
- break;
- }
+ trans = transports[t_type];
+ if (!trans) {
+ up_read(&rds_trans_sem);
+ if (rds_trans_modules[t_type])
+ request_module(rds_trans_modules[t_type]);
+ down_read(&rds_trans_sem);
+ trans = transports[t_type];
}
+ if (trans && trans->t_type == t_type &&
+ (!trans->t_owner || try_module_get(trans->t_owner)))
+ ret = trans;
+
up_read(&rds_trans_sem);
return ret;