aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/9p/trans_fd.c88
-rw-r--r--net/9p/trans_virtio.c2
-rw-r--r--net/ceph/auth_x.c49
-rw-r--r--net/ceph/auth_x.h2
-rw-r--r--net/ceph/crush/mapper.c33
-rw-r--r--net/ceph/messenger.c105
-rw-r--r--net/ceph/mon_client.c4
-rw-r--r--net/ceph/osd_client.c10
-rw-r--r--net/ceph/osdmap.c19
-rw-r--r--net/core/flow_dissector.c9
-rw-r--r--net/core/scm.c7
-rw-r--r--net/core/skbuff.c2
-rw-r--r--net/core/sysctl_net_core.c10
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/fib_trie.c4
-rw-r--r--net/ipv4/ip_gre.c8
-rw-r--r--net/ipv4/ip_sockglue.c2
-rw-r--r--net/ipv4/ip_tunnel.c20
-rw-r--r--net/ipv4/ping.c4
-rw-r--r--net/ipv4/raw.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c1
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_ipv4.c12
-rw-r--r--net/ipv4/tcp_memcontrol.c200
-rw-r--r--net/ipv4/udp.c4
-rw-r--r--net/ipv6/addrconf.c5
-rw-r--r--net/ipv6/ip6_flowlabel.c5
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/mac80211/debugfs.c7
-rw-r--r--net/openvswitch/vport-vxlan.c2
-rw-r--r--net/rds/ib.c34
-rw-r--r--net/rds/iw.c23
-rw-r--r--net/sctp/socket.c9
-rw-r--r--net/sunrpc/cache.c10
-rw-r--r--net/sunrpc/rpc_pipe.c60
-rw-r--r--net/sunrpc/xprt.c1
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c41
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c371
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c56
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c33
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c360
-rw-r--r--net/sunrpc/xprtrdma/transport.c30
-rw-r--r--net/sunrpc/xprtrdma/verbs.c24
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h21
-rw-r--r--net/unix/af_unix.c27
-rw-r--r--net/unix/garbage.c8
-rw-r--r--net/vmw_vsock/af_vsock.c19
49 files changed, 1106 insertions, 659 deletions
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index bced8c074c12..7bc2208b6cc4 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -108,9 +108,7 @@ struct p9_poll_wait {
* @unsent_req_list: accounting for requests that haven't been sent
* @req: current request being processed (if any)
* @tmp_buf: temporary buffer to read in header
- * @rsize: amount to read for current frame
- * @rpos: read position in current frame
- * @rbuf: current read buffer
+ * @rc: temporary fcall for reading current frame
* @wpos: write position for current frame
* @wsize: amount of data to write for current frame
* @wbuf: current write buffer
@@ -131,9 +129,7 @@ struct p9_conn {
struct list_head unsent_req_list;
struct p9_req_t *req;
char tmp_buf[7];
- int rsize;
- int rpos;
- char *rbuf;
+ struct p9_fcall rc;
int wpos;
int wsize;
char *wbuf;
@@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work)
if (m->err < 0)
return;
- p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos);
+ p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
- if (!m->rbuf) {
- m->rbuf = m->tmp_buf;
- m->rpos = 0;
- m->rsize = 7; /* start by reading header */
+ if (!m->rc.sdata) {
+ m->rc.sdata = m->tmp_buf;
+ m->rc.offset = 0;
+ m->rc.capacity = 7; /* start by reading header */
}
clear_bit(Rpending, &m->wsched);
- p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n",
- m, m->rpos, m->rsize, m->rsize-m->rpos);
- err = p9_fd_read(m->client, m->rbuf + m->rpos,
- m->rsize - m->rpos);
+ p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
+ m, m->rc.offset, m->rc.capacity,
+ m->rc.capacity - m->rc.offset);
+ err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
+ m->rc.capacity - m->rc.offset);
p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
- if (err == -EAGAIN) {
+ if (err == -EAGAIN)
goto end_clear;
- }
if (err <= 0)
goto error;
- m->rpos += err;
+ m->rc.offset += err;
- if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */
- u16 tag;
+ /* header read in */
+ if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new header\n");
- n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */
- if (n >= m->client->msize) {
+ err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR,
+ "error parsing header: %d\n", err);
+ goto error;
+ }
+
+ if (m->rc.size >= m->client->msize) {
p9_debug(P9_DEBUG_ERROR,
- "requested packet size too big: %d\n", n);
+ "requested packet size too big: %d\n",
+ m->rc.size);
err = -EIO;
goto error;
}
- tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
p9_debug(P9_DEBUG_TRANS,
- "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
+ "mux %p pkt: size: %d bytes tag: %d\n",
+ m, m->rc.size, m->rc.tag);
- m->req = p9_tag_lookup(m->client, tag);
+ m->req = p9_tag_lookup(m->client, m->rc.tag);
if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
- tag);
+ m->rc.tag);
err = -EIO;
goto error;
}
if (m->req->rc == NULL) {
- m->req->rc = kmalloc(sizeof(struct p9_fcall) +
- m->client->msize, GFP_NOFS);
- if (!m->req->rc) {
- m->req = NULL;
- err = -ENOMEM;
- goto error;
- }
+ p9_debug(P9_DEBUG_ERROR,
+ "No recv fcall for tag %d (req %p), disconnecting!\n",
+ m->rc.tag, m->req);
+ m->req = NULL;
+ err = -EIO;
+ goto error;
}
- m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall);
- memcpy(m->rbuf, m->tmp_buf, m->rsize);
- m->rsize = n;
+ m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall);
+ memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
+ m->rc.capacity = m->rc.size;
}
- /* not an else because some packets (like clunk) have no payload */
- if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */
+ /* packet is read in
+ * not an else because some packets (like clunk) have no payload
+ */
+ if ((m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
spin_lock(&m->client->lock);
if (m->req->status != REQ_STATUS_ERROR)
@@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work)
list_del(&m->req->req_list);
spin_unlock(&m->client->lock);
p9_client_cb(m->client, m->req, status);
- m->rbuf = NULL;
- m->rpos = 0;
- m->rsize = 0;
+ m->rc.sdata = NULL;
+ m->rc.offset = 0;
+ m->rc.capacity = 0;
m->req = NULL;
}
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 199bc76202d2..4acb1d5417aa 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
mutex_unlock(&virtio_9p_lock);
if (!found) {
- pr_err("no channels available\n");
+ pr_err("no channels available for device %s\n", devname);
return ret;
}
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 10d87753ed87..9e43a315e662 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
void *ticket_buf = NULL;
void *tp, *tpend;
void **ptp;
- struct ceph_timespec new_validity;
struct ceph_crypto_key new_session_key;
struct ceph_buffer *new_ticket_blob;
unsigned long new_expires, new_renew_after;
@@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
- ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
- ceph_decode_timespec(&validity, &new_validity);
+ ceph_decode_timespec(&validity, dp);
+ dp += sizeof(struct ceph_timespec);
new_expires = get_seconds() + validity.tv_sec;
new_renew_after = new_expires - (validity.tv_sec / 4);
dout(" expires=%lu renew_after=%lu\n", new_expires,
@@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
ceph_buffer_put(th->ticket_blob);
th->session_key = new_session_key;
th->ticket_blob = new_ticket_blob;
- th->validity = new_validity;
th->secret_id = new_secret_id;
th->expires = new_expires;
th->renew_after = new_renew_after;
+ th->have_key = true;
dout(" got ticket service %d (%s) secret_id %lld len %d\n",
type, ceph_entity_type_name(type), th->secret_id,
(int)th->ticket_blob->vec.iov_len);
@@ -384,6 +383,24 @@ bad:
return -ERANGE;
}
+static bool need_key(struct ceph_x_ticket_handler *th)
+{
+ if (!th->have_key)
+ return true;
+
+ return get_seconds() >= th->renew_after;
+}
+
+static bool have_key(struct ceph_x_ticket_handler *th)
+{
+ if (th->have_key) {
+ if (get_seconds() >= th->expires)
+ th->have_key = false;
+ }
+
+ return th->have_key;
+}
+
static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
{
int want = ac->want_keys;
@@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
continue;
th = get_ticket_handler(ac, service);
-
if (IS_ERR(th)) {
*pneed |= service;
continue;
}
- if (get_seconds() >= th->renew_after)
+ if (need_key(th))
*pneed |= service;
- if (get_seconds() >= th->expires)
+ if (!have_key(th))
xi->have_keys &= ~service;
}
}
-
static int ceph_x_build_request(struct ceph_auth_client *ac,
void *buf, void *end)
{
@@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
ac->private = NULL;
}
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
- int peer_type)
+static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
{
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
if (!IS_ERR(th))
- memset(&th->validity, 0, sizeof(th->validity));
+ th->have_key = false;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ /*
+ * We are to invalidate a service ticket in the hopes of
+ * getting a new, hopefully more valid, one. But, we won't get
+ * it unless our AUTH ticket is good, so invalidate AUTH ticket
+ * as well, just in case.
+ */
+ invalidate_ticket(ac, peer_type);
+ invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
}
static int calcu_signature(struct ceph_x_authorizer *au,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index e8b7c6917d47..40b1a3cf7397 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -16,7 +16,7 @@ struct ceph_x_ticket_handler {
unsigned int service;
struct ceph_crypto_key session_key;
- struct ceph_timespec validity;
+ bool have_key;
u64 secret_id;
struct ceph_buffer *ticket_blob;
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 393bfb22d5bb..5fcfb98f309e 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries
* @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
* @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_fallback_retries,
int recurse_to_leaf,
unsigned int vary_r,
+ unsigned int stable,
int *out2,
int parent_r)
{
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
int collide, reject;
int count = out_size;
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep,
tries, recurse_tries, local_retries, local_fallback_retries,
- parent_r);
+ parent_r, stable);
- for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+ for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
if (crush_choose_firstn(map,
map->buckets[-1-item],
weight, weight_max,
- x, outpos+1, 0,
+ x, stable ? 1 : outpos+1, 0,
out2, outpos, count,
recurse_tries, 0,
local_retries,
local_fallback_retries,
0,
vary_r,
+ stable,
NULL,
sub_r) <= outpos)
/* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_fallback_retries = map->choose_local_fallback_tries;
int vary_r = map->chooseleaf_vary_r;
+ int stable = map->chooseleaf_stable;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
case CRUSH_RULE_TAKE:
if ((curstep->arg1 >= 0 &&
curstep->arg1 < map->max_devices) ||
- (-1-curstep->arg1 < map->max_buckets &&
+ (-1-curstep->arg1 >= 0 &&
+ -1-curstep->arg1 < map->max_buckets &&
map->buckets[-1-curstep->arg1])) {
w[0] = curstep->arg1;
wsize = 1;
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
vary_r = curstep->arg1;
break;
+ case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+ if (curstep->arg1 >= 0)
+ stable = curstep->arg1;
+ break;
+
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
osize = 0;
for (i = 0; i < wsize; i++) {
+ int bno;
/*
* see CRUSH_N, CRUSH_N_MINUS macros.
* basically, numrep <= 0 means relative to
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
continue;
}
j = 0;
+ /* make sure bucket id is valid */
+ bno = -1 - w[i];
+ if (bno < 0 || bno >= map->max_buckets) {
+ /* w[i] is probably CRUSH_ITEM_NONE */
+ dprintk(" bad w[i] %d\n", w[i]);
+ continue;
+ }
if (firstn) {
int recurse_tries;
if (choose_leaf_tries)
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
recurse_tries = choose_tries;
osize += crush_choose_firstn(
map,
- map->buckets[-1-w[i]],
+ map->buckets[bno],
weight, weight_max,
x, numrep,
curstep->arg2,
@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
choose_local_fallback_retries,
recurse_to_leaf,
vary_r,
+ stable,
c+osize,
0);
} else {
@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map,
numrep : (result_max-osize));
crush_choose_indep(
map,
- map->buckets[-1-w[i]],
+ map->buckets[bno],
weight, weight_max,
x, out_size, numrep,
curstep->arg2,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9981039ef4ff..9cfedf565f5b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -23,9 +23,6 @@
#include <linux/ceph/pagelist.h>
#include <linux/export.h>
-#define list_entry_next(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
/*
* Ceph uses the messenger to exchange ceph_msg messages with other
* hosts in the system. The messenger provides ordered and reliable
@@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con)
}
con->in_seq = 0;
con->in_seq_acked = 0;
+
+ con->out_skip = 0;
}
/*
@@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
static void con_out_kvec_reset(struct ceph_connection *con)
{
+ BUG_ON(con->out_skip);
+
con->out_kvec_left = 0;
con->out_kvec_bytes = 0;
con->out_kvec_cur = &con->out_kvec[0];
@@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con)
static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data)
{
- int index;
+ int index = con->out_kvec_left;
- index = con->out_kvec_left;
+ BUG_ON(con->out_skip);
BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
con->out_kvec[index].iov_len = size;
@@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con,
con->out_kvec_bytes += size;
}
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int off = con->out_kvec_cur - con->out_kvec;
+ int skip = 0;
+
+ if (con->out_kvec_bytes > 0) {
+ skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+ BUG_ON(con->out_kvec_bytes < skip);
+ BUG_ON(!con->out_kvec_left);
+ con->out_kvec_bytes -= skip;
+ con->out_kvec_left--;
+ }
+
+ return skip;
+}
+
#ifdef CONFIG_BLOCK
/*
@@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
/* Move on to the next page */
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
- cursor->page = list_entry_next(cursor->page, lru);
+ cursor->page = list_next_entry(cursor->page, lru);
cursor->last_piece = cursor->resid <= PAGE_SIZE;
return true;
@@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
if (!cursor->resid && cursor->total_resid) {
WARN_ON(!cursor->last_piece);
BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
- cursor->data = list_entry_next(cursor->data, links);
+ cursor->data = list_next_entry(cursor->data, links);
__ceph_msg_data_cursor_init(cursor);
new_piece = true;
}
@@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con)
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
- con->out_kvec_is_msg = true;
con->out_kvec[v].iov_base = &m->footer;
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
@@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con)
u32 crc;
con_out_kvec_reset(con);
- con->out_kvec_is_msg = true;
con->out_msg_done = false;
/* Sneak an ack in there first? If we can get it into the same
@@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con)
/* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+ con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle)
con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base);
- /* fill in crc (except data pages), footer */
+ /* fill in hdr crc and finalize hdr */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc);
- con->out_msg->footer.flags = 0;
+ memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+ /* fill in front and middle crc, footer */
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc);
if (m->middle) {
@@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con)
dout("%s front_crc %u middle_crc %u\n", __func__,
le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc));
+ con->out_msg->footer.flags = 0;
/* is there a data payload? */
con->out_msg->footer.data_crc = 0;
@@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con)
}
}
con->out_kvec_left = 0;
- con->out_kvec_is_msg = false;
ret = 1;
out:
dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
@@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con)
{
int ret;
+ dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) {
size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
@@ -2506,13 +2528,13 @@ more:
more_kvec:
/* kvec data queued? */
- if (con->out_skip) {
- ret = write_partial_skip(con);
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
if (ret <= 0)
goto out;
}
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
if (ret <= 0)
goto out;
}
@@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con)
static void con_fault_finish(struct ceph_connection *con)
{
+ dout("%s %p\n", __func__, con);
+
/*
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->auth_retry && con->ops->invalidate_authorizer) {
- dout("calling invalidate_authorizer()\n");
- con->ops->invalidate_authorizer(con);
+ if (con->auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->auth_retry);
+ if (con->ops->invalidate_authorizer)
+ con->ops->invalidate_authorizer(con);
+ con->auth_retry = 0;
}
if (con->ops->fault)
@@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg)
ceph_msg_put(msg);
}
if (con->out_msg == msg) {
- dout("%s %p msg %p - was sending\n", __func__, con, msg);
- con->out_msg = NULL;
- if (con->out_kvec_is_msg) {
- con->out_skip = con->out_kvec_bytes;
- con->out_kvec_is_msg = false;
+ BUG_ON(con->out_skip);
+ /* footer */
+ if (con->out_msg_done) {
+ con->out_skip += con_out_kvec_skip(con);
+ } else {
+ BUG_ON(!msg->data_length);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
+ con->out_skip += sizeof(msg->footer);
+ else
+ con->out_skip += sizeof(msg->old_footer);
}
+ /* data, middle, front */
+ if (msg->data_length)
+ con->out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->out_skip += con_out_kvec_skip(con);
+ con->out_skip += con_out_kvec_skip(con);
+
+ dout("%s %p msg %p - was sending, will write %d skip %d\n",
+ __func__, con, msg, con->out_kvec_bytes, con->out_skip);
msg->hdr.seq = 0;
-
+ con->out_msg = NULL;
ceph_msg_put(msg);
}
+
mutex_unlock(&con->mutex);
}
@@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m)
static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
- LIST_HEAD(data);
- struct list_head *links;
- struct list_head *next;
+ struct ceph_msg_data *data, *next;
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
@@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref)
m->middle = NULL;
}
- list_splice_init(&m->data, &data);
- list_for_each_safe(links, next, &data) {
- struct ceph_msg_data *data;
-
- data = list_entry(links, struct ceph_msg_data, links);
- list_del_init(links);
+ list_for_each_entry_safe(data, next, &m->data, links) {
+ list_del_init(&data->links);
ceph_msg_data_destroy(data);
}
m->data_length = 0;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index edda01626a45..de85dddc3dc0 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc)
return monc->client->have_fsid && monc->auth->global_id > 0;
}
-/*
- * The monitor responds with mount ack indicate mount success. The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f8f235930d88..3534e12683d3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
u32 osdmap_epoch;
int already_completed;
u32 bytes;
+ u8 decode_redir;
unsigned int i;
tid = le64_to_cpu(msg->hdr.tid);
@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
p += 8 + 4; /* skip replay_version */
p += 8; /* skip user_version */
+ if (le16_to_cpu(msg->hdr.version) >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+ else
+ decode_redir = 1;
+ } else {
+ decode_redir = 0;
+ }
+
+ if (decode_redir) {
err = ceph_redirect_decode(&p, end, &redir);
if (err)
goto bad_put;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7d8f581d9f1f..243574c8cf33 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
c->choose_local_tries = ceph_decode_32(p);
c->choose_local_fallback_tries = ceph_decode_32(p);
c->choose_total_tries = ceph_decode_32(p);
- dout("crush decode tunable choose_local_tries = %d",
+ dout("crush decode tunable choose_local_tries = %d\n",
c->choose_local_tries);
- dout("crush decode tunable choose_local_fallback_tries = %d",
+ dout("crush decode tunable choose_local_fallback_tries = %d\n",
c->choose_local_fallback_tries);
- dout("crush decode tunable choose_total_tries = %d",
+ dout("crush decode tunable choose_total_tries = %d\n",
c->choose_total_tries);
ceph_decode_need(p, end, sizeof(u32), done);
c->chooseleaf_descend_once = ceph_decode_32(p);
- dout("crush decode tunable chooseleaf_descend_once = %d",
+ dout("crush decode tunable chooseleaf_descend_once = %d\n",
c->chooseleaf_descend_once);
ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_vary_r = ceph_decode_8(p);
- dout("crush decode tunable chooseleaf_vary_r = %d",
+ dout("crush decode tunable chooseleaf_vary_r = %d\n",
c->chooseleaf_vary_r);
+ /* skip straw_calc_version, allowed_bucket_algs */
+ ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
+ *p += sizeof(u8) + sizeof(u32);
+
+ ceph_decode_need(p, end, sizeof(u8), done);
+ c->chooseleaf_stable = ceph_decode_8(p);
+ dout("crush decode tunable chooseleaf_stable = %d\n",
+ c->chooseleaf_stable);
+
done:
dout("crush_decode success\n");
return c;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d79699c9d1b9..eab81bc80e5c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -208,7 +208,6 @@ ip:
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
- __be32 flow_label;
ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
@@ -230,8 +229,12 @@ ipv6:
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- flow_label = ip6_flowlabel(iph);
- if (flow_label) {
+ if ((dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+ (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+ ip6_flowlabel(iph)) {
+ __be32 flow_label = ip6_flowlabel(iph);
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
key_tags = skb_flow_dissector_target(flow_dissector,
diff --git a/net/core/scm.c b/net/core/scm.c
index 14596fb37172..2696aefdc148 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
}
fpp = &fpl->fp[fpl->count];
@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fpp++ = file;
fpl->count++;
}
+
+ if (!fpl->user)
+ fpl->user = get_uid(current_user());
+
return num;
}
@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+ free_uid(fpl->user);
kfree(fpl);
}
}
@@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
}
return new_fpl;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2df375ec9c2..5bf88f58bee7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,8 @@
struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
/**
* skb_panic - private function for out-of-line support
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 95b6139d710c..a6beb7b6ae55 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,7 @@ static int zero = 0;
static int one = 1;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "max_skb_frags",
+ .data = &sysctl_max_skb_frags,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &max_skb_frags,
+ },
{ }
};
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c29809f765dc..62c049b647e9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 22e73171ea63..d07fc076bea0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->tn_bits <= TNODE_KMALLOC_MAX)
- kfree(n);
else
- vfree(n);
+ kvfree(n);
}
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7c51c4e1661f..56fdf4e0dce4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1240,6 +1240,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
err = ipgre_newlink(net, dev, tb, NULL);
if (err < 0)
goto out;
+
+ /* openvswitch users expect packet sizes to be unrestricted,
+ * so set the largest MTU we can.
+ */
+ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+ if (err)
+ goto out;
+
return dev;
out:
free_netdev(dev);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5f73a7c03e27..a50124260f5a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+
+ /* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
err < 40 ? err : 40);
if (err)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c7bd72e9b544..89e8861e05fc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -943,17 +943,31 @@ done:
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
-int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ if (new_mtu < 68)
return -EINVAL;
+
+ if (new_mtu > max_mtu) {
+ if (strict)
+ return -EINVAL;
+
+ new_mtu = max_mtu;
+ }
+
dev->mtu = new_mtu;
return 0;
}
+EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ return __ip_tunnel_change_mtu(dev, new_mtu, true);
+}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index c117b21b937d..d3a27165f9cc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -746,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bc35f1842512..7113bae4e6a0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -547,8 +547,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(net, msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
goto out;
+ }
if (ipc.opt)
free = 1;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 46ce410703b1..4d367b4139a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,7 +24,6 @@
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
#include <net/ping.h>
-#include <net/tcp_memcontrol.h>
static int zero;
static int one = 1;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 19746b3fcbbe..0c36ef4a3f86 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -940,7 +940,7 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1213,7 +1213,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i == MAX_SKB_FRAGS || !sg) {
+ if (i == sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2a67244f97ca..7f6ff037adaf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,7 +73,6 @@
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
-#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
@@ -312,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
-void tcp_req_err(struct sock *sk, u32 seq)
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
struct request_sock *req = inet_reqsk(sk);
struct net *net = sock_net(sk);
@@ -324,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- } else {
+ } else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
@@ -384,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
seq = ntohl(th->seq);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq,
+ type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index 18bc7f745e9c..000000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,200 +0,0 @@
-#include <net/tcp.h>
-#include <net/tcp_memcontrol.h>
-#include <net/sock.h>
-#include <net/ip.h>
-#include <linux/nsproxy.h>
-#include <linux/memcontrol.h>
-#include <linux/module.h>
-
-int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct page_counter *counter_parent = NULL;
- /*
- * The root cgroup does not use page_counters, but rather,
- * rely on the data already collected by the network
- * subsystem
- */
- if (memcg == root_mem_cgroup)
- return 0;
-
- memcg->tcp_mem.memory_pressure = 0;
-
- if (parent)
- counter_parent = &parent->tcp_mem.memory_allocated;
-
- page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent);
-
- return 0;
-}
-
-void tcp_destroy_cgroup(struct mem_cgroup *memcg)
-{
- if (memcg == root_mem_cgroup)
- return;
-
- if (memcg->tcp_mem.active)
- static_branch_dec(&memcg_sockets_enabled_key);
-}
-
-static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- int ret;
-
- if (memcg == root_mem_cgroup)
- return -EINVAL;
-
- ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages);
- if (ret)
- return ret;
-
- if (!memcg->tcp_mem.active) {
- /*
- * The active flag needs to be written after the static_key
- * update. This is what guarantees that the socket activation
- * function is the last one to run. See sock_update_memcg() for
- * details, and note that we don't mark any socket as belonging
- * to this memcg until that flag is up.
- *
- * We need to do this, because static_keys will span multiple
- * sites, but we can't control their order. If we mark a socket
- * as accounted, but the accounting functions are not patched in
- * yet, we'll lose accounting.
- *
- * We never race with the readers in sock_update_memcg(),
- * because when this value change, the code to process it is not
- * patched in yet.
- */
- static_branch_inc(&memcg_sockets_enabled_key);
- memcg->tcp_mem.active = true;
- }
-
- return 0;
-}
-
-enum {
- RES_USAGE,
- RES_LIMIT,
- RES_MAX_USAGE,
- RES_FAILCNT,
-};
-
-static DEFINE_MUTEX(tcp_limit_mutex);
-
-static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
- int ret = 0;
-
- buf = strstrip(buf);
-
- switch (of_cft(of)->private) {
- case RES_LIMIT:
- /* see memcontrol.c */
- ret = page_counter_memparse(buf, "-1", &nr_pages);
- if (ret)
- break;
- mutex_lock(&tcp_limit_mutex);
- ret = tcp_update_limit(memcg, nr_pages);
- mutex_unlock(&tcp_limit_mutex);
- break;
- default:
- ret = -EINVAL;
- break;
- }
- return ret ?: nbytes;
-}
-
-static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- u64 val;
-
- switch (cft->private) {
- case RES_LIMIT:
- if (memcg == root_mem_cgroup)
- val = PAGE_COUNTER_MAX;
- else
- val = memcg->tcp_mem.memory_allocated.limit;
- val *= PAGE_SIZE;
- break;
- case RES_USAGE:
- if (memcg == root_mem_cgroup)
- val = atomic_long_read(&tcp_memory_allocated);
- else
- val = page_counter_read(&memcg->tcp_mem.memory_allocated);
- val *= PAGE_SIZE;
- break;
- case RES_FAILCNT:
- if (memcg == root_mem_cgroup)
- return 0;
- val = memcg->tcp_mem.memory_allocated.failcnt;
- break;
- case RES_MAX_USAGE:
- if (memcg == root_mem_cgroup)
- return 0;
- val = memcg->tcp_mem.memory_allocated.watermark;
- val *= PAGE_SIZE;
- break;
- default:
- BUG();
- }
- return val;
-}
-
-static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_from_css(of_css(of));
- if (memcg == root_mem_cgroup)
- return nbytes;
-
- switch (of_cft(of)->private) {
- case RES_MAX_USAGE:
- page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated);
- break;
- case RES_FAILCNT:
- memcg->tcp_mem.memory_allocated.failcnt = 0;
- break;
- }
-
- return nbytes;
-}
-
-static struct cftype tcp_files[] = {
- {
- .name = "kmem.tcp.limit_in_bytes",
- .write = tcp_cgroup_write,
- .read_u64 = tcp_cgroup_read,
- .private = RES_LIMIT,
- },
- {
- .name = "kmem.tcp.usage_in_bytes",
- .read_u64 = tcp_cgroup_read,
- .private = RES_USAGE,
- },
- {
- .name = "kmem.tcp.failcnt",
- .private = RES_FAILCNT,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- {
- .name = "kmem.tcp.max_usage_in_bytes",
- .private = RES_MAX_USAGE,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- { } /* terminate */
-};
-
-static int __init tcp_memcontrol_init(void)
-{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
- return 0;
-}
-__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index be0b21852b13..95d2f198017e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1048,8 +1048,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc,
sk->sk_family == AF_INET6);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
connected = 0;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 38eeddedfc21..9efd9ffdc34c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3538,6 +3538,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
+ bool notify = false;
addrconf_join_solict(dev, &ifp->addr);
@@ -3583,7 +3584,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
/* Because optimistic nodes can use this address,
* notify listeners. If DAD fails, RTM_DELADDR is sent.
*/
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ notify = true;
}
}
@@ -3591,6 +3592,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
out:
spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
+ if (notify)
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
static void addrconf_dad_start(struct inet6_ifaddr *ifp)
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1f9ebe3cbb4a..dc2db4f7b182 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
}
spin_lock_bh(&ip6_sk_fl_lock);
for (sflp = &np->ipv6_fl_list;
- (sfl = rcu_dereference(*sflp)) != NULL;
+ (sfl = rcu_dereference_protected(*sflp,
+ lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
sflp = &sfl->next) {
if (sfl->fl->label == freq.flr_label) {
if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = rcu_dereference(sfl->next);
+ *sflp = sfl->next;
spin_unlock_bh(&ip6_sk_fl_lock);
fl_release(sfl->fl);
kfree_rcu(sfl, rcu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4ad8edb46f7c..1a5a70fb8551 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -61,7 +61,6 @@
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
-#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
#include <linux/proc_fs.h>
@@ -328,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct tcp_sock *tp;
__u32 seq, snd_una;
struct sock *sk;
+ bool fatal;
int err;
sk = __inet6_lookup_established(net, &tcp_hashinfo,
@@ -346,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return;
}
seq = ntohl(th->seq);
+ fatal = icmpv6_err_convert(type, code, &err);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq, fatal);
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -401,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- icmpv6_err_convert(type, code, &err);
/* Might be for an request_sock */
switch (sk->sk_state) {
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index abbdff03ce92..3e24d0ddb51b 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -91,7 +91,7 @@ static const struct file_operations reset_ops = {
};
#endif
-static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
+static const char *hw_flag_names[] = {
#define FLAG(F) [IEEE80211_HW_##F] = #F
FLAG(HAS_RATE_CONTROL),
FLAG(RX_INCLUDES_FCS),
@@ -126,9 +126,6 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
FLAG(SUPPORTS_AMSDU_IN_AMPDU),
FLAG(BEACON_TX_STATUS),
FLAG(NEEDS_UNIQUE_STA_ADDR),
-
- /* keep last for the build bug below */
- (void *)0x1
#undef FLAG
};
@@ -148,7 +145,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
/* fail compilation if somebody adds or removes
* a flag without updating the name array above
*/
- BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1);
+ BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS);
for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) {
if (test_bit(i, local->hw.flags))
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 1605691d9414..de9cb19efb6a 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -91,6 +91,8 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
struct vxlan_config conf = {
.no_share = true,
.flags = VXLAN_F_COLLECT_METADATA,
+ /* Don't restrict the packets that can be sent by MTU */
+ .mtu = IP_MAX_MTU,
};
if (!options) {
diff --git a/net/rds/ib.c b/net/rds/ib.c
index f222885ac0c7..9481d55ff6cb 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
static void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- struct ib_device_attr *dev_attr;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
return;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
- goto free_attr;
+ return;
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
- rds_ibdev->max_wrs = dev_attr->max_qp_wr;
- rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->max_wrs = device->attrs.max_qp_wr;
+ rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
- rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
- rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, (dev_attr->max_mr / 2),
+ rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
+ rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+ min_t(unsigned int, (device->attrs.max_mr / 2),
rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
- rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
+ rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
- rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
- rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+ rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
@@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device)
}
rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
- dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
rds_ibdev->max_8k_fmrs);
@@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device)
put_dev:
rds_ib_dev_put(rds_ibdev);
-free_attr:
- kfree(dev_attr);
}
/*
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 576f1825fc55..f4a9fff829e0 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns);
static void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
- struct ib_device_attr *dev_attr;
/* Only handle iwarp devices */
if (device->node_type != RDMA_NODE_RNIC)
return;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
if (!rds_iwdev)
- goto free_attr;
+ return;
spin_lock_init(&rds_iwdev->spinlock);
- rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
- rds_iwdev->max_wrs = dev_attr->max_qp_wr;
- rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+ rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+ rds_iwdev->max_wrs = device->attrs.max_qp_wr;
+ rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
@@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device)
list_add_tail(&rds_iwdev->list, &rds_iw_devices);
ib_set_client_data(device, &rds_iw_client, rds_iwdev);
-
- goto free_attr;
+ return;
err_mr:
if (rds_iwdev->mr)
@@ -121,8 +110,6 @@ err_pd:
ib_dealloc_pd(rds_iwdev->pd);
free_dev:
kfree(rds_iwdev);
-free_attr:
- kfree(dev_attr);
}
static void rds_iw_remove_one(struct ib_device *device, void *client_data)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5ca2ebfe0be8..e878da0949db 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
struct sctp_hmac_algo_param *hmacs;
__u16 data_len = 0;
u32 num_idents;
+ int i;
if (!ep->auth_enable)
return -EACCES;
@@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
return -EFAULT;
if (put_user(num_idents, &p->shmac_num_idents))
return -EFAULT;
- if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len))
- return -EFAULT;
+ for (i = 0; i < num_idents; i++) {
+ __u16 hmacid = ntohs(hmacs->hmac_ids[i]);
+
+ if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16)))
+ return -EFAULT;
+ }
return 0;
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 5e4f815c2b34..2b32fd602669 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
if (count == 0)
return 0;
- mutex_lock(&inode->i_mutex); /* protect against multiple concurrent
+ inode_lock(inode); /* protect against multiple concurrent
* readers on this file */
again:
spin_lock(&queue_lock);
@@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
}
if (rp->q.list.next == &cd->queue) {
spin_unlock(&queue_lock);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
WARN_ON_ONCE(rp->offset);
return 0;
}
@@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
}
if (err == -EAGAIN)
goto again;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err ? err : count;
}
@@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf,
if (!cd->cache_parse)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = cache_downcall(mapping, buf, count, cd);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return ret;
}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 14f45bf0410c..31789ef3e614 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode)
int need_release;
LIST_HEAD(free_list);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
spin_lock(&pipe->lock);
need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
pipe->nreaders = 0;
@@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode)
cancel_delayed_work_sync(&pipe->queue_timeout);
rpc_inode_setowner(inode, NULL);
RPC_I(inode)->pipe = NULL;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
static struct inode *
@@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
int first_open;
int res = -ENXIO;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL)
goto out;
@@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
pipe->nwriters++;
res = 0;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
struct rpc_pipe_msg *msg;
int last_close;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL)
goto out;
@@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
if (last_close && pipe->ops->release_pipe)
pipe->ops->release_pipe(inode);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
@@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
struct rpc_pipe_msg *msg;
int res = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL) {
res = -EPIPE;
@@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
pipe->ops->destroy_msg(msg);
}
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
struct inode *inode = file_inode(filp);
int res;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
res = -EPIPE;
if (RPC_I(inode)->pipe != NULL)
res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
poll_wait(filp, &rpci->waitq, wait);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (rpci->pipe == NULL)
mask |= POLLERR | POLLHUP;
else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
mask |= POLLIN | POLLRDNORM;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return mask;
}
@@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FIONREAD:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -EPIPE;
}
spin_lock(&pipe->lock);
@@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
len += msg->len - msg->copied;
}
spin_unlock(&pipe->lock);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return put_user(len, (int __user *)arg);
default:
return -EINVAL;
@@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry)
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
error = __rpc_rmdir(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
@@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent,
{
struct inode *dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(dir, I_MUTEX_CHILD);
__rpc_depopulate(parent, files, start, eof);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
}
static int rpc_populate(struct dentry *parent,
@@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent,
struct dentry *dentry;
int i, err;
- mutex_lock(&dir->i_mutex);
+ inode_lock(dir);
for (i = start; i < eof; i++) {
dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
err = PTR_ERR(dentry);
@@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent,
if (err != 0)
goto out_bad;
}
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return 0;
out_bad:
__rpc_depopulate(parent, files, start, eof);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
__FILE__, __func__, parent);
return err;
@@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
struct inode *dir = d_inode(parent);
int error;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
dentry = __rpc_lookup_create_exclusive(parent, name);
if (IS_ERR(dentry))
goto out;
@@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
goto err_rmdir;
}
out:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return dentry;
err_rmdir:
__rpc_rmdir(dir, dentry);
@@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
if (depopulate != NULL)
depopulate(dentry);
error = __rpc_rmdir(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
@@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (pipe->ops->downcall == NULL)
umode &= ~S_IWUGO;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
dentry = __rpc_lookup_create_exclusive(parent, name);
if (IS_ERR(dentry))
goto out;
@@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (err)
goto out_err;
out:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return dentry;
out_err:
dentry = ERR_PTR(err);
@@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry)
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
error = __rpc_rmpipe(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e98f4a243e5..37edea6fa92d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt)
if (atomic_dec_and_test(&xprt->count))
xprt_destroy(xprt);
}
+EXPORT_SYMBOL_GPL(xprt_put);
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 33f99d3004f2..dc9f3b513a05 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o physical_ops.o \
- svc_rdma.o svc_rdma_transport.o \
+ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c6836844bd0e..e16567389e28 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -190,12 +190,11 @@ static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
int depth, delta;
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- devattr->max_fast_reg_page_list_len);
+ ia->ri_device->attrs.max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
- cdata->max_requests = devattr->max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
+ cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 1b7051bdbdc8..c846ca9f1eba 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD;
static unsigned int min_ord = 1;
static unsigned int max_ord = 4096;
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
@@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
-/* Temporary NFS request map and context caches */
-struct kmem_cache *svc_rdma_map_cachep;
-struct kmem_cache *svc_rdma_ctxt_cachep;
-
struct workqueue_struct *svc_rdma_wq;
/*
@@ -243,17 +240,16 @@ void svc_rdma_cleanup(void)
svc_unreg_xprt_class(&svc_rdma_bc_class);
#endif
svc_unreg_xprt_class(&svc_rdma_class);
- kmem_cache_destroy(svc_rdma_map_cachep);
- kmem_cache_destroy(svc_rdma_ctxt_cachep);
}
int svc_rdma_init(void)
{
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
- dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %d\n",
+ dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
+ dprintk("\tsq_depth : %u\n",
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+ dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
@@ -264,39 +260,10 @@ int svc_rdma_init(void)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
- /* Create the temporary map cache */
- svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
- sizeof(struct svc_rdma_req_map),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_map_cachep) {
- printk(KERN_INFO "Could not allocate map cache.\n");
- goto err0;
- }
-
- /* Create the temporary context cache */
- svc_rdma_ctxt_cachep =
- kmem_cache_create("svc_rdma_ctxt_cache",
- sizeof(struct svc_rdma_op_ctxt),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_ctxt_cachep) {
- printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
- goto err1;
- }
-
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
svc_reg_xprt_class(&svc_rdma_bc_class);
#endif
return 0;
- err1:
- kmem_cache_destroy(svc_rdma_map_cachep);
- err0:
- unregister_sysctl_table(svcrdma_table_header);
- destroy_workqueue(svc_rdma_wq);
- return -ENOMEM;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644
index 000000000000..65a7c232a345
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA (server-side).
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+#undef SVCRDMA_BACKCHANNEL_DEBUG
+
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+ struct xdr_buf *rcvbuf)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct kvec *dst, *src = &rcvbuf->head[0];
+ struct rpc_rqst *req;
+ unsigned long cwnd;
+ u32 credits;
+ size_t len;
+ __be32 xid;
+ __be32 *p;
+ int ret;
+
+ p = (__be32 *)src->iov_base;
+ len = src->iov_len;
+ xid = rmsgp->rm_xid;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: xid=%08x, length=%zu\n",
+ __func__, be32_to_cpu(xid), len);
+ pr_info("%s: RPC/RDMA: %*ph\n",
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ pr_info("%s: RPC: %*ph\n",
+ __func__, (int)len, p);
+#endif
+
+ ret = -EAGAIN;
+ if (src->iov_len < 24)
+ goto out_shortreply;
+
+ spin_lock_bh(&xprt->transport_lock);
+ req = xprt_lookup_rqst(xprt, xid);
+ if (!req)
+ goto out_notfound;
+
+ dst = &req->rq_private_buf.head[0];
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+ if (dst->iov_len < len)
+ goto out_unlock;
+ memcpy(dst->iov_base, p, len);
+
+ credits = be32_to_cpu(rmsgp->rm_credit);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+ credits = r_xprt->rx_buf.rb_bc_max_requests;
+
+ cwnd = xprt->cwnd;
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(req->rq_task);
+
+ ret = 0;
+ xprt_complete_rqst(req->rq_task, rcvbuf->len);
+ rcvbuf->len = 0;
+
+out_unlock:
+ spin_unlock_bh(&xprt->transport_lock);
+out:
+ return ret;
+
+out_shortreply:
+ dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
+ xprt, src->iov_len);
+ goto out;
+
+out_notfound:
+ dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
+ xprt, be32_to_cpu(xid));
+
+ goto out_unlock;
+}
+
+/* Send a backwards direction RPC call.
+ *
+ * Caller holds the connection's mutex and has already marshaled
+ * the RPC/RDMA request.
+ *
+ * This is similar to svc_rdma_reply, but takes an rpc_rqst
+ * instead, does not support chunks, and avoids blocking memory
+ * allocation.
+ *
+ * XXX: There is still an opportunity to block in svc_rdma_send()
+ * if there are no SQ entries to post the Send. This may occur if
+ * the adapter has a small maximum SQ depth.
+ */
+static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
+ struct ib_send_wr send_wr;
+ int ret;
+
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, sndbuf, vec);
+ if (ret)
+ goto out_err;
+
+ /* Post a recv buffer to handle the reply for this request. */
+ ret = svc_rdma_post_recv(rdma, GFP_NOIO);
+ if (ret) {
+ pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n",
+ ret);
+ pr_err("svcrdma: closing transport %p.\n", rdma);
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ ret = -ENOTCONN;
+ goto out_err;
+ }
+
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
+ ctxt->count = 1;
+
+ ctxt->wr_op = IB_WR_SEND;
+ ctxt->direction = DMA_TO_DEVICE;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
+ ctxt->sge[0].length = sndbuf->len;
+ ctxt->sge[0].addr =
+ ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
+ sndbuf->len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+ atomic_inc(&rdma->sc_dma_used);
+
+ memset(&send_wr, 0, sizeof(send_wr));
+ send_wr.wr_id = (unsigned long)ctxt;
+ send_wr.sg_list = ctxt->sge;
+ send_wr.num_sge = 1;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+out_err:
+ svc_rdma_put_req_map(rdma, vec);
+ dprintk("svcrdma: %s returns %d\n", __func__, ret);
+ return ret;
+
+out_unmap:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ goto out_err;
+}
+
+/* Server-side transport endpoint wants a whole page for its send
+ * buffer. The client RPC code constructs the RPC header in this
+ * buffer before it invokes ->send_request.
+ *
+ * Returns NULL if there was a temporary allocation failure.
+ */
+static void *
+xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ struct page *page;
+
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+
+ /* Prevent an infinite loop: try to make this case work */
+ if (size > PAGE_SIZE)
+ WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
+ size);
+
+ page = alloc_page(RPCRDMA_DEF_GFP);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+static void
+xprt_rdma_bc_free(void *buffer)
+{
+ /* No-op: ctxt and page have already been freed. */
+}
+
+static int
+rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ int rc;
+
+ /* Space in the send buffer for an RPC/RDMA header is reserved
+ * via xprt->tsh_size.
+ */
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ headerp->rm_type = rdma_msg;
+ headerp->rm_body.rm_chunks[0] = xdr_zero;
+ headerp->rm_body.rm_chunks[1] = xdr_zero;
+ headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
+#endif
+
+ rc = svc_rdma_bc_sendto(rdma, rqst);
+ if (rc)
+ goto drop_connection;
+ return rc;
+
+drop_connection:
+ dprintk("svcrdma: failed to send bc call\n");
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN;
+}
+
+/* Send an RPC call on the passive end of a transport
+ * connection.
+ */
+static int
+xprt_rdma_bc_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ int ret;
+
+ dprintk("svcrdma: sending bc call with xid: %08x\n",
+ be32_to_cpu(rqst->rq_xid));
+
+ if (!mutex_trylock(&sxprt->xpt_mutex)) {
+ rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
+ if (!mutex_trylock(&sxprt->xpt_mutex))
+ return -EAGAIN;
+ rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
+ }
+
+ ret = -ENOTCONN;
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+ if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+
+ mutex_unlock(&sxprt->xpt_mutex);
+
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static void
+xprt_rdma_bc_close(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+}
+
+static void
+xprt_rdma_bc_put(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
+static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .alloc_slot = xprt_alloc_slot,
+ .release_request = xprt_release_rqst_cong,
+ .buf_alloc = xprt_rdma_bc_allocate,
+ .buf_free = xprt_rdma_bc_free,
+ .send_request = xprt_rdma_bc_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xprt_rdma_bc_close,
+ .destroy = xprt_rdma_bc_put,
+ .print_stats = xprt_rdma_print_stats
+};
+
+static const struct rpc_timeout xprt_rdma_bc_timeout = {
+ .to_initval = 60 * HZ,
+ .to_maxval = 60 * HZ,
+};
+
+/* It shouldn't matter if the number of backchannel session slots
+ * doesn't match the number of RPC/RDMA credits. That just means
+ * one or the other will have extra slots that aren't used.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma_bc(struct xprt_create *args)
+{
+ struct rpc_xprt *xprt;
+ struct rpcrdma_xprt *new_xprt;
+
+ if (args->addrlen > sizeof(xprt->addr)) {
+ dprintk("RPC: %s: address too large\n", __func__);
+ return ERR_PTR(-EBADF);
+ }
+
+ xprt = xprt_alloc(args->net, sizeof(*new_xprt),
+ RPCRDMA_MAX_BC_REQUESTS,
+ RPCRDMA_MAX_BC_REQUESTS);
+ if (!xprt) {
+ dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
+ __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ xprt->timeout = &xprt_rdma_bc_timeout;
+ xprt_set_bound(xprt);
+ xprt_set_connected(xprt);
+ xprt->bind_timeout = RPCRDMA_BIND_TO;
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+ xprt->prot = XPRT_TRANSPORT_BC_RDMA;
+ xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32);
+ xprt->ops = &xprt_rdma_bc_procs;
+
+ memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+ xprt->addrlen = args->addrlen;
+ xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
+ xprt->resvport = 0;
+
+ xprt->max_payload = xprt_rdma_max_inline_read;
+
+ new_xprt = rpcx_to_rdmax(xprt);
+ new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
+
+ xprt_get(xprt);
+ args->bc_xprt->xpt_bc_xprt = xprt;
+ xprt->bc_xprt = args->bc_xprt;
+
+ if (!try_module_get(THIS_MODULE))
+ goto out_fail;
+
+ /* Final put for backchannel xprt is in __svc_rdma_free */
+ xprt_get(xprt);
+ return xprt;
+
+out_fail:
+ xprt_rdma_free_addresses(xprt);
+ args->bc_xprt->xpt_bc_xprt = NULL;
+ xprt_put(xprt);
+ xprt_free(xprt);
+ return ERR_PTR(-EINVAL);
+}
+
+struct xprt_class xprt_rdma_bc = {
+ .list = LIST_HEAD_INIT(xprt_rdma_bc.list),
+ .name = "rdma backchannel",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_BC_RDMA,
+ .setup = xprt_setup_rdma_bc,
+};
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ff4f01e527ec..c8b8a8b4181e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
head->arg.page_len += len;
+
head->arg.len += len;
if (!pg_off)
head->count++;
@@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
goto err;
atomic_inc(&xprt->sc_dma_used);
- /* The lkey here is either a local dma lkey or a dma_mr lkey */
- ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[pno].length = len;
ctxt->count++;
@@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
return ret;
}
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+{
+ __be32 *p = (__be32 *)rmsgp;
+
+ if (!xprt->xpt_bc_xprt)
+ return false;
+
+ if (rmsgp->rm_type != rdma_msg)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != rmsgp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] == cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+
/*
* Set up the rqstp thread context to point to the RQ buffer. If
* necessary, pull additional data from the client with an RDMA_READ
@@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto close_out;
}
+ if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ &rqstp->rq_arg);
+ svc_rdma_put_context(ctxt, 0);
+ if (ret)
+ goto repost;
+ return ret;
+ }
+
/* Read read-list data. */
ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
if (ret > 0) {
@@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
set_bit(XPT_CLOSE, &xprt->xpt_flags);
defer:
return 0;
+
+repost:
+ ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL);
+ if (ret) {
+ pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
+ ret);
+ pr_err("svcrdma: closing transport %p.\n", rdma_xprt);
+ set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags);
+ ret = -ENOTCONN;
+ }
+ return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 969a1ab75fc3..df57f3ce6cd2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -50,9 +50,9 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-static int map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec)
+int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
{
int sge_no;
u32 sge_bytes;
@@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
if (xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+ pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
return -EIO;
}
@@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt,
sge_no++;
}
- dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+ dprintk("svcrdma: %s: sge_no %d page_no %d "
"page_base %u page_len %u head_len %zu tail_len %zu\n",
- sge_no, page_no, xdr->page_base, xdr->page_len,
+ __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
xdr->head[0].iov_len, xdr->tail[0].iov_len);
vec->count = sge_no;
@@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge[sge_no].addr))
goto err;
atomic_inc(&xprt->sc_dma_used);
- sge[sge_no].lkey = xprt->sc_dma_lkey;
+ sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count++;
sge_off = 0;
sge_no++;
@@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
int ret;
/* Post a recv buffer to handle another request. */
- ret = svc_rdma_post_recv(rdma);
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
if (ret) {
printk(KERN_INFO
"svcrdma: could not post a receive buffer, err=%d."
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->count = 1;
/* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
@@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[sge_no].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
if (byte_count != 0) {
@@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
/* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
- vec = svc_rdma_get_req_map();
- ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec);
if (ret)
goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
- res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ ret = -ENOMEM;
+ res_page = alloc_page(GFP_KERNEL);
+ if (!res_page)
+ goto err0;
rdma_resp = page_address(res_page);
reply_ary = svc_rdma_get_reply_array(rdma_argp);
if (reply_ary)
@@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
inline_bytes);
- svc_rdma_put_req_map(vec);
+ svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(vec);
+ svc_rdma_put_req_map(rdma, vec);
svc_rdma_put_context(ctxt, 0);
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b348b4adef29..5763825d09bf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt)
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
+ gfp_t flags)
{
struct svc_rdma_op_ctxt *ctxt;
- ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
- ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt = kmalloc(sizeof(*ctxt), flags);
+ if (ctxt) {
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->free);
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ }
+ return ctxt;
+}
+
+static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* Each RPC/RDMA credit can consume a number of send
+ * and receive WQEs. One ctxt is allocated for each.
+ */
+ i = xprt->sc_sq_depth + xprt->sc_rq_depth;
+
+ while (i--) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = alloc_ctxt(xprt, GFP_KERNEL);
+ if (!ctxt) {
+ dprintk("svcrdma: No memory for RDMA ctxt\n");
+ return false;
+ }
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ }
+ return true;
+}
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used++;
+ if (list_empty(&xprt->sc_ctxts))
+ goto out_empty;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del_init(&ctxt->free);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+out:
ctxt->count = 0;
ctxt->frmr = NULL;
- atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
+
+out_empty:
+ /* Either pre-allocation missed the mark, or send
+ * queue accounting is broken.
+ */
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+ ctxt = alloc_ctxt(xprt, GFP_NOIO);
+ if (ctxt)
+ goto out;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
+ return NULL;
}
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
@@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
- * the sc_dma_lkey, otherwise, ignore it since it is
+ * the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_page(xprt->sc_cm_id->device,
ctxt->sge[i].addr,
@@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
{
- struct svcxprt_rdma *xprt;
+ struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
- xprt = ctxt->xprt;
if (free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
- atomic_dec(&xprt->sc_ctxt_used);
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
}
-/*
- * Temporary NFS req mappings are shared across all transport
- * instances. These are short lived and should be bounded by the number
- * of concurrent server threads * depth of the SQ.
- */
-struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
+{
+ while (!list_empty(&xprt->sc_ctxts)) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del(&ctxt->free);
+ kfree(ctxt);
+ }
+}
+
+static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
struct svc_rdma_req_map *map;
- map = kmem_cache_alloc(svc_rdma_map_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
+
+ map = kmalloc(sizeof(*map), flags);
+ if (map)
+ INIT_LIST_HEAD(&map->free);
+ return map;
+}
+
+static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* One for each receive buffer on this connection. */
+ i = xprt->sc_max_requests;
+
+ while (i--) {
+ struct svc_rdma_req_map *map;
+
+ map = alloc_req_map(GFP_KERNEL);
+ if (!map) {
+ dprintk("svcrdma: No memory for request map\n");
+ return false;
+ }
+ list_add(&map->free, &xprt->sc_maps);
+ }
+ return true;
+}
+
+struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_req_map *map = NULL;
+
+ spin_lock(&xprt->sc_map_lock);
+ if (list_empty(&xprt->sc_maps))
+ goto out_empty;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del_init(&map->free);
+ spin_unlock(&xprt->sc_map_lock);
+
+out:
map->count = 0;
return map;
+
+out_empty:
+ spin_unlock(&xprt->sc_map_lock);
+
+ /* Pre-allocation amount was incorrect */
+ map = alloc_req_map(GFP_NOIO);
+ if (map)
+ goto out;
+
+ WARN_ONCE(1, "svcrdma: empty request map list?\n");
+ return NULL;
+}
+
+void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
+ struct svc_rdma_req_map *map)
+{
+ spin_lock(&xprt->sc_map_lock);
+ list_add(&map->free, &xprt->sc_maps);
+ spin_unlock(&xprt->sc_map_lock);
}
-void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
- kmem_cache_free(svc_rdma_map_cachep, map);
+ while (!list_empty(&xprt->sc_maps)) {
+ struct svc_rdma_req_map *map;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del(&map->free);
+ kfree(map);
+ }
}
/* ib_cq event handler */
@@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
static void process_context(struct svcxprt_rdma *xprt,
struct svc_rdma_op_ctxt *ctxt)
{
+ struct svc_rdma_op_ctxt *read_hdr;
+ int free_pages = 0;
+
svc_rdma_unmap_dma(ctxt);
switch (ctxt->wr_op) {
case IB_WR_SEND:
- if (ctxt->frmr)
- pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 1);
+ free_pages = 1;
break;
case IB_WR_RDMA_WRITE:
- if (ctxt->frmr)
- pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 0);
break;
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
svc_rdma_put_frmr(xprt, ctxt->frmr);
- if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
- struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- if (read_hdr) {
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
- } else {
- pr_err("svcrdma: ctxt->read_hdr == NULL\n");
- }
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
+
+ if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
+ break;
+
+ read_hdr = ctxt->read_hdr;
svc_rdma_put_context(ctxt, 0);
- break;
+
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ return;
default:
- printk(KERN_ERR "svcrdma: unexpected completion type, "
- "opcode=%d\n",
- ctxt->wr_op);
+ dprintk("svcrdma: unexpected completion opcode=%d\n",
+ ctxt->wr_op);
break;
}
+
+ svc_rdma_put_context(ctxt, free_pages);
}
/*
@@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
+ INIT_LIST_HEAD(&cma_xprt->sc_maps);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
-
- cma_xprt->sc_ord = svcrdma_ord;
-
- cma_xprt->sc_max_req_size = svcrdma_max_req_size;
- cma_xprt->sc_max_requests = svcrdma_max_requests;
- cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
- atomic_set(&cma_xprt->sc_sq_count, 0);
- atomic_set(&cma_xprt->sc_ctxt_used, 0);
+ spin_lock_init(&cma_xprt->sc_ctxt_lock);
+ spin_lock_init(&cma_xprt->sc_map_lock);
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return cma_xprt;
}
-int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
@@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err_put_ctxt;
}
- page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ page = alloc_page(flags);
+ if (!page)
+ goto err_put_ctxt;
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
@@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
- ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count = sge_no + 1;
buflen += PAGE_SIZE;
}
@@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct rdma_conn_param conn_param;
struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
- struct ib_device_attr devattr;
- int uninitialized_var(dma_mr_acc);
- int need_dma_mr = 0;
- int ret;
- int i;
+ struct ib_device *dev;
+ unsigned int i;
+ int ret = 0;
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
newxprt, newxprt->sc_cm_id);
- ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
- if (ret) {
- dprintk("svcrdma: could not query device attributes on "
- "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
- goto errout;
- }
+ dev = newxprt->sc_cm_id->device;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
- newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+ newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
(size_t)RPCSVC_MAXPAGES);
- newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+ newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
RPCSVC_MAXPAGES);
- newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
- (size_t)svcrdma_max_requests);
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+ newxprt->sc_max_req_size = svcrdma_max_req_size;
+ newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_requests);
+ newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_bc_requests);
+ newxprt->sc_rq_depth = newxprt->sc_max_requests +
+ newxprt->sc_max_bc_requests;
+ newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+
+ if (!svc_rdma_prealloc_ctxts(newxprt))
+ goto errout;
+ if (!svc_rdma_prealloc_maps(newxprt))
+ goto errout;
/*
* Limit ORD based on client limit, local device limit, and
* configured svcrdma limit.
*/
- newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
- newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+ newxprt->sc_pd = ib_alloc_pd(dev);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
cq_attr.cqe = newxprt->sc_sq_depth;
- newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ newxprt->sc_sq_cq = ib_create_cq(dev,
sq_comp_handler,
cq_event_handler,
newxprt,
@@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
- cq_attr.cqe = newxprt->sc_max_requests;
- newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+ cq_attr.cqe = newxprt->sc_rq_depth;
+ newxprt->sc_rq_cq = ib_create_cq(dev,
rq_comp_handler,
cq_event_handler,
newxprt,
@@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
- qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+ qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
" cap.max_send_sge = %d\n"
" cap.max_recv_sge = %d\n",
newxprt->sc_cm_id, newxprt->sc_pd,
- newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+ dev, newxprt->sc_pd->device,
qp_attr.cap.max_send_wr,
qp_attr.cap.max_recv_wr,
qp_attr.cap.max_send_sge,
@@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* of an RDMA_READ. IB does not.
*/
newxprt->sc_reader = rdma_read_chunk_lcl;
- if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
newxprt->sc_frmr_pg_list_len =
- devattr.max_fast_reg_page_list_len;
+ dev->attrs.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
newxprt->sc_reader = rdma_read_chunk_frmr;
}
@@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
- if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !rdma_ib_or_roce(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
+ !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
goto errout;
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
- !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
- dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
- }
-
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
- /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
- if (need_dma_mr) {
- /* Register all of physical memory */
- newxprt->sc_phys_mr =
- ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
- if (IS_ERR(newxprt->sc_phys_mr)) {
- dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
- ret);
- goto errout;
- }
- newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
- } else
- newxprt->sc_dma_lkey =
- newxprt->sc_cm_id->device->local_dma_lkey;
-
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_max_requests; i++) {
- ret = svc_rdma_post_recv(newxprt);
+ for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
goto errout;
@@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work)
{
struct svcxprt_rdma *rdma =
container_of(work, struct svcxprt_rdma, sc_work);
- dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, rdma);
/* We should only be called from kref_put */
- if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+ if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
+ atomic_read(&xprt->xpt_ref.refcount));
/*
* Destroy queued, but not processed read completions. Note
@@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work)
}
/* Warn if we leaked a resource or under-referenced */
- if (atomic_read(&rdma->sc_ctxt_used) != 0)
+ if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
- atomic_read(&rdma->sc_ctxt_used));
+ rdma->sc_ctxt_used);
if (atomic_read(&rdma->sc_dma_used) != 0)
pr_err("svcrdma: dma still in use? (%d)\n",
atomic_read(&rdma->sc_dma_used));
- /* De-allocate fastreg mr */
+ /* Final put of backchannel client transport */
+ if (xprt->xpt_bc_xprt) {
+ xprt_put(xprt->xpt_bc_xprt);
+ xprt->xpt_bc_xprt = NULL;
+ }
+
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_ctxts(rdma);
+ svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work)
if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
ib_destroy_cq(rdma->sc_rq_cq);
- if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
- ib_dereg_mr(rdma->sc_phys_mr);
-
if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
ib_dealloc_pd(rdma->sc_pd);
@@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
int length;
int ret;
- p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ return;
va = page_address(p);
/* XDR encode error */
@@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
return;
}
atomic_inc(&xprt->sc_dma_used);
- ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[0].length = length;
/* Prepare SEND WR */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 740bddcf3488..b1b009f10ea3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -63,7 +63,7 @@
*/
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
-static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
@@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = {
#endif
-#define RPCRDMA_BIND_TO (60U * HZ)
-#define RPCRDMA_INIT_REEST_TO (5U * HZ)
-#define RPCRDMA_MAX_REEST_TO (30U * HZ)
-#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
-
-static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
+static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
}
-static void
+void
xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
char buf[128];
@@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
}
-static void
+void
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
{
unsigned int i;
@@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
if (req == NULL)
return NULL;
- flags = GFP_NOIO | __GFP_NOWARN;
+ flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -642,7 +637,7 @@ drop_connection:
return -ENOTCONN; /* implies disconnect */
}
-static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
long idle_time = 0;
@@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void)
rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
+
+ rc = xprt_unregister_transport(&xprt_rdma_bc);
+ if (rc)
+ dprintk("RPC: %s: xprt_unregister(bc) returned %i\n",
+ __func__, rc);
}
int xprt_rdma_init(void)
@@ -766,6 +766,14 @@ int xprt_rdma_init(void)
return rc;
}
+ rc = xprt_register_transport(&xprt_rdma_bc);
+ if (rc) {
+ xprt_unregister_transport(&xprt_rdma);
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
dprintk("Defaults:\n");
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 732c71ce5dca..878f1bfb1db9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -462,7 +462,6 @@ int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
- struct ib_device_attr *devattr = &ia->ri_devattr;
int rc;
ia->ri_dma_mr = NULL;
@@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
- rc = ib_query_device(ia->ri_device, devattr);
- if (rc) {
- dprintk("RPC: %s: ib_query_device failed %d\n",
- __func__, rc);
- goto out3;
- }
-
if (memreg == RPCRDMA_FRMR) {
- if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (devattr->max_fast_reg_page_list_len == 0)) {
+ if (!(ia->ri_device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
@@ -566,24 +559,23 @@ int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
struct ib_cq *sendcq, *recvcq;
struct ib_cq_init_attr cq_attr = {};
unsigned int max_qp_wr;
int rc, err;
- if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
dprintk("RPC: %s: insufficient sge's available\n",
__func__);
return -ENOMEM;
}
- if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+ if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
dprintk("RPC: %s: insufficient wqe's available\n",
__func__);
return -ENOMEM;
}
- max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS;
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
/* check provider's send/recv wr limits */
if (cdata->max_requests > max_qp_wr)
@@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
ep->rep_remote_cma.responder_resources = 32;
else
ep->rep_remote_cma.responder_resources =
- devattr->max_qp_rd_atom;
+ ia->ri_device->attrs.max_qp_rd_atom;
ep->rep_remote_cma.retry_count = 7;
ep->rep_remote_cma.flow_control = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 728101ddc44b..38fe11b09875 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -55,6 +55,11 @@
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
+#define RPCRDMA_BIND_TO (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
+
/*
* Interface Adapter -- one per transport instance
*/
@@ -68,7 +73,6 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
- struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -309,6 +315,8 @@ struct rpcrdma_buffer {
u32 rb_bc_srv_max_requests;
spinlock_t rb_reqslock; /* protect rb_allreqs */
struct list_head rb_allreqs;
+
+ u32 rb_bc_max_requests;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
+extern unsigned int xprt_rdma_max_inline_read;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
@@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
-/* Temporary NFS request map cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_map_cachep;
-/* WR context cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_ctxt_cachep;
-/* Workqueue created in svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
+extern struct xprt_class xprt_rdma_bc;
#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 49d5093eb055..c51e2831f498 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
UNIXCB(skb).fp = NULL;
for (i = scm->fp->count-1; i >= 0; i--)
- unix_notinflight(scm->fp->fp[i]);
+ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
static void unix_destruct_scm(struct sk_buff *skb)
@@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
return -ENOMEM;
for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->fp[i]);
+ unix_inflight(scm->fp->user, scm->fp->fp[i]);
return max_level;
}
@@ -1781,7 +1781,12 @@ restart_locked:
goto out_unlock;
}
- if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+ /* other == sk && unix_peer(other) != sk if
+ * - unix_peer(sk) == NULL, destination address bound to sk
+ * - unix_peer(sk) == sk by time of get but disconnected before lock
+ */
+ if (other != sk &&
+ unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
if (timeo) {
timeo = unix_wait_for_peer(other, timeo);
@@ -2277,13 +2282,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
size_t size = state->size;
unsigned int last_len;
- err = -EINVAL;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+ err = -EINVAL;
goto out;
+ }
- err = -EOPNOTSUPP;
- if (flags & MSG_OOB)
+ if (unlikely(flags & MSG_OOB)) {
+ err = -EOPNOTSUPP;
goto out;
+ }
target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
timeo = sock_rcvtimeo(sk, noblock);
@@ -2329,9 +2336,11 @@ again:
goto unlock;
unix_state_unlock(sk);
- err = -EAGAIN;
- if (!timeo)
+ if (!timeo) {
+ err = -EAGAIN;
break;
+ }
+
mutex_unlock(&u->readlock);
timeo = unix_stream_data_wait(sk, timeo, last,
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8fcdc2283af5..6a0d48525fcf 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp)
* descriptor if it is for an AF_UNIX socket.
*/
-void unix_inflight(struct file *fp)
+void unix_inflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
@@ -133,11 +133,11 @@ void unix_inflight(struct file *fp)
}
unix_tot_inflight++;
}
- fp->f_cred->user->unix_inflight++;
+ user->unix_inflight++;
spin_unlock(&unix_gc_lock);
}
-void unix_notinflight(struct file *fp)
+void unix_notinflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
@@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp)
list_del_init(&u->link);
unix_tot_inflight--;
}
- fp->f_cred->user->unix_inflight--;
+ user->unix_inflight--;
spin_unlock(&unix_gc_lock);
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7fd1220fbfa0..bbe65dcb9738 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (err < 0)
goto out;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-
while (total_written < len) {
ssize_t written;
@@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto out_wait;
release_sock(sk);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
timeout = schedule_timeout(timeout);
+ finish_wait(sk_sleep(sk), &wait);
lock_sock(sk);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
@@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto out_wait;
}
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
}
/* These checks occur both as part of and after the loop
@@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
out_wait:
if (total_written > 0)
err = total_written;
- finish_wait(sk_sleep(sk), &wait);
out:
release_sock(sk);
return err;
@@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (err < 0)
goto out;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (1) {
s64 ready = vsock_stream_has_data(vsk);
@@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
*/
err = -ENOMEM;
- goto out_wait;
+ goto out;
} else if (ready > 0) {
ssize_t read;
@@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
vsk, target, read,
!(flags & MSG_PEEK), &recv_data);
if (err < 0)
- goto out_wait;
+ goto out;
if (read >= target || flags & MSG_PEEK)
break;
@@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
break;
release_sock(sk);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
timeout = schedule_timeout(timeout);
+ finish_wait(sk_sleep(sk), &wait);
lock_sock(sk);
if (signal_pending(current)) {
@@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
err = -EAGAIN;
break;
}
-
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
}
}
@@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
err = copied;
}
-out_wait:
- finish_wait(sk_sleep(sk), &wait);
out:
release_sock(sk);
return err;