11 files changed, 440 insertions, 370 deletions
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
index 5622763ad402..7e51f128045d 100644
--- a/net/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -7,7 +7,7 @@
 
 #include <linux/ceph/buffer.h>
 #include <linux/ceph/decode.h>
-#include <linux/ceph/libceph.h> /* for ceph_kvmalloc */
+#include <linux/ceph/libceph.h> /* for kvmalloc */
 
 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 {
@@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 	if (!b)
 		return NULL;
 
-	b->vec.iov_base = ceph_kvmalloc(len, gfp);
+	b->vec.iov_base = kvmalloc(len, gfp);
 	if (!b->vec.iov_base) {
 		kfree(b);
 		return NULL;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 97d6ea763e32..4c6441536d55 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -190,41 +190,14 @@ int ceph_compare_options(struct ceph_options *new_opt,
 }
 EXPORT_SYMBOL(ceph_compare_options);
 
-/*
- * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
- * compatible with (a superset of) GFP_KERNEL.  This is because while the
- * actual pages are allocated with the specified flags, the page table pages
- * are always allocated with GFP_KERNEL.
- *
- * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
- */
-void *ceph_kvmalloc(size_t size, gfp_t flags)
-{
-	void *p;
-
-	if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
-		p = kvmalloc(size, flags);
-	} else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
-		unsigned int nofs_flag = memalloc_nofs_save();
-		p = kvmalloc(size, GFP_KERNEL);
-		memalloc_nofs_restore(nofs_flag);
-	} else {
-		unsigned int noio_flag = memalloc_noio_save();
-		p = kvmalloc(size, GFP_KERNEL);
-		memalloc_noio_restore(noio_flag);
-	}
-
-	return p;
-}
-
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid)
 {
 	int i = 0;
 	char tmp[3];
 	int err = -EINVAL;
 	int d;
 
-	dout("parse_fsid '%s'\n", str);
+	dout("%s '%s'\n", __func__, str);
 	tmp[2] = 0;
 	while (*str && i < 16) {
 		if (ispunct(*str)) {
@@ -244,9 +217,10 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
 
 	if (i == 16)
 		err = 0;
-	dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+	dout("%s ret %d got fsid %pU\n", __func__, err, fsid);
 	return err;
 }
+EXPORT_SYMBOL(ceph_parse_fsid);
 
 /*
  * ceph options
@@ -272,6 +246,7 @@ enum {
 	Opt_cephx_sign_messages,
 	Opt_tcp_nodelay,
 	Opt_abort_on_full,
+	Opt_rxbounce,
 };
 
 enum {
@@ -321,6 +296,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_u32	("osdkeepalive",		Opt_osdkeepalivetimeout),
 	fsparam_enum	("read_from_replica",		Opt_read_from_replica,
 			 ceph_param_read_from_replica),
+	fsparam_flag	("rxbounce",			Opt_rxbounce),
 	fsparam_enum	("ms_mode",			Opt_ms_mode,
 			 ceph_param_ms_mode),
 	fsparam_string	("secret",			Opt_secret),
@@ -422,14 +398,14 @@ out:
 }
 
 int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
-		       struct fc_log *l)
+		       struct fc_log *l, char delim)
 {
 	struct p_log log = {.prefix = "libceph", .log = l};
 	int ret;
 
-	/* ip1[:port1][,ip2[:port2]...] */
+	/* ip1[:port1][<delim>ip2[:port2]...] */
 	ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON,
-			     &opt->num_mon);
+			     &opt->num_mon, delim);
 	if (ret) {
 		error_plog(&log, "Failed to parse monitor IPs: %d", ret);
 		return ret;
@@ -455,8 +431,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 	case Opt_ip:
 		err = ceph_parse_ips(param->string,
 				     param->string + param->size,
-				     &opt->my_addr,
-				     1, NULL);
+				     &opt->my_addr, 1, NULL, ',');
 		if (err) {
 			error_plog(&log, "Failed to parse ip: %d", err);
 			return err;
@@ -465,7 +440,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 		break;
 
 	case Opt_fsid:
-		err = parse_fsid(param->string, &opt->fsid);
+		err = ceph_parse_fsid(param->string, &opt->fsid);
 		if (err) {
 			error_plog(&log, "Failed to parse fsid: %d", err);
 			return err;
@@ -611,6 +586,9 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 	case Opt_abort_on_full:
 		opt->flags |= CEPH_OPT_ABORT_ON_FULL;
 		break;
+	case Opt_rxbounce:
+		opt->flags |= CEPH_OPT_RXBOUNCE;
+		break;
 
 	default:
 		BUG();
@@ -687,6 +665,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 		seq_puts(m, "notcp_nodelay,");
 	if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL))
 		seq_puts(m, "abort_on_full,");
+	if (opt->flags & CEPH_OPT_RXBOUNCE)
+		seq_puts(m, "rxbounce,");
 
 	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
 		seq_printf(m, "mount_timeout=%d,",
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 7057f8db4f99..1daf95e17d67 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -906,7 +906,6 @@ int crush_do_rule(const struct crush_map *map,
 	int recurse_to_leaf;
 	int wsize = 0;
 	int osize;
-	int *tmp;
 	const struct crush_rule *rule;
 	__u32 step;
 	int i, j;
@@ -1073,9 +1072,7 @@ int crush_do_rule(const struct crush_map *map,
 				memcpy(o, c, osize*sizeof(*o));
 
 			/* swap o and w arrays */
-			tmp = o;
-			o = w;
-			w = tmp;
+			swap(o, w);
 			wsize = osize;
 			break;
 
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 92d89b331645..051d22c0e4ad 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -147,7 +147,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
 static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
 
 /*
- * Should be used for buffers allocated with ceph_kvmalloc().
+ * Should be used for buffers allocated with kvmalloc().
  * Currently these are encrypt out-buffer (ceph_buffer) and decrypt
  * in-buffer (msg front).
  *
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 57d043b382ed..dfa237fbd5a3 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -515,6 +515,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	}
+	if (con->bounce_page) {
+		__free_page(con->bounce_page);
+		con->bounce_page = NULL;
+	}
 
 	if (ceph_msgr2(from_msgr(con->msgr)))
 		ceph_con_v2_reset_protocol(con);
@@ -724,7 +728,6 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
 		it->iter.bi_size = cursor->resid;
 
 	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
-	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
 }
 
 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -750,10 +753,8 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 	cursor->resid -= bytes;
 	bio_advance_iter(it->bio, &it->iter, bytes);
 
-	if (!cursor->resid) {
-		BUG_ON(!cursor->last_piece);
+	if (!cursor->resid)
 		return false;   /* no more data */
-	}
 
 	if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
 		       page == bio_iter_page(it->bio, it->iter)))
@@ -766,9 +767,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 			it->iter.bi_size = cursor->resid;
 	}
 
-	BUG_ON(cursor->last_piece);
 	BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
-	cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
 	return true;
 }
 #endif /* CONFIG_BLOCK */
@@ -784,8 +783,6 @@ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
 	cursor->bvec_iter.bi_size = cursor->resid;
 
 	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
-	cursor->last_piece =
-	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
 }
 
 static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
@@ -811,19 +808,14 @@ static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
 	cursor->resid -= bytes;
 	bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);
 
-	if (!cursor->resid) {
-		BUG_ON(!cursor->last_piece);
+	if (!cursor->resid)
 		return false;   /* no more data */
-	}
 
 	if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
 		       page == bvec_iter_page(bvecs, cursor->bvec_iter)))
 		return false;	/* more bytes to process in this segment */
 
-	BUG_ON(cursor->last_piece);
 	BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
-	cursor->last_piece =
-	    cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
 	return true;
 }
 
@@ -849,7 +841,6 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
 	BUG_ON(page_count > (int)USHRT_MAX);
 	cursor->page_count = (unsigned short)page_count;
 	BUG_ON(length > SIZE_MAX - cursor->page_offset);
-	cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
 }
 
 static struct page *
@@ -864,11 +855,7 @@ ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
 	BUG_ON(cursor->page_offset >= PAGE_SIZE);
 
 	*page_offset = cursor->page_offset;
-	if (cursor->last_piece)
-		*length = cursor->resid;
-	else
-		*length = PAGE_SIZE - *page_offset;
-
+	*length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
 	return data->pages[cursor->page_index];
 }
 
@@ -893,8 +880,6 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
 
 	BUG_ON(cursor->page_index >= cursor->page_count);
 	cursor->page_index++;
-	cursor->last_piece = cursor->resid <= PAGE_SIZE;
-
 	return true;
 }
 
@@ -924,7 +909,6 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
 	cursor->resid = min(length, pagelist->length);
 	cursor->page = page;
 	cursor->offset = 0;
-	cursor->last_piece = cursor->resid <= PAGE_SIZE;
 }
 
 static struct page *
@@ -944,11 +928,7 @@ ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
 
 	/* offset of first page in pagelist is always 0 */
 	*page_offset = cursor->offset & ~PAGE_MASK;
-	if (cursor->last_piece)
-		*length = cursor->resid;
-	else
-		*length = PAGE_SIZE - *page_offset;
-
+	*length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
 	return cursor->page;
 }
 
@@ -981,8 +961,6 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
 
 	BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
 	cursor->page = list_next_entry(cursor->page, lru);
-	cursor->last_piece = cursor->resid <= PAGE_SIZE;
-
 	return true;
 }
 
@@ -1040,8 +1018,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
  * Indicate whether this is the last piece in this data item.
  */
 struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
-				size_t *page_offset, size_t *length,
-				bool *last_piece)
+				size_t *page_offset, size_t *length)
 {
 	struct page *page;
 
@@ -1070,8 +1047,6 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
 	BUG_ON(*page_offset + *length > PAGE_SIZE);
 	BUG_ON(!*length);
 	BUG_ON(*length > cursor->resid);
-	if (last_piece)
-		*last_piece = cursor->last_piece;
 
 	return page;
 }
@@ -1108,7 +1083,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
 	cursor->total_resid -= bytes;
 
 	if (!cursor->resid && cursor->total_resid) {
-		WARN_ON(!cursor->last_piece);
 		cursor->data++;
 		__ceph_msg_data_cursor_init(cursor);
 		new_piece = true;
@@ -1267,30 +1241,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen,
  */
 int ceph_parse_ips(const char *c, const char *end,
 		   struct ceph_entity_addr *addr,
-		   int max_count, int *count)
+		   int max_count, int *count, char delim)
 {
 	int i, ret = -EINVAL;
 	const char *p = c;
 
 	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
 	for (i = 0; i < max_count; i++) {
+		char cur_delim = delim;
 		const char *ipend;
 		int port;
-		char delim = ',';
 
 		if (*p == '[') {
-			delim = ']';
+			cur_delim = ']';
 			p++;
 		}
 
-		ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend);
+		ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim,
+					     &ipend);
 		if (ret)
 			goto bad;
 		ret = -EINVAL;
 
 		p = ipend;
 
-		if (delim == ']') {
+		if (cur_delim == ']') {
 			if (*p != ']') {
 				dout("missing matching ']'\n");
 				goto bad;
@@ -1326,11 +1301,11 @@ int ceph_parse_ips(const char *c, const char *end,
 		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 		addr[i].nonce = 0;
 
-		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
+		dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i]));
 
 		if (p == end)
 			break;
-		if (*p != ',')
+		if (*p != delim)
 			goto bad;
 		p++;
 	}
@@ -1920,7 +1895,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
 
 	/* front */
 	if (front_len) {
-		m->front.iov_base = ceph_kvmalloc(front_len, flags);
+		m->front.iov_base = kvmalloc(front_len, flags);
 		if (m->front.iov_base == NULL) {
 			dout("ceph_msg_new can't allocate %d bytes\n",
 			     front_len);
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 2cb5ffdf071a..3ddbde87e4d6 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -495,7 +495,7 @@ static int write_partial_message_data(struct ceph_connection *con)
 			continue;
 		}
 
-		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+		page = ceph_msg_data_next(cursor, &page_offset, &length);
 		if (length == cursor->total_resid)
 			more = MSG_MORE;
 		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
@@ -992,8 +992,7 @@ static int read_partial_message_section(struct ceph_connection *con,
 
 static int read_partial_msg_data(struct ceph_connection *con)
 {
-	struct ceph_msg *msg = con->in_msg;
-	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
 	struct page *page;
 	size_t page_offset;
@@ -1001,9 +1000,6 @@ static int read_partial_msg_data(struct ceph_connection *con)
 	u32 crc = 0;
 	int ret;
 
-	if (!msg->num_data_items)
-		return -EIO;
-
 	if (do_datacrc)
 		crc = con->in_data_crc;
 	while (cursor->total_resid) {
@@ -1012,7 +1008,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
 			continue;
 		}
 
-		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+		page = ceph_msg_data_next(cursor, &page_offset, &length);
 		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
 		if (ret <= 0) {
 			if (do_datacrc)
@@ -1031,6 +1027,46 @@ static int read_partial_msg_data(struct ceph_connection *con)
 	return 1;	/* must return > 0 to indicate success */
 }
 
+static int read_partial_msg_data_bounce(struct ceph_connection *con)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	struct page *page;
+	size_t off, len;
+	u32 crc;
+	int ret;
+
+	if (unlikely(!con->bounce_page)) {
+		con->bounce_page = alloc_page(GFP_NOIO);
+		if (!con->bounce_page) {
+			pr_err("failed to allocate bounce page\n");
+			return -ENOMEM;
+		}
+	}
+
+	crc = con->in_data_crc;
+	while (cursor->total_resid) {
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
+		page = ceph_msg_data_next(cursor, &off, &len);
+		ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len);
+		if (ret <= 0) {
+			con->in_data_crc = crc;
+			return ret;
+		}
+
+		crc = crc32c(crc, page_address(con->bounce_page), ret);
+		memcpy_to_page(page, off, page_address(con->bounce_page), ret);
+
+		ceph_msg_data_advance(cursor, ret);
+	}
+	con->in_data_crc = crc;
+
+	return 1;	/* must return > 0 to indicate success */
+}
+
 /*
  * read (part of) a message.
  */
@@ -1141,7 +1177,13 @@ static int read_partial_message(struct ceph_connection *con)
 
 	/* (page) data */
 	if (data_len) {
-		ret = read_partial_msg_data(con);
+		if (!m->num_data_items)
+			return -EIO;
+
+		if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
+			ret = read_partial_msg_data_bounce(con);
+		else
+			ret = read_partial_msg_data(con);
 		if (ret <= 0)
 			return ret;
 	}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index cc40ce4e02fb..cc8ff81a50b7 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -57,8 +57,9 @@
 #define IN_S_HANDLE_CONTROL_REMAINDER	3
 #define IN_S_PREPARE_READ_DATA		4
 #define IN_S_PREPARE_READ_DATA_CONT	5
-#define IN_S_HANDLE_EPILOGUE		6
-#define IN_S_FINISH_SKIP		7
+#define IN_S_PREPARE_READ_ENC_PAGE	6
+#define IN_S_HANDLE_EPILOGUE		7
+#define IN_S_FINISH_SKIP		8
 
 #define OUT_S_QUEUE_DATA		1
 #define OUT_S_QUEUE_DATA_CONT		2
@@ -308,7 +309,7 @@ static void *alloc_conn_buf(struct ceph_connection *con, int len)
 	if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
 		return NULL;
 
-	buf = ceph_kvmalloc(len, GFP_NOIO);
+	buf = kvmalloc(len, GFP_NOIO);
 	if (!buf)
 		return NULL;
 
@@ -861,7 +862,7 @@ static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
 		ceph_msg_data_advance(cursor, 0);
 
 	/* get a piece of data, cursor isn't advanced */
-	page = ceph_msg_data_next(cursor, &off, &len, NULL);
+	page = ceph_msg_data_next(cursor, &off, &len);
 
 	bv->bv_page = page;
 	bv->bv_offset = off;
@@ -1032,22 +1033,41 @@ static int decrypt_control_remainder(struct ceph_connection *con)
 			 padded_len(rem_len) + CEPH_GCM_TAG_LEN);
 }
 
-static int decrypt_message(struct ceph_connection *con)
+static int decrypt_tail(struct ceph_connection *con)
 {
+	struct sg_table enc_sgt = {};
 	struct sg_table sgt = {};
+	int tail_len;
 	int ret;
 
+	tail_len = tail_onwire_len(con->in_msg, true);
+	ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages,
+					con->v2.in_enc_page_cnt, 0, tail_len,
+					GFP_NOIO);
+	if (ret)
+		goto out;
+
 	ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
 			MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
 			con->v2.in_buf, true);
 	if (ret)
 		goto out;
 
-	ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl,
-			tail_onwire_len(con->in_msg, true));
+	dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con,
+	     con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents);
+	ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len);
+	if (ret)
+		goto out;
+
+	WARN_ON(!con->v2.in_enc_page_cnt);
+	ceph_release_page_vector(con->v2.in_enc_pages,
+				 con->v2.in_enc_page_cnt);
+	con->v2.in_enc_pages = NULL;
+	con->v2.in_enc_page_cnt = 0;
 
 out:
 	sg_free_table(&sgt);
+	sg_free_table(&enc_sgt);
 	return ret;
 }
 
@@ -1733,54 +1753,153 @@ static int prepare_read_control_remainder(struct ceph_connection *con)
 	return 0;
 }
 
-static void prepare_read_data(struct ceph_connection *con)
+static int prepare_read_data(struct ceph_connection *con)
 {
 	struct bio_vec bv;
 
-	if (!con_secure(con))
-		con->in_data_crc = -1;
+	con->in_data_crc = -1;
 	ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
 				  data_len(con->in_msg));
 
 	get_bvec_at(&con->v2.in_cursor, &bv);
+	if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+		if (unlikely(!con->bounce_page)) {
+			con->bounce_page = alloc_page(GFP_NOIO);
+			if (!con->bounce_page) {
+				pr_err("failed to allocate bounce page\n");
+				return -ENOMEM;
+			}
+		}
+
+		bv.bv_page = con->bounce_page;
+		bv.bv_offset = 0;
+	}
 	set_in_bvec(con, &bv);
 	con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+	return 0;
 }
 
 static void prepare_read_data_cont(struct ceph_connection *con)
 {
 	struct bio_vec bv;
 
-	if (!con_secure(con))
+	if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+		con->in_data_crc = crc32c(con->in_data_crc,
+					  page_address(con->bounce_page),
+					  con->v2.in_bvec.bv_len);
+
+		get_bvec_at(&con->v2.in_cursor, &bv);
+		memcpy_to_page(bv.bv_page, bv.bv_offset,
+			       page_address(con->bounce_page),
+			       con->v2.in_bvec.bv_len);
+	} else {
 		con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
 						    con->v2.in_bvec.bv_page,
 						    con->v2.in_bvec.bv_offset,
 						    con->v2.in_bvec.bv_len);
+	}
 
 	ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
 	if (con->v2.in_cursor.total_resid) {
 		get_bvec_at(&con->v2.in_cursor, &bv);
+		if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+			bv.bv_page = con->bounce_page;
+			bv.bv_offset = 0;
+		}
 		set_in_bvec(con, &bv);
 		WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
 		return;
 	}
 
 	/*
-	 * We've read all data.  Prepare to read data padding (if any)
-	 * and epilogue.
+	 * We've read all data.  Prepare to read epilogue.
 	 */
 	reset_in_kvecs(con);
-	if (con_secure(con)) {
-		if (need_padding(data_len(con->in_msg)))
-			add_in_kvec(con, DATA_PAD(con->v2.in_buf),
-				    padding_len(data_len(con->in_msg)));
-		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN);
+	add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+	con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static int prepare_read_tail_plain(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->in_msg;
+
+	if (!front_len(msg) && !middle_len(msg)) {
+		WARN_ON(!data_len(msg));
+		return prepare_read_data(con);
+	}
+
+	reset_in_kvecs(con);
+	if (front_len(msg)) {
+		add_in_kvec(con, msg->front.iov_base, front_len(msg));
+		WARN_ON(msg->front.iov_len != front_len(msg));
+	}
+	if (middle_len(msg)) {
+		add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+		WARN_ON(msg->middle->vec.iov_len != middle_len(msg));
+	}
+
+	if (data_len(msg)) {
+		con->v2.in_state = IN_S_PREPARE_READ_DATA;
 	} else {
 		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
 	}
+	return 0;
+}
+
+static void prepare_read_enc_page(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i,
+	     con->v2.in_enc_resid);
+	WARN_ON(!con->v2.in_enc_resid);
+
+	bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i];
+	bv.bv_offset = 0;
+	bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE);
+
+	set_in_bvec(con, &bv);
+	con->v2.in_enc_i++;
+	con->v2.in_enc_resid -= bv.bv_len;
+
+	if (con->v2.in_enc_resid) {
+		con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE;
+		return;
+	}
+
+	/*
+	 * We are set to read the last piece of ciphertext (ending
+	 * with epilogue) + auth tag.
+	 */
+	WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt);
 	con->v2.in_state = IN_S_HANDLE_EPILOGUE;
 }
 
+static int prepare_read_tail_secure(struct ceph_connection *con)
+{
+	struct page **enc_pages;
+	int enc_page_cnt;
+	int tail_len;
+
+	tail_len = tail_onwire_len(con->in_msg, true);
+	WARN_ON(!tail_len);
+
+	enc_page_cnt = calc_pages_for(0, tail_len);
+	enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+	if (IS_ERR(enc_pages))
+		return PTR_ERR(enc_pages);
+
+	WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt);
+	con->v2.in_enc_pages = enc_pages;
+	con->v2.in_enc_page_cnt = enc_page_cnt;
+	con->v2.in_enc_resid = tail_len;
+	con->v2.in_enc_i = 0;
+
+	prepare_read_enc_page(con);
+	return 0;
+}
+
 static void __finish_skip(struct ceph_connection *con)
 {
 	con->in_seq++;
@@ -2589,47 +2708,26 @@ static int __handle_control(struct ceph_connection *con, void *p)
 	}
 
 	msg = con->in_msg;  /* set in process_message_header() */
-	if (!front_len(msg) && !middle_len(msg)) {
-		if (!data_len(msg))
-			return process_message(con);
-
-		prepare_read_data(con);
-		return 0;
-	}
-
-	reset_in_kvecs(con);
 	if (front_len(msg)) {
 		WARN_ON(front_len(msg) > msg->front_alloc_len);
-		add_in_kvec(con, msg->front.iov_base, front_len(msg));
 		msg->front.iov_len = front_len(msg);
-
-		if (con_secure(con) && need_padding(front_len(msg)))
-			add_in_kvec(con, FRONT_PAD(con->v2.in_buf),
-				    padding_len(front_len(msg)));
 	} else {
 		msg->front.iov_len = 0;
 	}
 	if (middle_len(msg)) {
 		WARN_ON(middle_len(msg) > msg->middle->alloc_len);
-		add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
 		msg->middle->vec.iov_len = middle_len(msg);
-
-		if (con_secure(con) && need_padding(middle_len(msg)))
-			add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf),
-				    padding_len(middle_len(msg)));
 	} else if (msg->middle) {
 		msg->middle->vec.iov_len = 0;
 	}
 
-	if (data_len(msg)) {
-		con->v2.in_state = IN_S_PREPARE_READ_DATA;
-	} else {
-		add_in_kvec(con, con->v2.in_buf,
-			    con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN :
-					      CEPH_EPILOGUE_PLAIN_LEN);
-		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
-	}
-	return 0;
+	if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+		return process_message(con);
+
+	if (con_secure(con))
+		return prepare_read_tail_secure(con);
+
+	return prepare_read_tail_plain(con);
 }
 
 static int handle_preamble(struct ceph_connection *con)
@@ -2717,7 +2815,7 @@ static int handle_epilogue(struct ceph_connection *con)
 	int ret;
 
 	if (con_secure(con)) {
-		ret = decrypt_message(con);
+		ret = decrypt_tail(con);
 		if (ret) {
 			if (ret == -EBADMSG)
 				con->error_msg = "integrity error, bad epilogue auth tag";
@@ -2785,13 +2883,16 @@ static int populate_in_iter(struct ceph_connection *con)
 			ret = handle_control_remainder(con);
 			break;
 		case IN_S_PREPARE_READ_DATA:
-			prepare_read_data(con);
-			ret = 0;
+			ret = prepare_read_data(con);
 			break;
 		case IN_S_PREPARE_READ_DATA_CONT:
 			prepare_read_data_cont(con);
 			ret = 0;
 			break;
+		case IN_S_PREPARE_READ_ENC_PAGE:
+			prepare_read_enc_page(con);
+			ret = 0;
+			break;
 		case IN_S_HANDLE_EPILOGUE:
 			ret = handle_epilogue(con);
 			break;
@@ -3326,20 +3427,16 @@ void ceph_con_v2_revoke(struct ceph_connection *con)
 
 static void revoke_at_prepare_read_data(struct ceph_connection *con)
 {
-	int remaining;  /* data + [data padding] + epilogue */
+	int remaining;
 	int resid;
 
+	WARN_ON(con_secure(con));
 	WARN_ON(!data_len(con->in_msg));
 	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
 	resid = iov_iter_count(&con->v2.in_iter);
 	WARN_ON(!resid);
 
-	if (con_secure(con))
-		remaining = padded_len(data_len(con->in_msg)) +
-			    CEPH_EPILOGUE_SECURE_LEN;
-	else
-		remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
-
+	remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
 	dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
 	     remaining);
 	con->v2.in_iter.count -= resid;
@@ -3350,8 +3447,9 @@ static void revoke_at_prepare_read_data(struct ceph_connection *con)
 static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
 {
 	int recved, resid;  /* current piece of data */
-	int remaining;  /* [data padding] + epilogue */
+	int remaining;
 
+	WARN_ON(con_secure(con));
 	WARN_ON(!data_len(con->in_msg));
 	WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
 	resid = iov_iter_count(&con->v2.in_iter);
@@ -3363,12 +3461,7 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
 		ceph_msg_data_advance(&con->v2.in_cursor, recved);
 	WARN_ON(resid > con->v2.in_cursor.total_resid);
 
-	if (con_secure(con))
-		remaining = padding_len(data_len(con->in_msg)) +
-			    CEPH_EPILOGUE_SECURE_LEN;
-	else
-		remaining = CEPH_EPILOGUE_PLAIN_LEN;
-
+	remaining = CEPH_EPILOGUE_PLAIN_LEN;
 	dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
 	     con->v2.in_cursor.total_resid, remaining);
 	con->v2.in_iter.count -= resid;
@@ -3376,11 +3469,26 @@ static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
 	con->v2.in_state = IN_S_FINISH_SKIP;
 }
 
+static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
+{
+	int resid;  /* current enc page (not necessarily data) */
+
+	WARN_ON(!con_secure(con));
+	WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+	resid = iov_iter_count(&con->v2.in_iter);
+	WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+
+	dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid,
+	     con->v2.in_enc_resid);
+	con->v2.in_iter.count -= resid;
+	set_in_skip(con, resid + con->v2.in_enc_resid);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
 static void revoke_at_handle_epilogue(struct ceph_connection *con)
 {
 	int resid;
 
-	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
 	resid = iov_iter_count(&con->v2.in_iter);
 	WARN_ON(!resid);
 
@@ -3399,6 +3507,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
 	case IN_S_PREPARE_READ_DATA_CONT:
 		revoke_at_prepare_read_data_cont(con);
 		break;
+	case IN_S_PREPARE_READ_ENC_PAGE:
+		revoke_at_prepare_read_enc_page(con);
+		break;
 	case IN_S_HANDLE_EPILOGUE:
 		revoke_at_handle_epilogue(con);
 		break;
@@ -3432,6 +3543,13 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con)
 	clear_out_sign_kvecs(con);
 	free_conn_bufs(con);
 
+	if (con->v2.in_enc_pages) {
+		WARN_ON(!con->v2.in_enc_page_cnt);
+		ceph_release_page_vector(con->v2.in_enc_pages,
+					 con->v2.in_enc_page_cnt);
+		con->v2.in_enc_pages = NULL;
+		con->v2.in_enc_page_cnt = 0;
+	}
 	if (con->v2.out_enc_pages) {
 		WARN_ON(!con->v2.out_enc_page_cnt);
 		ceph_release_page_vector(con->v2.out_enc_pages,
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 6a6898ee4049..db60217f911b 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -222,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc)
 				max--;
 		}
 
-		n = prandom_u32() % max;
+		n = prandom_u32_max(max);
 		if (o >= 0 && n >= o)
 			n++;
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1c5815530e0d..4e4f1e4bc265 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -537,43 +537,6 @@ static void request_init(struct ceph_osd_request *req)
 	target_init(&req->r_t);
 }
 
-/*
- * This is ugly, but it allows us to reuse linger registration and ping
- * requests, keeping the structure of the code around send_linger{_ping}()
- * reasonable.  Setting up a min_nr=2 mempool for each linger request
- * and dealing with copying ops (this blasts req only, watch op remains
- * intact) isn't any better.
- */
-static void request_reinit(struct ceph_osd_request *req)
-{
-	struct ceph_osd_client *osdc = req->r_osdc;
-	bool mempool = req->r_mempool;
-	unsigned int num_ops = req->r_num_ops;
-	u64 snapid = req->r_snapid;
-	struct ceph_snap_context *snapc = req->r_snapc;
-	bool linger = req->r_linger;
-	struct ceph_msg *request_msg = req->r_request;
-	struct ceph_msg *reply_msg = req->r_reply;
-
-	dout("%s req %p\n", __func__, req);
-	WARN_ON(kref_read(&req->r_kref) != 1);
-	request_release_checks(req);
-
-	WARN_ON(kref_read(&request_msg->kref) != 1);
-	WARN_ON(kref_read(&reply_msg->kref) != 1);
-	target_destroy(&req->r_t);
-
-	request_init(req);
-	req->r_osdc = osdc;
-	req->r_mempool = mempool;
-	req->r_num_ops = num_ops;
-	req->r_snapid = snapid;
-	req->r_snapc = snapc;
-	req->r_linger = linger;
-	req->r_request = request_msg;
-	req->r_reply = reply_msg;
-}
-
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
 					       unsigned int num_ops,
@@ -918,14 +881,30 @@ EXPORT_SYMBOL(osd_req_op_xattr_init);
  * @watch_opcode: CEPH_OSD_WATCH_OP_*
  */
 static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
-				  u64 cookie, u8 watch_opcode)
+				  u8 watch_opcode, u64 cookie, u32 gen)
 {
 	struct ceph_osd_req_op *op;
 
 	op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
 	op->watch.cookie = cookie;
 	op->watch.op = watch_opcode;
-	op->watch.gen = 0;
+	op->watch.gen = gen;
+}
+
+/*
+ * prot_ver, timeout and notify payload (may be empty) should already be
+ * encoded in @request_pl
+ */
+static void osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+				   u64 cookie, struct ceph_pagelist *request_pl)
+{
+	struct ceph_osd_req_op *op;
+
+	op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+	op->notify.cookie = cookie;
+
+	ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl);
+	op->indata_len = request_pl->length;
 }
 
 /*
@@ -1500,7 +1479,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
 
 static int pick_random_replica(const struct ceph_osds *acting)
 {
-	int i = prandom_u32() % acting->size;
+	int i = prandom_u32_max(acting->size);
 
 	dout("%s picked osd%d, primary osd%d\n", __func__,
 	     acting->osds[i], acting->primary);
@@ -2385,7 +2364,11 @@ again:
 		if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
 			err = -ENOSPC;
 		} else {
-			pr_warn_ratelimited("FULL or reached pool quota\n");
+			if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL))
+				pr_warn_ratelimited("cluster is full (osdmap FULL)\n");
+			else
+				pr_warn_ratelimited("pool %lld is full or reached quota\n",
+						    req->r_t.base_oloc.pool);
 			req->r_t.paused = true;
 			maybe_request_map(osdc);
 		}
@@ -2727,10 +2710,13 @@ static void linger_release(struct kref *kref)
 	WARN_ON(!list_empty(&lreq->pending_lworks));
 	WARN_ON(lreq->osd);
 
-	if (lreq->reg_req)
-		ceph_osdc_put_request(lreq->reg_req);
-	if (lreq->ping_req)
-		ceph_osdc_put_request(lreq->ping_req);
+	if (lreq->request_pl)
+		ceph_pagelist_release(lreq->request_pl);
+	if (lreq->notify_id_pages)
+		ceph_release_page_vector(lreq->notify_id_pages, 1);
+
+	ceph_osdc_put_request(lreq->reg_req);
+	ceph_osdc_put_request(lreq->ping_req);
 	target_destroy(&lreq->t);
 	kfree(lreq);
 }
@@ -2999,6 +2985,12 @@ static void linger_commit_cb(struct ceph_osd_request *req)
 	struct ceph_osd_linger_request *lreq = req->r_priv;
 
 	mutex_lock(&lreq->lock);
+	if (req != lreq->reg_req) {
+		dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+		     __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+		goto out;
+	}
+
 	dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
 	     lreq->linger_id, req->r_result);
 	linger_reg_commit_complete(lreq, req->r_result);
@@ -3022,6 +3014,7 @@ static void linger_commit_cb(struct ceph_osd_request *req)
 		}
 	}
 
+out:
 	mutex_unlock(&lreq->lock);
 	linger_put(lreq);
 }
@@ -3044,6 +3037,12 @@ static void linger_reconnect_cb(struct ceph_osd_request *req)
 	struct ceph_osd_linger_request *lreq = req->r_priv;
 
 	mutex_lock(&lreq->lock);
+	if (req != lreq->reg_req) {
+		dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+		     __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+		goto out;
+	}
+
 	dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
 	     lreq, lreq->linger_id, req->r_result, lreq->last_error);
 	if (req->r_result < 0) {
@@ -3053,46 +3052,64 @@ static void linger_reconnect_cb(struct ceph_osd_request *req)
 		}
 	}
 
+out:
 	mutex_unlock(&lreq->lock);
 	linger_put(lreq);
 }
 
 static void send_linger(struct ceph_osd_linger_request *lreq)
 {
-	struct ceph_osd_request *req = lreq->reg_req;
-	struct ceph_osd_req_op *op = &req->r_ops[0];
+	struct ceph_osd_client *osdc = lreq->osdc;
+	struct ceph_osd_request *req;
+	int ret;
 
-	verify_osdc_wrlocked(req->r_osdc);
+	verify_osdc_wrlocked(osdc);
+	mutex_lock(&lreq->lock);
 	dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
 
-	if (req->r_osd)
-		cancel_linger_request(req);
+	if (lreq->reg_req) {
+		if (lreq->reg_req->r_osd)
+			cancel_linger_request(lreq->reg_req);
+		ceph_osdc_put_request(lreq->reg_req);
+	}
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+	BUG_ON(!req);
 
-	request_reinit(req);
 	target_copy(&req->r_t, &lreq->t);
 	req->r_mtime = lreq->mtime;
 
-	mutex_lock(&lreq->lock);
 	if (lreq->is_watch && lreq->committed) {
-		WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
-			op->watch.cookie != lreq->linger_id);
-		op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
-		op->watch.gen = ++lreq->register_gen;
+		osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT,
+				      lreq->linger_id, ++lreq->register_gen);
 		dout("lreq %p reconnect register_gen %u\n", lreq,
-		     op->watch.gen);
+		     req->r_ops[0].watch.gen);
 		req->r_callback = linger_reconnect_cb;
 	} else {
-		if (!lreq->is_watch)
+		if (lreq->is_watch) {
+			osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH,
+					      lreq->linger_id, 0);
+		} else {
 			lreq->notify_id = 0;
-		else
-			WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+
+			refcount_inc(&lreq->request_pl->refcnt);
+			osd_req_op_notify_init(req, 0, lreq->linger_id,
+					       lreq->request_pl);
+			ceph_osd_data_pages_init(
+			    osd_req_op_data(req, 0, notify, response_data),
+			    lreq->notify_id_pages, PAGE_SIZE, 0, false, false);
+		}
 		dout("lreq %p register\n", lreq);
 		req->r_callback = linger_commit_cb;
 	}
-	mutex_unlock(&lreq->lock);
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	BUG_ON(ret);
 
 	req->r_priv = linger_get(lreq);
 	req->r_linger = true;
+	lreq->reg_req = req;
+	mutex_unlock(&lreq->lock);
 
 	submit_request(req, true);
 }
@@ -3102,6 +3119,12 @@ static void linger_ping_cb(struct ceph_osd_request *req)
 	struct ceph_osd_linger_request *lreq = req->r_priv;
 
 	mutex_lock(&lreq->lock);
+	if (req != lreq->ping_req) {
+		dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+		     __func__, lreq, lreq->linger_id, req, lreq->ping_req);
+		goto out;
+	}
+
 	dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
 	     __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
 	     lreq->last_error);
@@ -3117,6 +3140,7 @@ static void linger_ping_cb(struct ceph_osd_request *req)
 		     lreq->register_gen, req->r_ops[0].watch.gen);
 	}
 
+out:
 	mutex_unlock(&lreq->lock);
 	linger_put(lreq);
 }
@@ -3124,8 +3148,8 @@ static void linger_ping_cb(struct ceph_osd_request *req)
 static void send_linger_ping(struct ceph_osd_linger_request *lreq)
 {
 	struct ceph_osd_client *osdc = lreq->osdc;
-	struct ceph_osd_request *req = lreq->ping_req;
-	struct ceph_osd_req_op *op = &req->r_ops[0];
+	struct ceph_osd_request *req;
+	int ret;
 
 	if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
 		dout("%s PAUSERD\n", __func__);
@@ -3137,19 +3161,26 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq)
 	     __func__, lreq, lreq->linger_id, lreq->ping_sent,
 	     lreq->register_gen);
 
-	if (req->r_osd)
-		cancel_linger_request(req);
+	if (lreq->ping_req) {
+		if (lreq->ping_req->r_osd)
+			cancel_linger_request(lreq->ping_req);
+		ceph_osdc_put_request(lreq->ping_req);
+	}
 
-	request_reinit(req);
-	target_copy(&req->r_t, &lreq->t);
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+	BUG_ON(!req);
 
-	WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
-		op->watch.cookie != lreq->linger_id ||
-		op->watch.op != CEPH_OSD_WATCH_OP_PING);
-	op->watch.gen = lreq->register_gen;
+	target_copy(&req->r_t, &lreq->t);
+	osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id,
+			      lreq->register_gen);
 	req->r_callback = linger_ping_cb;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	BUG_ON(ret);
+
 	req->r_priv = linger_get(lreq);
 	req->r_linger = true;
+	lreq->ping_req = req;
 
 	ceph_osdc_get_request(req);
 	account_request(req);
@@ -3165,12 +3196,6 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
 
 	down_write(&osdc->lock);
 	linger_register(lreq);
-	if (lreq->is_watch) {
-		lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id;
-		lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id;
-	} else {
-		lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
-	}
 
 	calc_target(osdc, &lreq->t, false);
 	osd = lookup_create_osd(osdc, lreq->t.osd, true);
@@ -3202,9 +3227,9 @@ static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
  */
 static void __linger_cancel(struct ceph_osd_linger_request *lreq)
 {
-	if (lreq->is_watch && lreq->ping_req->r_osd)
+	if (lreq->ping_req && lreq->ping_req->r_osd)
 		cancel_linger_request(lreq->ping_req);
-	if (lreq->reg_req->r_osd)
+	if (lreq->reg_req && lreq->reg_req->r_osd)
 		cancel_linger_request(lreq->reg_req);
 	cancel_linger_map_check(lreq);
 	unlink_linger(lreq->osd, lreq);
@@ -4553,21 +4578,23 @@ bad:
 /*
  * Register request, send initial attempt.
  */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-			    struct ceph_osd_request *req,
-			    bool nofail)
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
 {
 	down_read(&osdc->lock);
 	submit_request(req, false);
 	up_read(&osdc->lock);
-
-	return 0;
 }
 EXPORT_SYMBOL(ceph_osdc_start_request);
 
 /*
- * Unregister a registered request.  The request is not completed:
- * ->r_result isn't set and __complete_request() isn't called.
+ * Unregister request.  If @req was registered, it isn't completed:
+ * r_result isn't set and __complete_request() isn't invoked.
+ *
+ * If @req wasn't registered, this call may have raced with
+ * handle_reply(), in which case r_result would already be set and
+ * __complete_request() would be getting invoked, possibly even
+ * concurrently with this call.
  */
 void ceph_osdc_cancel_request(struct ceph_osd_request *req)
 {
@@ -4653,43 +4680,6 @@ again:
 }
 EXPORT_SYMBOL(ceph_osdc_sync);
 
-static struct ceph_osd_request *
-alloc_linger_request(struct ceph_osd_linger_request *lreq)
-{
-	struct ceph_osd_request *req;
-
-	req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
-	if (!req)
-		return NULL;
-
-	ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
-	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-	return req;
-}
-
-static struct ceph_osd_request *
-alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode)
-{
-	struct ceph_osd_request *req;
-
-	req = alloc_linger_request(lreq);
-	if (!req)
-		return NULL;
-
-	/*
-	 * Pass 0 for cookie because we don't know it yet, it will be
-	 * filled in by linger_submit().
-	 */
-	osd_req_op_watch_init(req, 0, 0, watch_opcode);
-
-	if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
-		ceph_osdc_put_request(req);
-		return NULL;
-	}
-
-	return req;
-}
-
 /*
  * Returns a handle, caller owns a ref.
  */
@@ -4719,18 +4709,6 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
 	lreq->t.flags = CEPH_OSD_FLAG_WRITE;
 	ktime_get_real_ts64(&lreq->mtime);
 
-	lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH);
-	if (!lreq->reg_req) {
-		ret = -ENOMEM;
-		goto err_put_lreq;
-	}
-
-	lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING);
-	if (!lreq->ping_req) {
-		ret = -ENOMEM;
-		goto err_put_lreq;
-	}
-
 	linger_submit(lreq);
 	ret = linger_reg_commit_wait(lreq);
 	if (ret) {
@@ -4768,14 +4746,14 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
 	ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
 	req->r_flags = CEPH_OSD_FLAG_WRITE;
 	ktime_get_real_ts64(&req->r_mtime);
-	osd_req_op_watch_init(req, 0, lreq->linger_id,
-			      CEPH_OSD_WATCH_OP_UNWATCH);
+	osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH,
+			      lreq->linger_id, 0);
 
 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	linger_cancel(lreq);
 	linger_put(lreq);
 	ret = wait_request_timeout(req, opts->mount_timeout);
@@ -4846,7 +4824,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 
 out_put_req:
@@ -4855,35 +4833,6 @@ out_put_req:
 }
 EXPORT_SYMBOL(ceph_osdc_notify_ack);
 
-static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
-				  u64 cookie, u32 prot_ver, u32 timeout,
-				  void *payload, u32 payload_len)
-{
-	struct ceph_osd_req_op *op;
-	struct ceph_pagelist *pl;
-	int ret;
-
-	op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
-	op->notify.cookie = cookie;
-
-	pl = ceph_pagelist_alloc(GFP_NOIO);
-	if (!pl)
-		return -ENOMEM;
-
-	ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
-	ret |= ceph_pagelist_encode_32(pl, timeout);
-	ret |= ceph_pagelist_encode_32(pl, payload_len);
-	ret |= ceph_pagelist_append(pl, payload, payload_len);
-	if (ret) {
-		ceph_pagelist_release(pl);
-		return -ENOMEM;
-	}
-
-	ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
-	op->indata_len = pl->length;
-	return 0;
-}
-
 /*
  * @timeout: in seconds
  *
@@ -4902,7 +4851,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
 		     size_t *preply_len)
 {
 	struct ceph_osd_linger_request *lreq;
-	struct page **pages;
 	int ret;
 
 	WARN_ON(!timeout);
@@ -4915,41 +4863,35 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
 	if (!lreq)
 		return -ENOMEM;
 
-	lreq->preply_pages = preply_pages;
-	lreq->preply_len = preply_len;
-
-	ceph_oid_copy(&lreq->t.base_oid, oid);
-	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
-	lreq->t.flags = CEPH_OSD_FLAG_READ;
-
-	lreq->reg_req = alloc_linger_request(lreq);
-	if (!lreq->reg_req) {
+	lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO);
+	if (!lreq->request_pl) {
 		ret = -ENOMEM;
 		goto out_put_lreq;
 	}
 
-	/*
-	 * Pass 0 for cookie because we don't know it yet, it will be
-	 * filled in by linger_submit().
-	 */
-	ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout,
-				     payload, payload_len);
-	if (ret)
+	ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */
+	ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout);
+	ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len);
+	ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len);
+	if (ret) {
+		ret = -ENOMEM;
 		goto out_put_lreq;
+	}
 
 	/* for notify_id */
-	pages = ceph_alloc_page_vector(1, GFP_NOIO);
-	if (IS_ERR(pages)) {
-		ret = PTR_ERR(pages);
+	lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(lreq->notify_id_pages)) {
+		ret = PTR_ERR(lreq->notify_id_pages);
+		lreq->notify_id_pages = NULL;
 		goto out_put_lreq;
 	}
-	ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
-						 response_data),
-				 pages, PAGE_SIZE, 0, false, true);
 
-	ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO);
-	if (ret)
-		goto out_put_lreq;
+	lreq->preply_pages = preply_pages;
+	lreq->preply_len = preply_len;
+
+	ceph_oid_copy(&lreq->t.base_oid, oid);
+	ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+	lreq->t.flags = CEPH_OSD_FLAG_READ;
 
 	linger_submit(lreq);
 	ret = linger_reg_commit_wait(lreq);
@@ -5098,7 +5040,7 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		void *p = page_address(pages[0]);
@@ -5175,7 +5117,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
 	if (ret)
 		goto out_put_req;
 
-	ceph_osdc_start_request(osdc, req, false);
+	ceph_osdc_start_request(osdc, req);
 	ret = ceph_osdc_wait_request(osdc, req);
 	if (ret >= 0) {
 		ret = req->r_ops[0].rval;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 75b738083523..295098873861 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -11,6 +11,22 @@
 #include <linux/crush/hash.h>
 #include <linux/crush/mapper.h>
 
+static __printf(2, 3)
+void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid,
+	       map->epoch, &vaf);
+
+	va_end(args);
+}
+
 char *ceph_osdmap_state_str(char *str, int len, u32 state)
 {
 	if (!len)
@@ -571,10 +587,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 			goto bad;
 #endif
 		r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
-		c->rules[i] = r;
 		if (r == NULL)
 			goto badmem;
 		dout(" rule %d is at %p\n", i, r);
+		c->rules[i] = r;
 		r->len = yes;
 		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
 		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
@@ -980,7 +996,7 @@ static struct crush_work *alloc_workspace(const struct crush_map *c)
 	work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
 	dout("%s work_size %zu bytes\n", __func__, work_size);
 
-	work = ceph_kvmalloc(work_size, GFP_NOIO);
+	work = kvmalloc(work_size, GFP_NOIO);
 	if (!work)
 		return NULL;
 
@@ -1190,9 +1206,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
 	if (max == map->max_osd)
 		return 0;
 
-	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
-	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
-	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+	state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+	weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+	addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
 	if (!state || !weight || !addr) {
 		kvfree(state);
 		kvfree(weight);
@@ -1222,7 +1238,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
 	if (map->osd_primary_affinity) {
 		u32 *affinity;
 
-		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
+		affinity = kvmalloc(array_size(max, sizeof(*affinity)),
 					 GFP_NOFS);
 		if (!affinity)
 			return -ENOMEM;
@@ -1503,7 +1519,7 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
 	if (!map->osd_primary_affinity) {
 		int i;
 
-		map->osd_primary_affinity = ceph_kvmalloc(
+		map->osd_primary_affinity = kvmalloc(
 		    array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
 		    GFP_NOFS);
 		if (!map->osd_primary_affinity)
@@ -1566,7 +1582,7 @@ static int decode_new_primary_affinity(void **p, void *end,
 		if (ret)
 			return ret;
 
-		pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
+		osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff);
 	}
 
 	return 0;
@@ -1864,9 +1880,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 		osd = ceph_decode_32(p);
 		w = ceph_decode_32(p);
 		BUG_ON(osd >= map->max_osd);
-		pr_info("osd%d weight 0x%x %s\n", osd, w,
-		     w == CEPH_OSD_IN ? "(in)" :
-		     (w == CEPH_OSD_OUT ? "(out)" : ""));
+		osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w,
+			    w == CEPH_OSD_IN ? "(in)" :
+			    (w == CEPH_OSD_OUT ? "(out)" : ""));
 		map->osd_weight[osd] = w;
 
 		/*
@@ -1898,10 +1914,10 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 		BUG_ON(osd >= map->max_osd);
 		if ((map->osd_state[osd] & CEPH_OSD_UP) &&
 		    (xorstate & CEPH_OSD_UP))
-			pr_info("osd%d down\n", osd);
+			osdmap_info(map, "osd%d down\n", osd);
 		if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
 		    (xorstate & CEPH_OSD_EXISTS)) {
-			pr_info("osd%d does not exist\n", osd);
+			osdmap_info(map, "osd%d does not exist\n", osd);
 			ret = set_primary_affinity(map, osd,
 						   CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
 			if (ret)
@@ -1931,7 +1947,7 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 
 		dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
 
-		pr_info("osd%d up\n", osd);
+		osdmap_info(map, "osd%d up\n", osd);
 		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
 		map->osd_addr[osd] = addr;
 	}
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 65e34f78b05d..74622b278d57 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -96,7 +96,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
 EXPORT_SYMBOL(ceph_pagelist_append);
 
 /* Allocate enough pages for a pagelist to append the given amount
- * of data without without allocating.
+ * of data without allocating.
  * Returns: 0 on success, -ENOMEM on error.
  */
 int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)