From 84eea8c79090c44564877cd47c73455e32ec4846 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 16 May 2017 08:55:34 +0800
Subject: ceph: re-request max size after importing caps

The 'wanted max size' could be sent to inode's old auth mds, re-send
it to inode's new auth mds if necessary. Otherwise write syscall may
hang.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a3ebb632294e..6752223dc81c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3027,8 +3027,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 					le32_to_cpu(grant->truncate_seq),
 					le64_to_cpu(grant->truncate_size),
 					size);
-		/* max size increase? */
-		if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+	}
+
+	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
+		if (max_size != ci->i_max_size) {
 			dout("max_size %lld -> %llu\n",
 			     ci->i_max_size, max_size);
 			ci->i_max_size = max_size;
@@ -3037,6 +3039,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 				ci->i_requested_max_size = 0;
 			}
 			wake = true;
+		} else if (ci->i_wanted_max_size > ci->i_max_size &&
+			   ci->i_wanted_max_size > ci->i_requested_max_size) {
+			/* CEPH_CAP_OP_IMPORT */
+			wake = true;
 		}
 	}
 
@@ -3554,7 +3560,6 @@ retry:
 	}
 
 	/* make sure we re-request max_size, if necessary */
-	ci->i_wanted_max_size = 0;
 	ci->i_requested_max_size = 0;
 
 	*old_issued = issued;
-- 
cgit v1.2.3-59-g8ed1b


From efb0ca765ac6f4985b57ef215e8d55e746b083f4 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 22 May 2017 12:03:32 +0800
Subject: ceph: update the 'approaching max_size' code

The old 'approaching max_size' code expects MDS set max_size to
'2 * reported_size'. This is no longer true. The new code reports
file size when half of previous max_size increment has been used.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c  |  2 +-
 fs/ceph/caps.c  | 18 ++++++++++++++++--
 fs/ceph/file.c  |  2 +-
 fs/ceph/inode.c |  9 +++------
 fs/ceph/super.h |  3 ++-
 5 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e71e6ca5ddf..82220e9aaf7d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1318,7 +1318,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file_inode(file);
-	int check_cap = 0;
+	bool check_cap = false;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6752223dc81c..f5552455223f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1653,6 +1653,21 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	return -1;
 }
 
+bool __ceph_should_report_size(struct ceph_inode_info *ci)
+{
+	loff_t size = ci->vfs_inode.i_size;
+	/* mds will adjust max size according to the reported size */
+	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
+		return false;
+	if (size >= ci->i_max_size)
+		return true;
+	/* half of previous max_size increment has been used */
+	if (ci->i_max_size > ci->i_reported_size &&
+	    (size << 1) >= ci->i_max_size + ci->i_reported_size)
+		return true;
+	return false;
+}
+
 /*
  * Swiss army knife function to examine currently used and wanted
  * versus held caps.  Release, flush, ack revoked caps to mds as
@@ -1806,8 +1821,7 @@ retry_locked:
 			}
 
 			/* approaching file_max? */
-			if ((inode->i_size << 1) >= ci->i_max_size &&
-			    (ci->i_reported_size << 1) < ci->i_max_size) {
+			if (__ceph_should_report_size(ci)) {
 				dout("i_size approaching max_size\n");
 				goto ack;
 			}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 29308a80d66f..3d48c415f3cb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1040,8 +1040,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	int num_pages;
 	int written = 0;
 	int flags;
-	int check_caps = 0;
 	int ret;
+	bool check_caps = false;
 	struct timespec mtime = current_time(inode);
 	size_t count = iov_iter_count(from);
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4de6cdddf059..53f23c920266 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1653,20 +1653,17 @@ out:
 	return err;
 }
 
-int ceph_inode_set_size(struct inode *inode, loff_t size)
+bool ceph_inode_set_size(struct inode *inode, loff_t size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int ret = 0;
+	bool ret;
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
 	i_size_write(inode, size);
 	inode->i_blocks = calc_inode_blocks(size);
 
-	/* tell the MDS if we are approaching max_size */
-	if ((size << 1) >= ci->i_max_size &&
-	    (ci->i_reported_size << 1) < ci->i_max_size)
-		ret = 1;
+	ret = __ceph_should_report_size(ci);
 
 	spin_unlock(&ci->i_ceph_lock);
 	return ret;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a973acd8beaf..f8a0aba0d938 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -793,7 +793,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
 extern int ceph_inode_holds_cap(struct inode *inode, int mask);
 
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
 extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 extern void ceph_queue_vmtruncate(struct inode *inode);
 
@@ -918,6 +918,7 @@ extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void ceph_flush_snaps(struct ceph_inode_info *ci,
 			     struct ceph_mds_session **psession);
+extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
-- 
cgit v1.2.3-59-g8ed1b


From f2b0c45f09796f87723a1225c919035457f72b7a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 23 May 2017 17:03:12 +0800
Subject: ceph: remove useless page->mapping check in writepage_nounlock()

Callers of writepage_nounlock() have already ensured non-null
page->mapping.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 82220e9aaf7d..96f83a417944 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -534,10 +534,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
-	if (!page->mapping || !page->mapping->host) {
-		dout("writepage %p - no mapping\n", page);
-		return -EFAULT;
-	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
 	fsc = ceph_inode_to_client(inode);
-- 
cgit v1.2.3-59-g8ed1b


From fa71fefb308532eb5b2e4b38d914d19fc836f73e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 23 May 2017 17:18:53 +0800
Subject: ceph: redirty page when writepage_nounlock() skips unwritable page

Ceph needs to flush dirty page in the order in which in which snap
context they belong to. Dirty pages belong to older snap context
should be flushed earlier. if writepage_nounlock() can not flush a
page, it should redirty the page.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 96f83a417944..8fde3b59e3a5 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -551,8 +551,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, snapc);
 		/* we should only noop if called by kswapd */
-		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		WARN_ON(!(current->flags & PF_MEMALLOC));
 		ceph_put_snap_context(oldest);
+		redirty_page_for_writepage(wbc, page);
 		goto out;
 	}
 	ceph_put_snap_context(oldest);
-- 
cgit v1.2.3-59-g8ed1b


From 439868812aac01ec5d1b133a51e768280f3fc8d5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 23 May 2017 17:48:28 +0800
Subject: ceph: cleanup writepage_nounlock()

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8fde3b59e3a5..50836280a6f8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -530,7 +530,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	long writeback_stat;
 	u64 truncate_size;
 	u32 truncate_seq;
-	int err = 0, len = PAGE_SIZE;
+	int err, len = PAGE_SIZE;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -543,7 +543,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	snapc = page_snap_context(page);
 	if (snapc == NULL) {
 		dout("writepage %p page %p not dirty?\n", inode, page);
-		goto out;
+		return 0;
 	}
 	oldest = get_oldest_context(inode, &snap_size,
 				    &truncate_size, &truncate_seq);
@@ -554,7 +554,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		WARN_ON(!(current->flags & PF_MEMALLOC));
 		ceph_put_snap_context(oldest);
 		redirty_page_for_writepage(wbc, page);
-		goto out;
+		return 0;
 	}
 	ceph_put_snap_context(oldest);
 
@@ -564,8 +564,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	/* is this a partial page at end of file? */
 	if (page_off >= snap_size) {
 		dout("%p page eof %llu\n", page, snap_size);
-		goto out;
+		return 0;
 	}
+
 	if (snap_size < page_off + len)
 		len = snap_size - page_off;
 
@@ -592,7 +593,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 			dout("writepage interrupted page %p\n", page);
 			redirty_page_for_writepage(wbc, page);
 			end_page_writeback(page);
-			goto out;
+			return err;
 		}
 		dout("writepage setting page/mapping error %d %p\n",
 		     err, page);
@@ -608,7 +609,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 	ceph_put_snap_context(snapc);  /* page's reference */
-out:
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 92e57e6287cc7402487edf3eb09c8a7b36dad63f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 5 Jun 2017 11:07:28 +0800
Subject: ceph: don't re-send interrupted flock request

Don't re-send interrupted flock request in cases of mds failover
and receiving request forward. Because corresponding 'lock intr'
request may have been finished, it won't get re-sent.

Link: http://tracker.ceph.com/issues/20170
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/locks.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6806dbeaee19..64ae74472046 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -127,6 +127,29 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 	     req->r_tid);
 
+	mutex_lock(&mdsc->mutex);
+	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+		err = 0;
+	} else {
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = err;
+		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+		mutex_unlock(&req->r_fill_mutex);
+
+		if (!req->r_session) {
+			// haven't sent the request
+			err = 0;
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+	if (!err)
+		return 0;
+
 	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 					    USE_AUTH_MDS);
 	if (IS_ERR(intr_req))
@@ -146,7 +169,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 	if (err && err != -ERESTARTSYS)
 		return err;
 
-	wait_for_completion(&req->r_completion);
+	wait_for_completion_killable(&req->r_safe_completion);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1684dd03e9f59212775cafa50ea77b9ef5b263db Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 14 Jun 2017 15:54:56 +0800
Subject: ceph: getattr before read on ceph.* xattrs

Previously we were returning values for quota, layout
xattrs without any kind of update -- the user just got
whatever happened to be in our cache.

Clearly this extra round trip has a cost, but reads of
these xattrs are fairly rare, happening on admin
intervention rather than in normal operation.

Link: http://tracker.ceph.com/issues/17939
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 75267cdd5dfd..11263f102e4c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -756,6 +756,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr) {
+		err = ceph_do_getattr(inode, 0, true);
+		if (err)
+			return err;
 		err = -ENODATA;
 		if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
 			err = vxattr->getxattr_cb(ci, value, size);
-- 
cgit v1.2.3-59-g8ed1b


From 62a65f36d016fff32179acdbfcb8b2d8d9e54757 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 22 Jun 2017 16:26:34 +0800
Subject: ceph: avoid invalid memory dereference in the middle of umount

extra_mon_dispatch() and debugfs' foo_show functions dereference
fsc->mdsc. we should clean up fsc->client->extra_mon_dispatch
and debugfs before destroying fsc->mds.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 4 ++--
 fs/ceph/super.c      | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0c05df44cc6c..666a9f274832 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3769,13 +3769,13 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
 {
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-
 	dout("mdsc_destroy %p\n", mdsc);
-	ceph_mdsc_stop(mdsc);
 
 	/* flush out any connection work with references to us */
 	ceph_msgr_flush();
 
+	ceph_mdsc_stop(mdsc);
+
 	fsc->mdsc = NULL;
 	kfree(mdsc);
 	dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8d7918ce694a..14e78dd52ef9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -636,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
 
 	destroy_mount_options(fsc->mount_options);
 
-	ceph_fs_debugfs_cleanup(fsc);
-
 	ceph_destroy_client(fsc->client);
 
 	kfree(fsc);
@@ -1040,6 +1038,10 @@ static void ceph_kill_sb(struct super_block *s)
 
 	ceph_mdsc_pre_umount(fsc->mdsc);
 	generic_shutdown_super(s);
+
+	fsc->client->extra_mon_dispatch = NULL;
+	ceph_fs_debugfs_cleanup(fsc);
+
 	ceph_mdsc_destroy(fsc);
 
 	destroy_fs_client(fsc);
-- 
cgit v1.2.3-59-g8ed1b


From 4b9f2042fd2a9da7e6c7b4dd49eff19dc3754e4f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 27 Jun 2017 17:17:24 +0800
Subject: ceph: avoid accessing freeing inode in ceph_check_delayed_caps()

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f5552455223f..7007ae2a5ad2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3809,6 +3809,7 @@ bad:
  */
 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
+	struct inode *inode;
 	struct ceph_inode_info *ci;
 	int flags = CHECK_CAPS_NODELAY;
 
@@ -3824,9 +3825,15 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 		    time_before(jiffies, ci->i_hold_caps_max))
 			break;
 		list_del_init(&ci->i_cap_delay_list);
+
+		inode = igrab(&ci->vfs_inode);
 		spin_unlock(&mdsc->cap_delay_lock);
-		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
-		ceph_check_caps(ci, flags, NULL);
+
+		if (inode) {
+			dout("check_delayed_caps on %p\n", inode);
+			ceph_check_caps(ci, flags, NULL);
+			iput(inode);
+		}
 	}
 	spin_unlock(&mdsc->cap_delay_lock);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1d8f83604c4244d93c5a49f5107624769df6248f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Tue, 27 Jun 2017 11:57:56 +0800
Subject: ceph: new mount option that specifies fscache uniquifier

Current ceph uses FSID as primary index key of fscache data. This
allows ceph to retain cached data across remount. But this causes
problem (kernel opps, fscache does not support sharing data) when
a filesystem get mounted several times (with fscache enabled, with
different mount options).

The fix is adding a new mount option, which specifies uniquifier
for fscache.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/cache.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/ceph/super.c | 41 +++++++++++++++++--------
 fs/ceph/super.h |  1 +
 3 files changed, 113 insertions(+), 21 deletions(-)

diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 4e7421caf380..fd1172823f86 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -35,18 +35,34 @@ struct fscache_netfs ceph_cache_netfs = {
 	.version	= 0,
 };
 
+static DEFINE_MUTEX(ceph_fscache_lock);
+static LIST_HEAD(ceph_fscache_list);
+
+struct ceph_fscache_entry {
+	struct list_head list;
+	struct fscache_cookie *fscache;
+	struct ceph_fsid fsid;
+	size_t uniq_len;
+	char uniquifier[0];
+};
+
 static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
 					     void *buffer, uint16_t maxbuf)
 {
 	const struct ceph_fs_client* fsc = cookie_netfs_data;
-	uint16_t klen;
+	const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+	uint16_t fsid_len, uniq_len;
 
-	klen = sizeof(fsc->client->fsid);
-	if (klen > maxbuf)
+	fsid_len = sizeof(fsc->client->fsid);
+	uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+	if (fsid_len + uniq_len > maxbuf)
 		return 0;
 
-	memcpy(buffer, &fsc->client->fsid, klen);
-	return klen;
+	memcpy(buffer, &fsc->client->fsid, fsid_len);
+	if (uniq_len)
+		memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
+
+	return fsid_len + uniq_len;
 }
 
 static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
@@ -67,13 +83,54 @@ void ceph_fscache_unregister(void)
 
 int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 {
+	const struct ceph_fsid *fsid = &fsc->client->fsid;
+	const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+	size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+	struct ceph_fscache_entry *ent;
+	int err = 0;
+
+	mutex_lock(&ceph_fscache_lock);
+	list_for_each_entry(ent, &ceph_fscache_list, list) {
+		if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
+			continue;
+		if (ent->uniq_len != uniq_len)
+			continue;
+		if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
+			continue;
+
+		pr_err("fscache cookie already registered for fsid %pU\n", fsid);
+		pr_err("  use fsc=%%s mount option to specify a uniquifier\n");
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
+	if (!ent) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
 	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
 					      &ceph_fscache_fsid_object_def,
 					      fsc, true);
-	if (!fsc->fscache)
-		pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
 
-	return 0;
+	if (fsc->fscache) {
+		memcpy(&ent->fsid, fsid, sizeof(*fsid));
+		if (uniq_len > 0) {
+			memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+			ent->uniq_len = uniq_len;
+		}
+		ent->fscache = fsc->fscache;
+		list_add_tail(&ent->list, &ceph_fscache_list);
+	} else {
+		kfree(ent);
+		pr_err("unable to register fscache cookie for fsid %pU\n",
+		       fsid);
+		/* all other fs ignore this error */
+	}
+out_unlock:
+	mutex_unlock(&ceph_fscache_lock);
+	return err;
 }
 
 static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
@@ -349,7 +406,24 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
-	fscache_relinquish_cookie(fsc->fscache, 0);
+	if (fscache_cookie_valid(fsc->fscache)) {
+		struct ceph_fscache_entry *ent;
+		bool found = false;
+
+		mutex_lock(&ceph_fscache_lock);
+		list_for_each_entry(ent, &ceph_fscache_list, list) {
+			if (ent->fscache == fsc->fscache) {
+				list_del(&ent->list);
+				kfree(ent);
+				found = true;
+				break;
+			}
+		}
+		WARN_ON_ONCE(!found);
+		mutex_unlock(&ceph_fscache_lock);
+
+		__fscache_relinquish_cookie(fsc->fscache, 0);
+	}
 	fsc->fscache = NULL;
 }
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 14e78dd52ef9..aa06a8c24792 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -121,6 +121,7 @@ enum {
 	/* int args above */
 	Opt_snapdirname,
 	Opt_mds_namespace,
+	Opt_fscache_uniq,
 	Opt_last_string,
 	/* string args above */
 	Opt_dirstat,
@@ -158,6 +159,7 @@ static match_table_t fsopt_tokens = {
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
 	{Opt_mds_namespace, "mds_namespace=%s"},
+	{Opt_fscache_uniq, "fsc=%s"},
 	/* string args above */
 	{Opt_dirstat, "dirstat"},
 	{Opt_nodirstat, "nodirstat"},
@@ -223,6 +225,14 @@ static int parse_fsopt_token(char *c, void *private)
 		if (!fsopt->mds_namespace)
 			return -ENOMEM;
 		break;
+	case Opt_fscache_uniq:
+		fsopt->fscache_uniq = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->fscache_uniq)
+			return -ENOMEM;
+		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+		break;
 		/* misc */
 	case Opt_wsize:
 		fsopt->wsize = intval;
@@ -317,6 +327,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 	kfree(args->snapdir_name);
 	kfree(args->mds_namespace);
 	kfree(args->server_path);
+	kfree(args->fscache_uniq);
 	kfree(args);
 }
 
@@ -350,8 +361,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 	ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
 	if (ret)
 		return ret;
-
 	ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+	if (ret)
+		return ret;
+	ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
 	if (ret)
 		return ret;
 
@@ -475,8 +488,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",noasyncreaddir");
 	if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 		seq_puts(m, ",nodcache");
-	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
-		seq_puts(m, ",fsc");
+	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
+		if (fsopt->fscache_uniq)
+			seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
+		else
+			seq_puts(m, ",fsc");
+	}
 	if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
 		seq_puts(m, ",nopoolperm");
 
@@ -597,18 +614,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
-	/* setup fscache */
-	if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
-	    (ceph_fscache_register_fs(fsc) != 0))
-		goto fail_fscache;
-
 	/* caps */
 	fsc->min_caps = fsopt->max_readdir;
 
 	return fsc;
 
-fail_fscache:
-	ceph_fscache_unregister_fs(fsc);
 fail_trunc_wq:
 	destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
@@ -626,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
 
-	ceph_fscache_unregister_fs(fsc);
-
 	destroy_workqueue(fsc->wb_wq);
 	destroy_workqueue(fsc->pg_inv_wq);
 	destroy_workqueue(fsc->trunc_wq);
@@ -820,6 +828,13 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 		if (err < 0)
 			goto out;
 
+		/* setup fscache */
+		if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
+			err = ceph_fscache_register_fs(fsc);
+			if (err < 0)
+				goto out;
+		}
+
 		if (!fsc->mount_options->server_path) {
 			path = "";
 			dout("mount opening path \\t\n");
@@ -1042,6 +1057,8 @@ static void ceph_kill_sb(struct super_block *s)
 	fsc->client->extra_mon_dispatch = NULL;
 	ceph_fs_debugfs_cleanup(fsc);
 
+	ceph_fscache_unregister_fs(fsc);
+
 	ceph_mdsc_destroy(fsc);
 
 	destroy_fs_client(fsc);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f8a0aba0d938..f02a2225fe42 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -73,6 +73,7 @@ struct ceph_mount_options {
 	char *snapdir_name;   /* default ".snap" */
 	char *mds_namespace;  /* default NULL */
 	char *server_path;    /* default  "/" */
+	char *fscache_uniq;   /* default NULL */
 };
 
 struct ceph_fs_client {
-- 
cgit v1.2.3-59-g8ed1b


From 481f001ffa5d8e9d72f8206e92f73ded076eeb30 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 3 Jul 2017 09:09:10 +0800
Subject: ceph: update ceph_dentry_info::lease_session when necessary

Current code does not update ceph_dentry_info::lease_session once
it is set. If auth mds of corresponding dentry changes, dentry lease
keeps in an invalid state.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 53f23c920266..220dfd87cbfa 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1016,6 +1016,7 @@ static void update_dentry_lease(struct dentry *dentry,
 	long unsigned ttl = from_time + (duration * HZ) / 1000;
 	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
 	struct inode *dir;
+	struct ceph_mds_session *old_lease_session = NULL;
 
 	/*
 	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
@@ -1051,8 +1052,10 @@ static void update_dentry_lease(struct dentry *dentry,
 	    time_before(ttl, di->time))
 		goto out_unlock;  /* we already have a newer lease. */
 
-	if (di->lease_session && di->lease_session != session)
-		goto out_unlock;
+	if (di->lease_session && di->lease_session != session) {
+		old_lease_session = di->lease_session;
+		di->lease_session = NULL;
+	}
 
 	ceph_dentry_lru_touch(dentry);
 
@@ -1065,6 +1068,8 @@ static void update_dentry_lease(struct dentry *dentry,
 	di->time = ttl;
 out_unlock:
 	spin_unlock(&dentry->d_lock);
+	if (old_lease_session)
+		ceph_put_mds_session(old_lease_session);
 	return;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From dcbbd97ccb9c6f4dad39875c1404d2643eaf110b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:44:59 +0200
Subject: libceph: remove ceph_sanitize_features() workaround

Reflects ceph.git commit ff1959282826ae6acd7134e1b1ede74ffd1cc04a.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 21 ---------------------
 net/ceph/messenger.c               |  3 +--
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index fd8b2953c78f..4962708841b5 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -77,29 +77,8 @@
 #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING   (1ULL<<58) /* New, v7 encoding */
 #define CEPH_FEATURE_FS_FILE_LAYOUT_V2       (1ULL<<58) /* file_layout_t */
 
-/*
- * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
- * vector to evaluate to 64 bit ~0.  To cope, we designate 1ULL << 63
- * to mean 33 bit ~0, and introduce a helper below to do the
- * translation.
- *
- * This was introduced by ceph.git commit
- *   9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
- * and fixed by ceph.git commit
- *   4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
- */
 #define CEPH_FEATURE_RESERVED (1ULL<<63)
 
-static inline u64 ceph_sanitize_features(u64 features)
-{
-	if (features & CEPH_FEATURE_RESERVED) {
-		/* everything through OSD_SNAPMAPPER */
-		return 0x1ffffffffull;
-	} else {
-		return features;
-	}
-}
-
 /*
  * Features supported.
  */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 588a91930051..9daed2540639 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2033,8 +2033,7 @@ static int process_connect(struct ceph_connection *con)
 {
 	u64 sup_feat = from_msgr(con->msgr)->supported_features;
 	u64 req_feat = from_msgr(con->msgr)->required_features;
-	u64 server_feat = ceph_sanitize_features(
-				le64_to_cpu(con->in_reply.features));
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
 	int ret;
 
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-- 
cgit v1.2.3-59-g8ed1b


From f179d3ba8cb9073c2d96315b79ff7bc658a1feee Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:44:59 +0200
Subject: libceph: new features macros

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 242 +++++++++++++++++++++++++------------
 1 file changed, 167 insertions(+), 75 deletions(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 4962708841b5..7fce9ed44af0 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -2,82 +2,174 @@
 #define __CEPH_FEATURES
 
 /*
- * feature bits
+ * Each time we reclaim bits for reuse we need to specify another bit
+ * that, if present, indicates we have the new incarnation of that
+ * feature.  Base case is 1 (first use).
  */
-#define CEPH_FEATURE_UID            (1ULL<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1ULL<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1ULL<<2)
-#define CEPH_FEATURE_FLOCK          (1ULL<<3)
-#define CEPH_FEATURE_SUBSCRIBE2     (1ULL<<4)
-#define CEPH_FEATURE_MONNAMES       (1ULL<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ  (1ULL<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH  (1ULL<<7)
-#define CEPH_FEATURE_OBJECTLOCATOR  (1ULL<<8)
-#define CEPH_FEATURE_PGID64         (1ULL<<9)
-#define CEPH_FEATURE_INCSUBOSDMAP   (1ULL<<10)
-#define CEPH_FEATURE_PGPOOL3        (1ULL<<11)
-#define CEPH_FEATURE_OSDREPLYMUX    (1ULL<<12)
-#define CEPH_FEATURE_OSDENC         (1ULL<<13)
-#define CEPH_FEATURE_OMAP           (1ULL<<14)
-#define CEPH_FEATURE_MONENC         (1ULL<<15)
-#define CEPH_FEATURE_QUERY_T        (1ULL<<16)
-#define CEPH_FEATURE_INDEP_PG_MAP   (1ULL<<17)
-#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
-#define CEPH_FEATURE_CHUNKY_SCRUB   (1ULL<<19)
-#define CEPH_FEATURE_MON_NULLROUTE  (1ULL<<20)
-#define CEPH_FEATURE_MON_GV         (1ULL<<21)
-#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
-#define CEPH_FEATURE_MSG_AUTH	    (1ULL<<23)
-#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
-#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
-#define CEPH_FEATURE_CREATEPOOLID   (1ULL<<26)
-#define CEPH_FEATURE_REPLY_CREATE_INODE   (1ULL<<27)
-#define CEPH_FEATURE_OSD_HBMSGS     (1ULL<<28)
-#define CEPH_FEATURE_MDSENC         (1ULL<<29)
-#define CEPH_FEATURE_OSDHASHPSPOOL  (1ULL<<30)
-#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
-#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
-#define CEPH_FEATURE_MON_SCRUB      (1ULL<<33)
-#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
-#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
-#define CEPH_FEATURE_CRUSH_V2      (1ULL<<36)  /* new indep; SET_* steps */
-#define CEPH_FEATURE_EXPORT_PEER   (1ULL<<37)
-#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
-#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38)   /* overlap with EC */
-/* The process supports new-style OSDMap encoding. Monitors also use
-   this bit to determine if peers support NAK messages. */
-#define CEPH_FEATURE_OSDMAP_ENC    (1ULL<<39)
-#define CEPH_FEATURE_MDS_INLINE_DATA     (1ULL<<40)
-#define CEPH_FEATURE_CRUSH_TUNABLES3     (1ULL<<41)
-#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41)  /* overlap w/ tunables3 */
-#define CEPH_FEATURE_MSGR_KEEPALIVE2   (1ULL<<42)
-#define CEPH_FEATURE_OSD_POOLRESEND    (1ULL<<43)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
-#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
-#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
-#define CEPH_FEATURE_OSD_REPOP         (1ULL<<46)   /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_OBJECT_DIGEST  (1ULL<<46)  /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
-#define CEPH_FEATURE_MDS_QUOTA      (1ULL<<47)
-#define CEPH_FEATURE_CRUSH_V4      (1ULL<<48)  /* straw2 buckets */
-#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
-// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
-#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
-#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
-#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
-#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
-#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
-#define CEPH_FEATURE_NEW_OSDOP_ENCODING   (1ULL<<56) /* New, v7 encoding */
-#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
-#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
-#define CEPH_FEATURE_CRUSH_TUNABLES5	(1ULL<<58) /* chooseleaf stable mode */
-// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
-#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING   (1ULL<<58) /* New, v7 encoding */
-#define CEPH_FEATURE_FS_FILE_LAYOUT_V2       (1ULL<<58) /* file_layout_t */
-
-#define CEPH_FEATURE_RESERVED (1ULL<<63)
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name)			\
+	const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit);		\
+	const static uint64_t CEPH_FEATUREMASK_##name =			\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/* this bit is ignored but still advertised by release *when* */
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+	const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+	const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name =		\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/*
+ * this bit is ignored by release *unused* and not advertised by
+ * release *unadvertised*
+ */
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+/*
+ * test for a feature.  this test is safer than a typical mask against
+ * the bit because it ensures that we have the bit AND the marker for the
+ * bit's incarnation.  this must be used in any case where the features
+ * bits may include an old meaning of the bit.
+ */
+#define CEPH_HAVE_FEATURE(x, name)			\
+	(((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel).  For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ *  - In the first phase we indicate that a feature is DEPRECATED as of
+ *    a particular release.  This is the first major release X (say,
+ *    jewel) that does not depend on its peers advertising the feature.
+ *    That is, it safely assumes its peers all have the feature.  We
+ *    indicate this with the DEPRECATED macro.  For example,
+ *
+ *      DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ *    because 10.2.z (jewel) did not care if its peers advertised this
+ *    feature bit.
+ *
+ *  - In the second phase we stop advertising the the bit and call it
+ *    RETIRED.  This can normally be done in the *next* major release
+ *    following the one in which we marked the feature DEPRECATED.  In
+ *    the above example, for 12.0.z (luminous) we can say:
+ *
+ *      DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+ *
+ *  - The bit can be reused in the first post-luminous release, 13.0.z
+ *    (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT)  // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF)    // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP)  // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
+
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
+DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS)  // deprecate me
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
+DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
+DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
+DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
+DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
+DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
+DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
+DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
+DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
+DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
+DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING)  // *do not share this bit*
+
+DEFINE_CEPH_FEATURE(61, 1, RESERVED2)          // unused, but slow down!
+DEFINE_CEPH_FEATURE(62, 1, RESERVED)           // do not use; used as a sentinal
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
 
 /*
  * Features supported.
-- 
cgit v1.2.3-59-g8ed1b


From ca35ffea898d2cae67820202bd9a494742726b41 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:44:59 +0200
Subject: libceph: handle non-empty dest in ceph_{oloc,oid}_copy()

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 55e3a477f92d..bf2fcc837e36 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1547,12 +1547,13 @@ bad:
 void ceph_oloc_copy(struct ceph_object_locator *dest,
 		    const struct ceph_object_locator *src)
 {
-	WARN_ON(!ceph_oloc_empty(dest));
-	WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+	ceph_oloc_destroy(dest);
 
 	dest->pool = src->pool;
 	if (src->pool_ns)
 		dest->pool_ns = ceph_get_string(src->pool_ns);
+	else
+		dest->pool_ns = NULL;
 }
 EXPORT_SYMBOL(ceph_oloc_copy);
 
@@ -1565,14 +1566,15 @@ EXPORT_SYMBOL(ceph_oloc_destroy);
 void ceph_oid_copy(struct ceph_object_id *dest,
 		   const struct ceph_object_id *src)
 {
-	WARN_ON(!ceph_oid_empty(dest));
+	ceph_oid_destroy(dest);
 
 	if (src->name != src->inline_name) {
 		/* very rare, see ceph_object_id definition */
 		dest->name = kmalloc(src->name_len + 1,
 				     GFP_NOIO | __GFP_NOFAIL);
+	} else {
+		dest->name = dest->inline_name;
 	}
-
 	memcpy(dest->name, src->name, src->name_len + 1);
 	dest->name_len = src->name_len;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2d7522e0bda17eb3c7af0ffe74f03dd6c3cca443 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:45:00 +0200
Subject: libceph: advertise support for OSD_POOLRESEND

The code has been in place since commit 63244fa123a7 ("libceph:
introduce ceph_osd_request_target, calc_target()"), and, with the
ceph_{oloc,oid}_copy() issue fixed in the previous commit, is now
in working order.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 7fce9ed44af0..89c68af48539 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -197,6 +197,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_CRUSH_TUNABLES3 |		\
 	 CEPH_FEATURE_OSD_PRIMARY_AFFINITY |	\
 	 CEPH_FEATURE_MSGR_KEEPALIVE2 |		\
+	 CEPH_FEATURE_OSD_POOLRESEND |		\
 	 CEPH_FEATURE_CRUSH_V4 |		\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
 	 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
-- 
cgit v1.2.3-59-g8ed1b


From 220abf5aa7ba5f544f1b589bde33761c60bbf9a0 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:45:00 +0200
Subject: libceph: support SERVER_JEWEL feature bits

Only MON_STATEFUL_SUB, really.  MON_ROUTE_OSDMAP and
OSDSUBOP_NO_SNAPCONTEXT are irrelevant.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_features.h | 2 ++
 net/ceph/mon_client.c              | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 89c68af48539..78a58770e6e9 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -199,6 +199,8 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_MSGR_KEEPALIVE2 |		\
 	 CEPH_FEATURE_OSD_POOLRESEND |		\
 	 CEPH_FEATURE_CRUSH_V4 |		\
+	 CEPH_FEATURE_SERVER_JEWEL |		\
+	 CEPH_FEATURE_MON_STATEFUL_SUB |	\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
 	 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
 
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 250f11f78609..875675765531 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -6,6 +6,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/debugfs.h>
@@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 
 	mutex_lock(&monc->mutex);
 	if (monc->sub_renew_sent) {
+		/*
+		 * This is only needed for legacy (infernalis or older)
+		 * MONs -- see delayed_work().
+		 */
 		monc->sub_renew_after = monc->sub_renew_sent +
 					    (seconds >> 1) * HZ - 1;
 		dout("%s sent %lu duration %d renew after %lu\n", __func__,
@@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work)
 			__validate_auth(monc);
 		}
 
-		if (is_auth) {
+		if (is_auth &&
+		    !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) {
 			unsigned long now = jiffies;
 
 			dout("%s renew subs? now %lu renew after %lu\n",
-- 
cgit v1.2.3-59-g8ed1b


From dc93e0e2831de2f80817b89aae4864b88332fcce Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:45:00 +0200
Subject: libceph: fold [l]req->last_force_resend into ceph_osd_request_target

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  4 ++--
 net/ceph/osd_client.c           | 21 ++++++++++-----------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 85650b415e73..ef630ebd1169 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -148,6 +148,8 @@ struct ceph_osd_request_target {
 	unsigned int flags;                /* CEPH_OSD_FLAG_* */
 	bool paused;
 
+	u32 last_force_resend;
+
 	int osd;
 };
 
@@ -193,7 +195,6 @@ struct ceph_osd_request {
 	unsigned long r_stamp;                /* jiffies, send or check time */
 	unsigned long r_start_stamp;          /* jiffies */
 	int r_attempts;
-	u32 r_last_force_resend;
 	u32 r_map_dne_bound;
 
 	struct ceph_osd_req_op r_ops[];
@@ -221,7 +222,6 @@ struct ceph_osd_linger_request {
 	struct list_head pending_lworks;
 
 	struct ceph_osd_request_target t;
-	u32 last_force_resend;
 	u32 map_dne_bound;
 
 	struct timespec mtime;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 924f07c36ddb..aca6319b99b6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -384,6 +384,8 @@ static void target_copy(struct ceph_osd_request_target *dest,
 	dest->flags = src->flags;
 	dest->paused = src->paused;
 
+	dest->last_force_resend = src->last_force_resend;
+
 	dest->osd = src->osd;
 }
 
@@ -1311,7 +1313,6 @@ enum calc_target_result {
 
 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 					   struct ceph_osd_request_target *t,
-					   u32 *last_force_resend,
 					   bool any_change)
 {
 	struct ceph_pg_pool_info *pi;
@@ -1332,11 +1333,10 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	}
 
 	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
-		if (last_force_resend &&
-		    *last_force_resend < pi->last_force_request_resend) {
-			*last_force_resend = pi->last_force_request_resend;
+		if (t->last_force_resend < pi->last_force_request_resend) {
+			t->last_force_resend = pi->last_force_request_resend;
 			force_resend = true;
-		} else if (!last_force_resend) {
+		} else if (t->last_force_resend == 0) {
 			force_resend = true;
 		}
 	}
@@ -1645,7 +1645,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
-	ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
+	ct_res = calc_target(osdc, &req->r_t, false);
 	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
 		goto promote;
 
@@ -2441,7 +2441,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
 	struct ceph_osd_client *osdc = lreq->osdc;
 	struct ceph_osd *osd;
 
-	calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
+	calc_target(osdc, &lreq->t, false);
 	osd = lookup_create_osd(osdc, lreq->t.osd, true);
 	link_linger(osd, lreq);
 
@@ -3059,7 +3059,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
 	struct ceph_osd_client *osdc = lreq->osdc;
 	enum calc_target_result ct_res;
 
-	ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
+	ct_res = calc_target(osdc, &lreq->t, true);
 	if (ct_res == CALC_TARGET_NEED_RESEND) {
 		struct ceph_osd *osd;
 
@@ -3130,8 +3130,7 @@ static void scan_requests(struct ceph_osd *osd,
 		n = rb_next(n); /* unlink_request(), check_pool_dne() */
 
 		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
-		ct_res = calc_target(osdc, &req->r_t,
-				     &req->r_last_force_resend, false);
+		ct_res = calc_target(osdc, &req->r_t, false);
 		switch (ct_res) {
 		case CALC_TARGET_NO_ACTION:
 			force_resend_writes = cleared_full ||
@@ -3240,7 +3239,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		erase_request(need_resend, req); /* before link_request() */
 
 		WARN_ON(req->r_osd);
-		calc_target(osdc, &req->r_t, NULL, false);
+		calc_target(osdc, &req->r_t, false);
 		osd = lookup_create_osd(osdc, req->r_t.osd, true);
 		link_request(osd, req);
 		if (!req->r_linger) {
-- 
cgit v1.2.3-59-g8ed1b


From 8e48cf00c48fdefb01f70db81f31438cd0c29dcc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 5 Jun 2017 14:45:00 +0200
Subject: libceph: new pi->last_force_request_resend

The old (v15) pi->last_force_request_resend has been repurposed to
make pre-RESEND_ON_SPLIT clients that don't check for PG splits but do
obey pi->last_force_request_resend resend on splits.  See ceph.git
commit 189ca7ec6420 ("mon/OSDMonitor: make pre-luminous clients resend
ops on split").

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index bf2fcc837e36..710ee3dc01b9 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -682,11 +682,48 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 		*p += len;
 	}
 
+	/*
+	 * last_force_op_resend_preluminous, will be overridden if the
+	 * map was encoded with RESEND_ON_SPLIT
+	 */
 	if (ev >= 15)
 		pi->last_force_request_resend = ceph_decode_32(p);
 	else
 		pi->last_force_request_resend = 0;
 
+	if (ev >= 16)
+		*p += 4; /* skip min_read_recency_for_promote */
+
+	if (ev >= 17)
+		*p += 8; /* skip expected_num_objects */
+
+	if (ev >= 19)
+		*p += 4; /* skip cache_target_dirty_high_ratio_micro */
+
+	if (ev >= 20)
+		*p += 4; /* skip min_write_recency_for_promote */
+
+	if (ev >= 21)
+		*p += 1; /* skip use_gmt_hitset */
+
+	if (ev >= 22)
+		*p += 1; /* skip fast_read */
+
+	if (ev >= 23) {
+		*p += 4; /* skip hit_set_grade_decay_rate */
+		*p += 4; /* skip hit_set_search_last_n */
+	}
+
+	if (ev >= 24) {
+		/* skip opts */
+		*p += 1 + 1; /* versions */
+		len = ceph_decode_32(p);
+		*p += len;
+	}
+
+	if (ev >= 25)
+		pi->last_force_request_resend = ceph_decode_32(p);
+
 	/* ignore the rest */
 
 	*p = pool_end;
-- 
cgit v1.2.3-59-g8ed1b


From dc98ff7230e5ccf11c621dff0d590e83574a7184 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:53 +0200
Subject: libceph: introduce ceph_spg, ceph_pg_to_primary_shard()

Store both raw pgid and actual spgid in ceph_osd_request_target.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  3 ++-
 include/linux/ceph/osdmap.h     | 10 ++++++++++
 net/ceph/debugfs.c              | 11 ++++++++++-
 net/ceph/osd_client.c           |  7 +++++--
 net/ceph/osdmap.c               | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ef630ebd1169..6114f7b02135 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -136,7 +136,8 @@ struct ceph_osd_request_target {
 	struct ceph_object_id target_oid;
 	struct ceph_object_locator target_oloc;
 
-	struct ceph_pg pgid;
+	struct ceph_pg pgid;               /* last raw pg we mapped to */
+	struct ceph_spg spgid;             /* last actual spg we mapped to */
 	u32 pg_num;
 	u32 pg_num_mask;
 	struct ceph_osds acting;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 938656f70807..7ae5b416b4b6 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,6 +24,13 @@ struct ceph_pg {
 	uint32_t seed;
 };
 
+#define CEPH_SPG_NOSHARD	-1
+
+struct ceph_spg {
+	struct ceph_pg pgid;
+	s8 shard;
+};
+
 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
 
 #define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
@@ -271,6 +278,9 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting);
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid,
+			      struct ceph_spg *spgid);
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid);
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 71ba13927b3d..50ab1bdb16e2 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -147,11 +147,20 @@ static int monc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid)
+{
+	seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed);
+	if (spgid->shard != CEPH_SPG_NOSHARD)
+		seq_printf(s, "s%d", spgid->shard);
+}
+
 static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 {
 	int i;
 
-	seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+	seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed);
+	dump_spgid(s, &t->spgid);
+	seq_puts(s, "\t[");
 	for (i = 0; i < t->up.size; i++)
 		seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
 	seq_printf(s, "]/%d\t[", t->up.primary);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index aca6319b99b6..66509414f4c8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -373,6 +373,7 @@ static void target_copy(struct ceph_osd_request_target *dest,
 	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
 
 	dest->pgid = src->pgid; /* struct */
+	dest->spgid = src->spgid; /* struct */
 	dest->pg_num = src->pg_num;
 	dest->pg_num_mask = src->pg_num_mask;
 	ceph_osds_copy(&dest->acting, &src->acting);
@@ -1394,6 +1395,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	    ceph_osds_changed(&t->acting, &acting, any_change) ||
 	    force_resend) {
 		t->pgid = pgid; /* struct */
+		ceph_pg_to_primary_shard(osdc->osdmap, &pgid, &t->spgid);
 		ceph_osds_copy(&t->acting, &acting);
 		ceph_osds_copy(&t->up, &up);
 		t->size = pi->size;
@@ -1595,9 +1597,10 @@ static void send_request(struct ceph_osd_request *req)
 
 	encode_request(req, req->r_request);
 
-	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
+	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d flags 0x%x attempt %d\n",
 	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
-	     req->r_t.osd, req->r_flags, req->r_attempts);
+	     req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
+	     req->r_t.spgid.shard, osd->o_osd, req->r_flags, req->r_attempts);
 
 	req->r_t.paused = false;
 	req->r_stamp = jiffies;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 710ee3dc01b9..a4155620eace 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2265,6 +2265,39 @@ out:
 	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid,
+			      struct ceph_spg *spgid)
+{
+	struct ceph_pg_pool_info *pi;
+	struct ceph_pg pgid;
+	struct ceph_osds up, acting;
+	int i;
+
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi)
+		return false;
+
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+
+	if (ceph_can_shift_osds(pi)) {
+		spgid->pgid = pgid; /* struct */
+		spgid->shard = CEPH_SPG_NOSHARD;
+		return true;
+	}
+
+	ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting);
+	for (i = 0; i < acting.size; i++) {
+		if (acting.osds[i] == acting.primary) {
+			spgid->pgid = pgid; /* struct */
+			spgid->shard = i;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /*
  * Return acting primary for given PG, or -1 if none.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 2e59ffd1df4aba5289f04d362efc8432fac14949 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:53 +0200
Subject: libceph: encode_{pgid,oloc}() helpers

Factor out encode_{pgid,oloc}() and use ceph_encode_string() for oid.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 50 +++++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 66509414f4c8..b0298c3681b6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -540,7 +540,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
 {
 	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
 }
@@ -1485,6 +1485,28 @@ static void setup_request_data(struct ceph_osd_request *req,
 	WARN_ON(data_len != msg->data_length);
 }
 
+static void encode_pgid(void **p, const struct ceph_pg *pgid)
+{
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, pgid->pool);
+	ceph_encode_32(p, pgid->seed);
+	ceph_encode_32(p, -1); /* preferred */
+}
+
+static void encode_oloc(void **p, void *end,
+			const struct ceph_object_locator *oloc)
+{
+	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
+	ceph_encode_64(p, oloc->pool);
+	ceph_encode_32(p, -1); /* preferred */
+	ceph_encode_32(p, 0);  /* key len */
+	if (oloc->pool_ns)
+		ceph_encode_string(p, end, oloc->pool_ns->str,
+				   oloc->pool_ns->len);
+	else
+		ceph_encode_32(p, 0);
+}
+
 static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
 	void *p = msg->front.iov_base;
@@ -1512,28 +1534,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
 	memset(p, 0, sizeof(struct ceph_eversion));
 	p += sizeof(struct ceph_eversion);
 
-	/* oloc */
-	ceph_start_encoding(&p, 5, 4,
-			    ceph_oloc_encoding_size(&req->r_t.target_oloc));
-	ceph_encode_64(&p, req->r_t.target_oloc.pool);
-	ceph_encode_32(&p, -1); /* preferred */
-	ceph_encode_32(&p, 0); /* key len */
-	if (req->r_t.target_oloc.pool_ns)
-		ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
-				   req->r_t.target_oloc.pool_ns->len);
-	else
-		ceph_encode_32(&p, 0);
-
-	/* pgid */
-	ceph_encode_8(&p, 1);
-	ceph_encode_64(&p, req->r_t.pgid.pool);
-	ceph_encode_32(&p, req->r_t.pgid.seed);
-	ceph_encode_32(&p, -1); /* preferred */
-
-	/* oid */
-	ceph_encode_32(&p, req->r_t.target_oid.name_len);
-	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
-	p += req->r_t.target_oid.name_len;
+	encode_oloc(&p, end, &req->r_t.target_oloc);
+	encode_pgid(&p, &req->r_t.pgid);
+	ceph_encode_string(&p, end, req->r_t.target_oid.name,
+			   req->r_t.target_oid.name_len);
 
 	/* ops, can imply data */
 	ceph_encode_16(&p, req->r_num_ops);
-- 
cgit v1.2.3-59-g8ed1b


From 98ad5ebd1505eb903ae8bc27e94c1ab0d1c3e651 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:54 +0200
Subject: libceph: ceph_connection_operations::reencode_message() method

Give upper layers a chance to reencode the message after the connection
is negotiated and ->peer_features is set.  OSD client will use this to
support both luminous and pre-luminous OSDs (in a single cluster): the
former need MOSDOp v8; the latter will continue to be sent MOSDOp v4.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 2 ++
 net/ceph/messenger.c           | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index c5c4c713e00f..fbd94d9fa5dd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -44,6 +44,8 @@ struct ceph_connection_operations {
 					struct ceph_msg_header *hdr,
 					int *skip);
 
+	void (*reencode_message) (struct ceph_msg *msg);
+
 	int (*sign_message) (struct ceph_msg *msg);
 	int (*check_message_signature) (struct ceph_msg *msg);
 };
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9daed2540639..0c31035bbfee 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1288,13 +1288,16 @@ static void prepare_write_message(struct ceph_connection *con)
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
 	}
-	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+	if (con->ops->reencode_message)
+		con->ops->reencode_message(m);
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
 	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
 	     m->data_length);
-	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
 
 	/* tag + hdr + front + middle */
 	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-- 
cgit v1.2.3-59-g8ed1b


From 8cb441c0545dfd4dafeedc1e2d7157e1072413ac Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:54 +0200
Subject: libceph: MOSDOp v8 encoding (actual spgid + full hash)

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  17 +++++
 include/linux/ceph/osdmap.h     |   4 +-
 net/ceph/osd_client.c           | 153 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 154 insertions(+), 20 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6114f7b02135..bca2718ac253 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -205,6 +205,23 @@ struct ceph_request_redirect {
 	struct ceph_object_locator oloc;
 };
 
+/*
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to unique identify this request
+ */
+struct ceph_osd_reqid {
+	struct ceph_entity_name name;
+	__le64 tid;
+	__le32 inc;
+} __packed;
+
+struct ceph_blkin_trace_info {
+	__le64 trace_id;
+	__le64 span_id;
+	__le64 parent_span_id;
+} __packed;
+
 typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
 				 u64 notifier_id, void *data, size_t data_len);
 typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 7ae5b416b4b6..66447fc7f334 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -205,11 +205,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
 	return &map->osd_addr[osd];
 }
 
+#define CEPH_PGID_ENCODING_LEN		(1 + 8 + 4 + 4)
+
 static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 {
 	__u8 version;
 
-	if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
+	if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
 		pr_warn("incomplete pg encoding\n");
 		return -EINVAL;
 	}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b0298c3681b6..eaaf17e7c37b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -12,6 +12,7 @@
 #include <linux/bio.h>
 #endif
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/messenger.h>
@@ -555,17 +556,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
 	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
 
 	/* create request message */
-	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
-	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
+	msg_size = CEPH_ENCODING_START_BLK_LEN +
+			CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
+	msg_size += CEPH_ENCODING_START_BLK_LEN +
+			sizeof(struct ceph_osd_reqid); /* reqid */
+	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
+	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
 	msg_size += CEPH_ENCODING_START_BLK_LEN +
 			ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
-	msg_size += 1 + 8 + 4 + 4; /* pgid */
 	msg_size += 4 + req->r_base_oid.name_len; /* oid */
 	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
 	msg_size += 8; /* snapid */
 	msg_size += 8; /* snap_seq */
 	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
-	msg_size += 4; /* retry_attempt */
+	msg_size += 4 + 8; /* retry_attempt, features */
 
 	if (req->r_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -1493,6 +1498,13 @@ static void encode_pgid(void **p, const struct ceph_pg *pgid)
 	ceph_encode_32(p, -1); /* preferred */
 }
 
+static void encode_spgid(void **p, const struct ceph_spg *spgid)
+{
+	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
+	encode_pgid(p, &spgid->pgid);
+	ceph_encode_8(p, spgid->shard);
+}
+
 static void encode_oloc(void **p, void *end,
 			const struct ceph_object_locator *oloc)
 {
@@ -1507,7 +1519,8 @@ static void encode_oloc(void **p, void *end,
 		ceph_encode_32(p, 0);
 }
 
-static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void encode_request_partial(struct ceph_osd_request *req,
+				   struct ceph_msg *msg)
 {
 	void *p = msg->front.iov_base;
 	void *const end = p + msg->front_alloc_len;
@@ -1524,18 +1537,25 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
 
 	setup_request_data(req, msg);
 
-	ceph_encode_32(&p, 1); /* client_inc, always 1 */
+	encode_spgid(&p, &req->r_t.spgid); /* actual spg */
+	ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
 	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
 	ceph_encode_32(&p, req->r_flags);
+
+	/* reqid */
+	ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
+	memset(p, 0, sizeof(struct ceph_osd_reqid));
+	p += sizeof(struct ceph_osd_reqid);
+
+	/* trace */
+	memset(p, 0, sizeof(struct ceph_blkin_trace_info));
+	p += sizeof(struct ceph_blkin_trace_info);
+
+	ceph_encode_32(&p, 0); /* client_inc, always 0 */
 	ceph_encode_timespec(p, &req->r_mtime);
 	p += sizeof(struct ceph_timespec);
 
-	/* reassert_version */
-	memset(p, 0, sizeof(struct ceph_eversion));
-	p += sizeof(struct ceph_eversion);
-
 	encode_oloc(&p, end, &req->r_t.target_oloc);
-	encode_pgid(&p, &req->r_t.pgid);
 	ceph_encode_string(&p, end, req->r_t.target_oid.name,
 			   req->r_t.target_oid.name_len);
 
@@ -1558,11 +1578,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
 	}
 
 	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
+	BUG_ON(p != end - 8); /* space for features */
 
-	BUG_ON(p > end);
-	msg->front.iov_len = p - msg->front.iov_base;
-	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
-	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
+	/* front_len is finalized in encode_request_finish() */
 	msg->hdr.data_len = cpu_to_le32(data_len);
 	/*
 	 * The header "data_off" is a hint to the receiver allowing it
@@ -1571,9 +1590,99 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
 	 */
 	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
 
-	dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__,
-	     req, req->r_t.target_oid.name, req->r_t.target_oid.name_len,
-	     msg->front.iov_len, data_len);
+	dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
+	     req->r_t.target_oid.name, req->r_t.target_oid.name_len);
+}
+
+static void encode_request_finish(struct ceph_msg *msg)
+{
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front_alloc_len;
+
+	if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
+		/* luminous OSD -- encode features and be done */
+		p = end - 8;
+		ceph_encode_64(&p, msg->con->peer_features);
+	} else {
+		struct {
+			char spgid[CEPH_ENCODING_START_BLK_LEN +
+				   CEPH_PGID_ENCODING_LEN + 1];
+			__le32 hash;
+			__le32 epoch;
+			__le32 flags;
+			char reqid[CEPH_ENCODING_START_BLK_LEN +
+				   sizeof(struct ceph_osd_reqid)];
+			char trace[sizeof(struct ceph_blkin_trace_info)];
+			__le32 client_inc;
+			struct ceph_timespec mtime;
+		} __packed head;
+		struct ceph_pg pgid;
+		void *oloc, *oid, *tail;
+		int oloc_len, oid_len, tail_len;
+		int len;
+
+		/*
+		 * Pre-luminous OSD -- reencode v8 into v4 using @head
+		 * as a temporary buffer.  Encode the raw PG; the rest
+		 * is just a matter of moving oloc, oid and tail blobs
+		 * around.
+		 */
+		memcpy(&head, p, sizeof(head));
+		p += sizeof(head);
+
+		oloc = p;
+		p += CEPH_ENCODING_START_BLK_LEN;
+		pgid.pool = ceph_decode_64(&p);
+		p += 4 + 4; /* preferred, key len */
+		len = ceph_decode_32(&p);
+		p += len;   /* nspace */
+		oloc_len = p - oloc;
+
+		oid = p;
+		len = ceph_decode_32(&p);
+		p += len;
+		oid_len = p - oid;
+
+		tail = p;
+		tail_len = (end - p) - 8;
+
+		p = msg->front.iov_base;
+		ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
+		ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
+		ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
+		ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
+
+		/* reassert_version */
+		memset(p, 0, sizeof(struct ceph_eversion));
+		p += sizeof(struct ceph_eversion);
+
+		BUG_ON(p >= oloc);
+		memmove(p, oloc, oloc_len);
+		p += oloc_len;
+
+		pgid.seed = le32_to_cpu(head.hash);
+		encode_pgid(&p, &pgid); /* raw pg */
+
+		BUG_ON(p >= oid);
+		memmove(p, oid, oid_len);
+		p += oid_len;
+
+		/* tail -- ops, snapid, snapc, retry_attempt */
+		BUG_ON(p >= tail);
+		memmove(p, tail, tail_len);
+		p += tail_len;
+
+		msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
+	}
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+	dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
+	     le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
+	     le16_to_cpu(msg->hdr.version));
 }
 
 /*
@@ -1599,7 +1708,7 @@ static void send_request(struct ceph_osd_request *req)
 	else
 		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
 
-	encode_request(req, req->r_request);
+	encode_request_partial(req, req->r_request);
 
 	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d flags 0x%x attempt %d\n",
 	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
@@ -4577,6 +4686,11 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
+static void osd_reencode_message(struct ceph_msg *msg)
+{
+	encode_request_finish(msg);
+}
+
 static int osd_sign_message(struct ceph_msg *msg)
 {
 	struct ceph_osd *o = msg->con->private;
@@ -4601,6 +4715,7 @@ static const struct ceph_connection_operations osd_con_ops = {
 	.verify_authorizer_reply = verify_authorizer_reply,
 	.invalidate_authorizer = invalidate_authorizer,
 	.alloc_msg = alloc_msg,
+	.reencode_message = osd_reencode_message,
 	.sign_message = osd_sign_message,
 	.check_message_signature = osd_check_message_signature,
 	.fault = osd_fault,
-- 
cgit v1.2.3-59-g8ed1b


From 84ed45df4604ae9b1065b5fe2f250f57f7c69baf Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:54 +0200
Subject: libceph: drop need_resend from calc_target()

Replace it with more fine-grained bools to separate updating
ceph_osd_request_target fields and the decision to resend.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eaaf17e7c37b..4143f73590f3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1325,8 +1325,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	struct ceph_pg pgid, last_pgid;
 	struct ceph_osds up, acting;
 	bool force_resend = false;
+	bool unpaused = false;
+	bool legacy_change;
 	bool need_check_tiering = false;
-	bool need_resend = false;
 	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
 	enum calc_target_result ct_res;
 	int ret;
@@ -1393,12 +1394,12 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 
 	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
 		t->paused = false;
-		need_resend = true;
+		unpaused = true;
 	}
+	legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
+			ceph_osds_changed(&t->acting, &acting, any_change);
 
-	if (ceph_pg_compare(&t->pgid, &pgid) ||
-	    ceph_osds_changed(&t->acting, &acting, any_change) ||
-	    force_resend) {
+	if (legacy_change || force_resend) {
 		t->pgid = pgid; /* struct */
 		ceph_pg_to_primary_shard(osdc->osdmap, &pgid, &t->spgid);
 		ceph_osds_copy(&t->acting, &acting);
@@ -1410,10 +1411,13 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		t->sort_bitwise = sort_bitwise;
 
 		t->osd = acting.primary;
-		need_resend = true;
 	}
 
-	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+	if (unpaused || legacy_change || force_resend)
+		ct_res = CALC_TARGET_NEED_RESEND;
+	else
+		ct_res = CALC_TARGET_NO_ACTION;
+
 out:
 	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
 	return ct_res;
-- 
cgit v1.2.3-59-g8ed1b


From 7de030d6b10a56e991312a978ace6be3c090097c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:54 +0200
Subject: libceph: resend on PG splits if OSD has RESEND_ON_SPLIT

Note that ceph_osd_request_target fields are updated regardless of
RESEND_ON_SPLIT.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  2 ++
 net/ceph/osd_client.c       | 21 ++++++++++++++-------
 net/ceph/osdmap.c           |  7 +++----
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 66447fc7f334..63fb073a3355 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -249,6 +249,8 @@ static inline void ceph_osds_init(struct ceph_osds *set)
 
 void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
 
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+		      u32 new_pg_num);
 bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 			  const struct ceph_osds *new_acting,
 			  const struct ceph_osds *old_up,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4143f73590f3..518dbac599d0 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1319,6 +1319,7 @@ enum calc_target_result {
 
 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 					   struct ceph_osd_request_target *t,
+					   struct ceph_connection *con,
 					   bool any_change)
 {
 	struct ceph_pg_pool_info *pi;
@@ -1327,6 +1328,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	bool force_resend = false;
 	bool unpaused = false;
 	bool legacy_change;
+	bool split = false;
 	bool need_check_tiering = false;
 	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
 	enum calc_target_result ct_res;
@@ -1398,8 +1400,10 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	}
 	legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
 			ceph_osds_changed(&t->acting, &acting, any_change);
+	if (t->pg_num)
+		split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
 
-	if (legacy_change || force_resend) {
+	if (legacy_change || force_resend || split) {
 		t->pgid = pgid; /* struct */
 		ceph_pg_to_primary_shard(osdc->osdmap, &pgid, &t->spgid);
 		ceph_osds_copy(&t->acting, &acting);
@@ -1413,7 +1417,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		t->osd = acting.primary;
 	}
 
-	if (unpaused || legacy_change || force_resend)
+	if (unpaused || legacy_change || force_resend ||
+	    (split && con && CEPH_HAVE_FEATURE(con->peer_features,
+					       RESEND_ON_SPLIT)))
 		ct_res = CALC_TARGET_NEED_RESEND;
 	else
 		ct_res = CALC_TARGET_NO_ACTION;
@@ -1765,7 +1771,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 	dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
 
 again:
-	ct_res = calc_target(osdc, &req->r_t, false);
+	ct_res = calc_target(osdc, &req->r_t, NULL, false);
 	if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
 		goto promote;
 
@@ -2561,7 +2567,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
 	struct ceph_osd_client *osdc = lreq->osdc;
 	struct ceph_osd *osd;
 
-	calc_target(osdc, &lreq->t, false);
+	calc_target(osdc, &lreq->t, NULL, false);
 	osd = lookup_create_osd(osdc, lreq->t.osd, true);
 	link_linger(osd, lreq);
 
@@ -3179,7 +3185,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
 	struct ceph_osd_client *osdc = lreq->osdc;
 	enum calc_target_result ct_res;
 
-	ct_res = calc_target(osdc, &lreq->t, true);
+	ct_res = calc_target(osdc, &lreq->t, NULL, true);
 	if (ct_res == CALC_TARGET_NEED_RESEND) {
 		struct ceph_osd *osd;
 
@@ -3250,7 +3256,8 @@ static void scan_requests(struct ceph_osd *osd,
 		n = rb_next(n); /* unlink_request(), check_pool_dne() */
 
 		dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
-		ct_res = calc_target(osdc, &req->r_t, false);
+		ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
+				     false);
 		switch (ct_res) {
 		case CALC_TARGET_NO_ACTION:
 			force_resend_writes = cleared_full ||
@@ -3359,7 +3366,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		erase_request(need_resend, req); /* before link_request() */
 
 		WARN_ON(req->r_osd);
-		calc_target(osdc, &req->r_t, false);
+		calc_target(osdc, &req->r_t, NULL, false);
 		osd = lookup_create_osd(osdc, req->r_t.osd, true);
 		link_request(osd, req);
 		if (!req->r_linger) {
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index a4155620eace..367879afed58 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1753,9 +1753,8 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
 	dest->primary = src->primary;
 }
 
-static bool is_split(const struct ceph_pg *pgid,
-		     u32 old_pg_num,
-		     u32 new_pg_num)
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+		      u32 new_pg_num)
 {
 	int old_bits = calc_bits_of(old_pg_num);
 	int old_mask = (1 << old_bits) - 1;
@@ -1800,7 +1799,7 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 	       !osds_equal(old_up, new_up) ||
 	       old_size != new_size ||
 	       old_min_size != new_min_size ||
-	       is_split(pgid, old_pg_num, new_pg_num) ||
+	       ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
 	       old_sort_bitwise != new_sort_bitwise;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a10bcb19ae02cea7d5e6650fbc2de3ced46b4e5d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:55 +0200
Subject: libceph: delete from need_resend_linger before
 check_linger_pool_dne()

When processing a map update consisting of multiple incrementals, we
may end up running check_linger_pool_dne() on a lingering request that
was previously added to need_resend_linger list.  If it is concluded
that the target pool doesn't exist, the request is killed off while
still on need_resend_linger list, which leads to a crash on a NULL
lreq->osd in kick_requests():

    libceph: linger_id 18446462598732840961 pool does not exist
    BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
    IP: ceph_osdc_handle_map+0x4ae/0x870

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 518dbac599d0..576101b635ef 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3243,6 +3243,7 @@ static void scan_requests(struct ceph_osd *osd,
 				list_add_tail(&lreq->scan_item, need_resend_linger);
 			break;
 		case CALC_TARGET_POOL_DNE:
+			list_del_init(&lreq->scan_item);
 			check_linger_pool_dne(lreq);
 			break;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 04c7d789e269c2b82bbd08106049a5a979cdb3fd Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:55 +0200
Subject: libceph: make sure need_resend targets reflect latest map

Otherwise we may miss events like PG splits, pool deletions, etc when
we get multiple incremental maps at once.  Because check_pool_dne() can
now be fed an unlinked request, finish_request() needed to be taught to
handle unlinked requests.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/debugfs.c              |  2 +-
 net/ceph/osd_client.c           | 33 +++++++++++++++++++++++++--------
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index bca2718ac253..62c672bcbb31 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -149,6 +149,7 @@ struct ceph_osd_request_target {
 	unsigned int flags;                /* CEPH_OSD_FLAG_* */
 	bool paused;
 
+	u32 epoch;
 	u32 last_force_resend;
 
 	int osd;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 50ab1bdb16e2..c0089f8ccaeb 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -166,7 +166,7 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 	seq_printf(s, "]/%d\t[", t->up.primary);
 	for (i = 0; i < t->acting.size; i++)
 		seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
-	seq_printf(s, "]/%d\t", t->acting.primary);
+	seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch);
 	if (t->target_oloc.pool_ns) {
 		seq_printf(s, "%*pE/%*pE\t0x%x",
 			(int)t->target_oloc.pool_ns->len,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 576101b635ef..173ab9c68eb6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -386,6 +386,7 @@ static void target_copy(struct ceph_osd_request_target *dest,
 	dest->flags = src->flags;
 	dest->paused = src->paused;
 
+	dest->epoch = src->epoch;
 	dest->last_force_resend = src->last_force_resend;
 
 	dest->osd = src->osd;
@@ -1334,6 +1335,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	enum calc_target_result ct_res;
 	int ret;
 
+	t->epoch = osdc->osdmap->epoch;
 	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
 	if (!pi) {
 		t->osd = CEPH_HOMELESS_OSD;
@@ -1720,10 +1722,11 @@ static void send_request(struct ceph_osd_request *req)
 
 	encode_request_partial(req, req->r_request);
 
-	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d flags 0x%x attempt %d\n",
+	dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
 	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
 	     req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
-	     req->r_t.spgid.shard, osd->o_osd, req->r_flags, req->r_attempts);
+	     req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
+	     req->r_attempts);
 
 	req->r_t.paused = false;
 	req->r_stamp = jiffies;
@@ -1863,13 +1866,12 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
 static void finish_request(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
-	struct ceph_osd *osd = req->r_osd;
 
-	verify_osd_locked(osd);
+	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
 	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
 
-	WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
-	unlink_request(osd, req);
+	if (req->r_osd)
+		unlink_request(req->r_osd, req);
 	atomic_dec(&osdc->num_requests);
 
 	/*
@@ -3356,8 +3358,25 @@ static void kick_requests(struct ceph_osd_client *osdc,
 			  struct list_head *need_resend_linger)
 {
 	struct ceph_osd_linger_request *lreq, *nlreq;
+	enum calc_target_result ct_res;
 	struct rb_node *n;
 
+	/* make sure need_resend targets reflect latest map */
+	for (n = rb_first(need_resend); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		n = rb_next(n);
+
+		if (req->r_t.epoch < osdc->osdmap->epoch) {
+			ct_res = calc_target(osdc, &req->r_t, NULL, false);
+			if (ct_res == CALC_TARGET_POOL_DNE) {
+				erase_request(need_resend, req);
+				check_pool_dne(req);
+			}
+		}
+	}
+
 	for (n = rb_first(need_resend); n; ) {
 		struct ceph_osd_request *req =
 		    rb_entry(n, struct ceph_osd_request, r_node);
@@ -3366,8 +3385,6 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		n = rb_next(n);
 		erase_request(need_resend, req); /* before link_request() */
 
-		WARN_ON(req->r_osd);
-		calc_target(osdc, &req->r_t, NULL, false);
 		osd = lookup_create_osd(osdc, req->r_t.osd, true);
 		link_request(osd, req);
 		if (!req->r_linger) {
-- 
cgit v1.2.3-59-g8ed1b


From db098ec4e41a6ad8f5b248b1287bb583754e944e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:55 +0200
Subject: libceph: always populate t->target_{oid,oloc} in calc_target()

need_check_tiering logic doesn't make a whole lot of sense.  Drop it
and apply tiering unconditionally on every calc_target() call instead.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 173ab9c68eb6..c050e28d3c97 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1330,7 +1330,6 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	bool unpaused = false;
 	bool legacy_change;
 	bool split = false;
-	bool need_check_tiering = false;
 	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
 	enum calc_target_result ct_res;
 	int ret;
@@ -1351,17 +1350,11 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 			force_resend = true;
 		}
 	}
-	if (ceph_oid_empty(&t->target_oid) || force_resend) {
-		ceph_oid_copy(&t->target_oid, &t->base_oid);
-		need_check_tiering = true;
-	}
-	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
-		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
-		need_check_tiering = true;
-	}
 
-	if (need_check_tiering &&
-	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+	/* apply tiering */
+	ceph_oid_copy(&t->target_oid, &t->base_oid);
+	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
 		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
 			t->target_oloc.pool = pi->read_tier;
 		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
-- 
cgit v1.2.3-59-g8ed1b


From 6d637a540e6f87b9c926d45a507ff143640517cf Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:55 +0200
Subject: libceph: use target pi for calc_target() calculations

For luminous and beyond we are encoding the actual spgid, which
requires operating with the correct pg_num, i.e. that of the target
pool.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index c050e28d3c97..ce0055dc7544 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1306,7 +1306,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
 		       ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
 		       __pool_full(pi);
 
-	WARN_ON(pi->id != t->base_oloc.pool);
+	WARN_ON(pi->id != t->target_oloc.pool);
 	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
 	       ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
 	       (osdc->osdmap->epoch < osdc->epoch_barrier);
@@ -1359,6 +1359,13 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 			t->target_oloc.pool = pi->read_tier;
 		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
 			t->target_oloc.pool = pi->write_tier;
+
+		pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
+		if (!pi) {
+			t->osd = CEPH_HOMELESS_OSD;
+			ct_res = CALC_TARGET_POOL_DNE;
+			goto out;
+		}
 	}
 
 	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
-- 
cgit v1.2.3-59-g8ed1b


From df28152d53b449a72258000f592472215fc9371e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 15 Jun 2017 16:30:56 +0200
Subject: libceph: avoid unnecessary pi lookups in calc_target()

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h | 10 +++++++--
 net/ceph/osd_client.c       |  8 +++----
 net/ceph/osdmap.c           | 54 +++++++++++++++++++++++++--------------------
 3 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 63fb073a3355..060d059acbf8 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -273,16 +273,22 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+				const struct ceph_object_id *oid,
+				const struct ceph_object_locator *oloc,
+				struct ceph_pg *raw_pgid);
 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
-			      struct ceph_object_id *oid,
-			      struct ceph_object_locator *oloc,
+			      const struct ceph_object_id *oid,
+			      const struct ceph_object_locator *oloc,
 			      struct ceph_pg *raw_pgid);
 
 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_pool_info *pi,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting);
 bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      struct ceph_pg_pool_info *pi,
 			      const struct ceph_pg *raw_pgid,
 			      struct ceph_spg *spgid);
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ce0055dc7544..620aa43a6a0a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1368,8 +1368,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		}
 	}
 
-	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
-					&t->target_oloc, &pgid);
+	ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc,
+					  &pgid);
 	if (ret) {
 		WARN_ON(ret != -ENOENT);
 		t->osd = CEPH_HOMELESS_OSD;
@@ -1379,7 +1379,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	last_pgid.pool = pgid.pool;
 	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
 
-	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+	ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
 	if (any_change &&
 	    ceph_is_new_interval(&t->acting,
 				 &acting,
@@ -1407,7 +1407,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 
 	if (legacy_change || force_resend || split) {
 		t->pgid = pgid; /* struct */
-		ceph_pg_to_primary_shard(osdc->osdmap, &pgid, &t->spgid);
+		ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
 		ceph_osds_copy(&t->acting, &acting);
 		ceph_osds_copy(&t->up, &up);
 		t->size = pi->size;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 367879afed58..1e2e190a4c2a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1923,16 +1923,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  * Should only be called with target_oid and target_oloc (as opposed to
  * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
-			      struct ceph_object_id *oid,
-			      struct ceph_object_locator *oloc,
-			      struct ceph_pg *raw_pgid)
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+				const struct ceph_object_id *oid,
+				const struct ceph_object_locator *oloc,
+				struct ceph_pg *raw_pgid)
 {
-	struct ceph_pg_pool_info *pi;
-
-	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
-	if (!pi)
-		return -ENOENT;
+	WARN_ON(pi->id != oloc->pool);
 
 	if (!oloc->pool_ns) {
 		raw_pgid->pool = oloc->pool;
@@ -1964,6 +1960,20 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
 	}
 	return 0;
 }
+
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+			      const struct ceph_object_id *oid,
+			      const struct ceph_object_locator *oloc,
+			      struct ceph_pg *raw_pgid)
+{
+	struct ceph_pg_pool_info *pi;
+
+	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
+	if (!pi)
+		return -ENOENT;
+
+	return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
+}
 EXPORT_SYMBOL(ceph_object_locator_to_pg);
 
 /*
@@ -2236,19 +2246,14 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
  * resend a request.
  */
 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_pool_info *pi,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pi;
 	u32 pps;
 
-	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
-	if (!pi) {
-		ceph_osds_init(up);
-		ceph_osds_init(acting);
-		goto out;
-	}
+	WARN_ON(pi->id != raw_pgid->pool);
 
 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
 	raw_to_up_osds(osdmap, pi, up);
@@ -2260,23 +2265,19 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
 		if (acting->primary == -1)
 			acting->primary = up->primary;
 	}
-out:
 	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      struct ceph_pg_pool_info *pi,
 			      const struct ceph_pg *raw_pgid,
 			      struct ceph_spg *spgid)
 {
-	struct ceph_pg_pool_info *pi;
 	struct ceph_pg pgid;
 	struct ceph_osds up, acting;
 	int i;
 
-	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
-	if (!pi)
-		return false;
-
+	WARN_ON(pi->id != raw_pgid->pool);
 	raw_pg_to_pg(pi, raw_pgid, &pgid);
 
 	if (ceph_can_shift_osds(pi)) {
@@ -2285,7 +2286,7 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
 		return true;
 	}
 
-	ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting);
+	ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
 	for (i = 0; i < acting.size; i++) {
 		if (acting.osds[i] == acting.primary) {
 			spgid->pgid = pgid; /* struct */
@@ -2303,9 +2304,14 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid)
 {
+	struct ceph_pg_pool_info *pi;
 	struct ceph_osds up, acting;
 
-	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi)
+		return -1;
+
+	ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
 	return acting.primary;
 }
 EXPORT_SYMBOL(ceph_pg_to_acting_primary);
-- 
cgit v1.2.3-59-g8ed1b


From 76f827a7b1faaaebc53f89d184e95ea3a0b8dd71 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 19 Jun 2017 12:18:05 +0200
Subject: libceph: make DEFINE_RB_* helpers more general

Initially for ceph_pg_mapping, ceph_spg_mapping and ceph_hobject_id,
compared with ceph_pg_compare(), ceph_spg_compare() and hoid_compare()
respectively.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h | 49 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 3229ae6c7846..8a79587e1317 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len)
 		(off >> PAGE_SHIFT);
 }
 
-/*
- * These are not meant to be generic - an integer key is assumed.
- */
-#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
+#define RB_BYVAL(a)      (a)
+#define RB_BYPTR(a)      (&(a))
+#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
+
+#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
 static void insert_##name(struct rb_root *root, type *t)		\
 {									\
 	struct rb_node **n = &root->rb_node;				\
@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t)		\
 									\
 	while (*n) {							\
 		type *cur = rb_entry(*n, type, nodefld);		\
+		int cmp;						\
 									\
 		parent = *n;						\
-		if (t->keyfld < cur->keyfld)				\
+		cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld));	\
+		if (cmp < 0)						\
 			n = &(*n)->rb_left;				\
-		else if (t->keyfld > cur->keyfld)			\
+		else if (cmp > 0)					\
 			n = &(*n)->rb_right;				\
 		else							\
 			BUG();						\
@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t)			\
 	RB_CLEAR_NODE(&t->nodefld);					\
 }
 
-#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
-extern type __lookup_##name##_key;					\
-static type *lookup_##name(struct rb_root *root,			\
-			   typeof(__lookup_##name##_key.keyfld) key)	\
+/*
+ * @lookup_param_type is a parameter and not constructed from (@type,
+ * @keyfld) with typeof() because adding const is too unwieldy.
+ */
+#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp,	\
+			       lookup_param_type, nodefld)		\
+static type *lookup_##name(struct rb_root *root, lookup_param_type key)	\
 {									\
 	struct rb_node *n = root->rb_node;				\
 									\
 	while (n) {							\
 		type *cur = rb_entry(n, type, nodefld);			\
+		int cmp;						\
 									\
-		if (key < cur->keyfld)					\
+		cmp = cmpexp(key, keyexp(cur->keyfld));			\
+		if (cmp < 0)						\
 			n = n->rb_left;					\
-		else if (key > cur->keyfld)				\
+		else if (cmp > 0)					\
 			n = n->rb_right;				\
 		else							\
 			return cur;					\
@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root,			\
 	return NULL;							\
 }
 
+#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp,		\
+			 lookup_param_type, nodefld)			\
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld)	\
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp,		\
+		       lookup_param_type, nodefld)
+
+/*
+ * Shorthands for integer keys.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
+extern type __lookup_##name##_key;					\
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL,	\
+		       typeof(__lookup_##name##_key.keyfld), nodefld)
+
 #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)			\
 DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)			\
 DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
-- 
cgit v1.2.3-59-g8ed1b


From a02a946dfe9633d7e0202359836f6b5217a62824 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 19 Jun 2017 12:18:05 +0200
Subject: libceph: respect RADOS_BACKOFF backoffs

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h    |   1 +
 include/linux/ceph/osd_client.h |  45 +++
 include/linux/ceph/osdmap.h     |   1 +
 include/linux/ceph/rados.h      |   6 +
 net/ceph/ceph_common.c          |   1 +
 net/ceph/debugfs.c              |  74 +++++
 net/ceph/osd_client.c           | 593 ++++++++++++++++++++++++++++++++++++++++
 net/ceph/osdmap.c               |  16 ++
 8 files changed, 737 insertions(+)

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ad078ebe25d6..edf5b04b918a 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -147,6 +147,7 @@ struct ceph_dir_layout {
 #define CEPH_MSG_OSD_OP                 42
 #define CEPH_MSG_OSD_OPREPLY            43
 #define CEPH_MSG_WATCH_NOTIFY           44
+#define CEPH_MSG_OSD_BACKOFF            61
 
 
 /* watch-notify operations */
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 62c672bcbb31..c6d96a5f46fd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_OSD_CLIENT_H
 #define _FS_CEPH_OSD_CLIENT_H
 
+#include <linux/bitrev.h>
 #include <linux/completion.h>
 #include <linux/kref.h>
 #include <linux/mempool.h>
@@ -36,6 +37,8 @@ struct ceph_osd {
 	struct ceph_connection o_con;
 	struct rb_root o_requests;
 	struct rb_root o_linger_requests;
+	struct rb_root o_backoff_mappings;
+	struct rb_root o_backoffs_by_id;
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
@@ -275,6 +278,48 @@ struct ceph_watch_item {
 	struct ceph_entity_addr addr;
 };
 
+struct ceph_spg_mapping {
+	struct rb_node node;
+	struct ceph_spg spgid;
+
+	struct rb_root backoffs;
+};
+
+struct ceph_hobject_id {
+	void *key;
+	size_t key_len;
+	void *oid;
+	size_t oid_len;
+	u64 snapid;
+	u32 hash;
+	u8 is_max;
+	void *nspace;
+	size_t nspace_len;
+	s64 pool;
+
+	/* cache */
+	u32 hash_reverse_bits;
+};
+
+static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
+{
+	hoid->hash_reverse_bits = bitrev32(hoid->hash);
+}
+
+/*
+ * PG-wide backoff: [begin, end)
+ * per-object backoff: begin == end
+ */
+struct ceph_osd_backoff {
+	struct rb_node spg_node;
+	struct rb_node id_node;
+
+	struct ceph_spg spgid;
+	u64 id;
+	struct ceph_hobject_id *begin;
+	struct ceph_hobject_id *end;
+};
+
 #define CEPH_LINGER_ID_START	0xffff000000000000ULL
 
 struct ceph_osd_client {
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 060d059acbf8..fe6d189bdd30 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -32,6 +32,7 @@ struct ceph_spg {
 };
 
 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
 
 #define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
 						       together */
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5d0018782d50..385db08bb8b2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -439,6 +439,12 @@ enum {
 
 const char *ceph_osd_watch_op_name(int o);
 
+enum {
+	CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+	CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+	CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
 /*
  * an individual object operation.  each may be accompanied by some data
  * payload
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 47e94b560ba0..3d265c5cb6d0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
 	case CEPH_MSG_OSD_OP: return "osd_op";
 	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
 	case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+	case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
 	default: return "unknown";
 	}
 }
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index c0089f8ccaeb..017f15c575f8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -243,6 +243,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
 	mutex_unlock(&osd->lock);
 }
 
+static void dump_snapid(struct seq_file *s, u64 snapid)
+{
+	if (snapid == CEPH_NOSNAP)
+		seq_puts(s, "head");
+	else if (snapid == CEPH_SNAPDIR)
+		seq_puts(s, "snapdir");
+	else
+		seq_printf(s, "%llx", snapid);
+}
+
+static void dump_name_escaped(struct seq_file *s, unsigned char *name,
+			      size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (name[i] == '%' || name[i] == ':' || name[i] == '/' ||
+		    name[i] < 32 || name[i] >= 127) {
+			seq_printf(s, "%%%02x", name[i]);
+		} else {
+			seq_putc(s, name[i]);
+		}
+	}
+}
+
+static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
+{
+	if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max &&
+	    hoid->pool == S64_MIN) {
+		seq_puts(s, "MIN");
+		return;
+	}
+	if (hoid->is_max) {
+		seq_puts(s, "MAX");
+		return;
+	}
+	seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);
+	dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
+	seq_putc(s, ':');
+	dump_name_escaped(s, hoid->key, hoid->key_len);
+	seq_putc(s, ':');
+	dump_name_escaped(s, hoid->oid, hoid->oid_len);
+	seq_putc(s, ':');
+	dump_snapid(s, hoid->snapid);
+}
+
+static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) {
+		struct ceph_osd_backoff *backoff =
+		    rb_entry(n, struct ceph_osd_backoff, id_node);
+
+		seq_printf(s, "osd%d\t", osd->o_osd);
+		dump_spgid(s, &backoff->spgid);
+		seq_printf(s, "\t%llu\t", backoff->id);
+		dump_hoid(s, backoff->begin);
+		seq_putc(s, '\t');
+		dump_hoid(s, backoff->end);
+		seq_putc(s, '\n');
+	}
+
+	mutex_unlock(&osd->lock);
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -268,6 +335,13 @@ static int osdc_show(struct seq_file *s, void *pp)
 	}
 	dump_linger_requests(s, &osdc->homeless_osd);
 
+	seq_puts(s, "BACKOFFS\n");
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		dump_backoffs(s, osd);
+	}
+
 	up_read(&osdc->lock);
 	return 0;
 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 620aa43a6a0a..86a9737d8e3f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
 			struct ceph_osd_linger_request *lreq);
 static void unlink_linger(struct ceph_osd *osd,
 			  struct ceph_osd_linger_request *lreq);
+static void clear_backoffs(struct ceph_osd *osd);
 
 #if 1
 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -1019,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
 	RB_CLEAR_NODE(&osd->o_node);
 	osd->o_requests = RB_ROOT;
 	osd->o_linger_requests = RB_ROOT;
+	osd->o_backoff_mappings = RB_ROOT;
+	osd->o_backoffs_by_id = RB_ROOT;
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	osd->o_incarnation = 1;
@@ -1030,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
 	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
 	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
 
@@ -1150,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
 		unlink_linger(osd, lreq);
 		link_linger(&osdc->homeless_osd, lreq);
 	}
+	clear_backoffs(osd);
 
 	__remove_osd_from_lru(osd);
 	erase_osd(&osdc->osds, osd);
@@ -1431,6 +1437,328 @@ out:
 	return ct_res;
 }
 
+static struct ceph_spg_mapping *alloc_spg_mapping(void)
+{
+	struct ceph_spg_mapping *spg;
+
+	spg = kmalloc(sizeof(*spg), GFP_NOIO);
+	if (!spg)
+		return NULL;
+
+	RB_CLEAR_NODE(&spg->node);
+	spg->backoffs = RB_ROOT;
+	return spg;
+}
+
+static void free_spg_mapping(struct ceph_spg_mapping *spg)
+{
+	WARN_ON(!RB_EMPTY_NODE(&spg->node));
+	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
+
+	kfree(spg);
+}
+
+/*
+ * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
+ * ceph_pg_mapping.  Used to track OSD backoffs -- a backoff [range] is
+ * defined only within a specific spgid; it does not pass anything to
+ * children on split, or to another primary.
+ */
+DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
+		 RB_BYPTR, const struct ceph_spg *, node)
+
+static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
+{
+	return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits;
+}
+
+static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
+				   void **pkey, size_t *pkey_len)
+{
+	if (hoid->key_len) {
+		*pkey = hoid->key;
+		*pkey_len = hoid->key_len;
+	} else {
+		*pkey = hoid->oid;
+		*pkey_len = hoid->oid_len;
+	}
+}
+
+static int compare_names(const void *name1, size_t name1_len,
+			 const void *name2, size_t name2_len)
+{
+	int ret;
+
+	ret = memcmp(name1, name2, min(name1_len, name2_len));
+	if (!ret) {
+		if (name1_len < name2_len)
+			ret = -1;
+		else if (name1_len > name2_len)
+			ret = 1;
+	}
+	return ret;
+}
+
+static int hoid_compare(const struct ceph_hobject_id *lhs,
+			const struct ceph_hobject_id *rhs)
+{
+	void *effective_key1, *effective_key2;
+	size_t effective_key1_len, effective_key2_len;
+	int ret;
+
+	if (lhs->is_max < rhs->is_max)
+		return -1;
+	if (lhs->is_max > rhs->is_max)
+		return 1;
+
+	if (lhs->pool < rhs->pool)
+		return -1;
+	if (lhs->pool > rhs->pool)
+		return 1;
+
+	if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
+		return -1;
+	if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
+		return 1;
+
+	ret = compare_names(lhs->nspace, lhs->nspace_len,
+			    rhs->nspace, rhs->nspace_len);
+	if (ret)
+		return ret;
+
+	hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
+	hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
+	ret = compare_names(effective_key1, effective_key1_len,
+			    effective_key2, effective_key2_len);
+	if (ret)
+		return ret;
+
+	ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
+	if (ret)
+		return ret;
+
+	if (lhs->snapid < rhs->snapid)
+		return -1;
+	if (lhs->snapid > rhs->snapid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
+ * compat stuff here.
+ *
+ * Assumes @hoid is zero-initialized.
+ */
+static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
+{
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	if (struct_v < 4) {
+		pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
+		goto e_inval;
+	}
+
+	hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
+						GFP_NOIO);
+	if (IS_ERR(hoid->key)) {
+		ret = PTR_ERR(hoid->key);
+		hoid->key = NULL;
+		return ret;
+	}
+
+	hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
+						GFP_NOIO);
+	if (IS_ERR(hoid->oid)) {
+		ret = PTR_ERR(hoid->oid);
+		hoid->oid = NULL;
+		return ret;
+	}
+
+	ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
+	ceph_decode_32_safe(p, end, hoid->hash, e_inval);
+	ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
+
+	hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
+						   GFP_NOIO);
+	if (IS_ERR(hoid->nspace)) {
+		ret = PTR_ERR(hoid->nspace);
+		hoid->nspace = NULL;
+		return ret;
+	}
+
+	ceph_decode_64_safe(p, end, hoid->pool, e_inval);
+
+	ceph_hoid_build_hash_cache(hoid);
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
+{
+	return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */
+	       4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len;
+}
+
+static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
+{
+	ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid));
+	ceph_encode_string(p, end, hoid->key, hoid->key_len);
+	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);
+	ceph_encode_64(p, hoid->snapid);
+	ceph_encode_32(p, hoid->hash);
+	ceph_encode_8(p, hoid->is_max);
+	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
+	ceph_encode_64(p, hoid->pool);
+}
+
+static void free_hoid(struct ceph_hobject_id *hoid)
+{
+	if (hoid) {
+		kfree(hoid->key);
+		kfree(hoid->oid);
+		kfree(hoid->nspace);
+		kfree(hoid);
+	}
+}
+
+static struct ceph_osd_backoff *alloc_backoff(void)
+{
+	struct ceph_osd_backoff *backoff;
+
+	backoff = kzalloc(sizeof(*backoff), GFP_NOIO);
+	if (!backoff)
+		return NULL;
+
+	RB_CLEAR_NODE(&backoff->spg_node);
+	RB_CLEAR_NODE(&backoff->id_node);
+	return backoff;
+}
+
+static void free_backoff(struct ceph_osd_backoff *backoff)
+{
+	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
+	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));
+
+	free_hoid(backoff->begin);
+	free_hoid(backoff->end);
+	kfree(backoff);
+}
+
+/*
+ * Within a specific spgid, backoffs are managed by ->begin hoid.
+ */
+DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
+			RB_BYVAL, spg_node);
+
+static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
+					    const struct ceph_hobject_id *hoid)
+{
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		struct ceph_osd_backoff *cur =
+		    rb_entry(n, struct ceph_osd_backoff, spg_node);
+		int cmp;
+
+		cmp = hoid_compare(hoid, cur->begin);
+		if (cmp < 0) {
+			n = n->rb_left;
+		} else if (cmp > 0) {
+			if (hoid_compare(hoid, cur->end) < 0)
+				return cur;
+
+			n = n->rb_right;
+		} else {
+			return cur;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Each backoff has a unique id within its OSD session.
+ */
+DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
+
+static void clear_backoffs(struct ceph_osd *osd)
+{
+	while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) {
+		struct ceph_spg_mapping *spg =
+		    rb_entry(rb_first(&osd->o_backoff_mappings),
+			     struct ceph_spg_mapping, node);
+
+		while (!RB_EMPTY_ROOT(&spg->backoffs)) {
+			struct ceph_osd_backoff *backoff =
+			    rb_entry(rb_first(&spg->backoffs),
+				     struct ceph_osd_backoff, spg_node);
+
+			erase_backoff(&spg->backoffs, backoff);
+			erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+			free_backoff(backoff);
+		}
+		erase_spg_mapping(&osd->o_backoff_mappings, spg);
+		free_spg_mapping(spg);
+	}
+}
+
+/*
+ * Set up a temporary, non-owning view into @t.
+ */
+static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
+				  const struct ceph_osd_request_target *t)
+{
+	hoid->key = NULL;
+	hoid->key_len = 0;
+	hoid->oid = t->target_oid.name;
+	hoid->oid_len = t->target_oid.name_len;
+	hoid->snapid = CEPH_NOSNAP;
+	hoid->hash = t->pgid.seed;
+	hoid->is_max = false;
+	if (t->target_oloc.pool_ns) {
+		hoid->nspace = t->target_oloc.pool_ns->str;
+		hoid->nspace_len = t->target_oloc.pool_ns->len;
+	} else {
+		hoid->nspace = NULL;
+		hoid->nspace_len = 0;
+	}
+	hoid->pool = t->target_oloc.pool;
+	ceph_hoid_build_hash_cache(hoid);
+}
+
+static bool should_plug_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd *osd = req->r_osd;
+	struct ceph_spg_mapping *spg;
+	struct ceph_osd_backoff *backoff;
+	struct ceph_hobject_id hoid;
+
+	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
+	if (!spg)
+		return false;
+
+	hoid_fill_from_target(&hoid, &req->r_t);
+	backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
+	if (!backoff)
+		return false;
+
+	dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
+	     __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
+	     backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
+	return true;
+}
+
 static void setup_request_data(struct ceph_osd_request *req,
 			       struct ceph_msg *msg)
 {
@@ -1707,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req)
 	verify_osd_locked(osd);
 	WARN_ON(osd->o_osd != req->r_t.osd);
 
+	/* backoff? */
+	if (should_plug_request(req))
+		return;
+
 	/*
 	 * We may have a previously queued request message hanging
 	 * around.  Cancel it to avoid corrupting the msgr.
@@ -3527,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd)
 {
 	struct rb_node *n;
 
+	clear_backoffs(osd);
+
 	for (n = rb_first(&osd->o_requests); n; ) {
 		struct ceph_osd_request *req =
 		    rb_entry(n, struct ceph_osd_request, r_node);
@@ -3572,6 +3906,261 @@ out_unlock:
 	up_write(&osdc->lock);
 }
 
+struct MOSDBackoff {
+	struct ceph_spg spgid;
+	u32 map_epoch;
+	u8 op;
+	u64 id;
+	struct ceph_hobject_id *begin;
+	struct ceph_hobject_id *end;
+};
+
+static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
+{
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
+	if (ret)
+		return ret;
+
+	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
+	if (ret)
+		return ret;
+
+	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
+	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
+	ceph_decode_8_safe(&p, end, m->op, e_inval);
+	ceph_decode_64_safe(&p, end, m->id, e_inval);
+
+	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
+	if (!m->begin)
+		return -ENOMEM;
+
+	ret = decode_hoid(&p, end, m->begin);
+	if (ret) {
+		free_hoid(m->begin);
+		return ret;
+	}
+
+	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
+	if (!m->end) {
+		free_hoid(m->begin);
+		return -ENOMEM;
+	}
+
+	ret = decode_hoid(&p, end, m->end);
+	if (ret) {
+		free_hoid(m->begin);
+		free_hoid(m->end);
+		return ret;
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static struct ceph_msg *create_backoff_message(
+				const struct ceph_osd_backoff *backoff,
+				u32 map_epoch)
+{
+	struct ceph_msg *msg;
+	void *p, *end;
+	int msg_size;
+
+	msg_size = CEPH_ENCODING_START_BLK_LEN +
+			CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+	msg_size += 4 + 1 + 8; /* map_epoch, op, id */
+	msg_size += CEPH_ENCODING_START_BLK_LEN +
+			hoid_encoding_size(backoff->begin);
+	msg_size += CEPH_ENCODING_START_BLK_LEN +
+			hoid_encoding_size(backoff->end);
+
+	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
+	if (!msg)
+		return NULL;
+
+	p = msg->front.iov_base;
+	end = p + msg->front_alloc_len;
+
+	encode_spgid(&p, &backoff->spgid);
+	ceph_encode_32(&p, map_epoch);
+	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
+	ceph_encode_64(&p, backoff->id);
+	encode_hoid(&p, end, backoff->begin);
+	encode_hoid(&p, end, backoff->end);
+	BUG_ON(p != end);
+
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+	return msg;
+}
+
+static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
+{
+	struct ceph_spg_mapping *spg;
+	struct ceph_osd_backoff *backoff;
+	struct ceph_msg *msg;
+
+	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
+	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
+
+	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
+	if (!spg) {
+		spg = alloc_spg_mapping();
+		if (!spg) {
+			pr_err("%s failed to allocate spg\n", __func__);
+			return;
+		}
+		spg->spgid = m->spgid; /* struct */
+		insert_spg_mapping(&osd->o_backoff_mappings, spg);
+	}
+
+	backoff = alloc_backoff();
+	if (!backoff) {
+		pr_err("%s failed to allocate backoff\n", __func__);
+		return;
+	}
+	backoff->spgid = m->spgid; /* struct */
+	backoff->id = m->id;
+	backoff->begin = m->begin;
+	m->begin = NULL; /* backoff now owns this */
+	backoff->end = m->end;
+	m->end = NULL;   /* ditto */
+
+	insert_backoff(&spg->backoffs, backoff);
+	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+
+	/*
+	 * Ack with original backoff's epoch so that the OSD can
+	 * discard this if there was a PG split.
+	 */
+	msg = create_backoff_message(backoff, m->map_epoch);
+	if (!msg) {
+		pr_err("%s failed to allocate msg\n", __func__);
+		return;
+	}
+	ceph_con_send(&osd->o_con, msg);
+}
+
+static bool target_contained_by(const struct ceph_osd_request_target *t,
+				const struct ceph_hobject_id *begin,
+				const struct ceph_hobject_id *end)
+{
+	struct ceph_hobject_id hoid;
+	int cmp;
+
+	hoid_fill_from_target(&hoid, t);
+	cmp = hoid_compare(&hoid, begin);
+	return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0);
+}
+
+static void handle_backoff_unblock(struct ceph_osd *osd,
+				   const struct MOSDBackoff *m)
+{
+	struct ceph_spg_mapping *spg;
+	struct ceph_osd_backoff *backoff;
+	struct rb_node *n;
+
+	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
+	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);
+
+	backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
+	if (!backoff) {
+		pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
+		       __func__, osd->o_osd, m->spgid.pgid.pool,
+		       m->spgid.pgid.seed, m->spgid.shard, m->id);
+		return;
+	}
+
+	if (hoid_compare(backoff->begin, m->begin) &&
+	    hoid_compare(backoff->end, m->end)) {
+		pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
+		       __func__, osd->o_osd, m->spgid.pgid.pool,
+		       m->spgid.pgid.seed, m->spgid.shard, m->id);
+		/* unblock it anyway... */
+	}
+
+	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
+	BUG_ON(!spg);
+
+	erase_backoff(&spg->backoffs, backoff);
+	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
+	free_backoff(backoff);
+
+	if (RB_EMPTY_ROOT(&spg->backoffs)) {
+		erase_spg_mapping(&osd->o_backoff_mappings, spg);
+		free_spg_mapping(spg);
+	}
+
+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) {
+			/*
+			 * Match against @m, not @backoff -- the PG may
+			 * have split on the OSD.
+			 */
+			if (target_contained_by(&req->r_t, m->begin, m->end)) {
+				/*
+				 * If no other installed backoff applies,
+				 * resend.
+				 */
+				send_request(req);
+			}
+		}
+	}
+}
+
+static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct MOSDBackoff m;
+	int ret;
+
+	down_read(&osdc->lock);
+	if (!osd_registered(osd)) {
+		dout("%s osd%d unknown\n", __func__, osd->o_osd);
+		up_read(&osdc->lock);
+		return;
+	}
+	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
+
+	mutex_lock(&osd->lock);
+	ret = decode_MOSDBackoff(msg, &m);
+	if (ret) {
+		pr_err("failed to decode MOSDBackoff: %d\n", ret);
+		ceph_msg_dump(msg);
+		goto out_unlock;
+	}
+
+	switch (m.op) {
+	case CEPH_OSD_BACKOFF_OP_BLOCK:
+		handle_backoff_block(osd, &m);
+		break;
+	case CEPH_OSD_BACKOFF_OP_UNBLOCK:
+		handle_backoff_unblock(osd, &m);
+		break;
+	default:
+		pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);
+	}
+
+	free_hoid(m.begin);
+	free_hoid(m.end);
+
+out_unlock:
+	mutex_unlock(&osd->lock);
+	up_read(&osdc->lock);
+}
+
 /*
  * Process osd watch notifications
  */
@@ -4509,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	case CEPH_MSG_OSD_OPREPLY:
 		handle_reply(osd, msg);
 		break;
+	case CEPH_MSG_OSD_BACKOFF:
+		handle_backoff(osd, msg);
+		break;
 	case CEPH_MSG_WATCH_NOTIFY:
 		handle_watch_notify(osdc, msg);
 		break;
@@ -4631,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
+	case CEPH_MSG_OSD_BACKOFF:
 	case CEPH_MSG_WATCH_NOTIFY:
 		return alloc_msg_with_page_vector(hdr);
 	case CEPH_MSG_OSD_OPREPLY:
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 1e2e190a4c2a..1d87a736221b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -418,6 +418,22 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
 	return 0;
 }
 
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
+{
+	int ret;
+
+	ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
+	if (ret)
+		return ret;
+
+	if (lhs->shard < rhs->shard)
+		return -1;
+	if (lhs->shard > rhs->shard)
+		return 1;
+
+	return 0;
+}
+
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds) and primary_temp (explicit primary setting)
-- 
cgit v1.2.3-59-g8ed1b


From 33333d107112be4d822234deb8820ec6eeb0831d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:17 +0200
Subject: libceph: don't pass pgid by value

Make __{lookup,remove}_pg_mapping() look like their ceph_spg_mapping
counterparts: take const struct ceph_pg *.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 1d87a736221b..06baf6bd40e4 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -465,7 +465,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 }
 
 static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   struct ceph_pg pgid)
+						   const struct ceph_pg *pgid)
 {
 	struct rb_node *n = root->rb_node;
 	struct ceph_pg_mapping *pg;
@@ -473,32 +473,32 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 
 	while (n) {
 		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = ceph_pg_compare(&pgid, &pg->pgid);
+		c = ceph_pg_compare(pgid, &pg->pgid);
 		if (c < 0) {
 			n = n->rb_left;
 		} else if (c > 0) {
 			n = n->rb_right;
 		} else {
 			dout("__lookup_pg_mapping %lld.%x got %p\n",
-			     pgid.pool, pgid.seed, pg);
+			     pgid->pool, pgid->seed, pg);
 			return pg;
 		}
 	}
 	return NULL;
 }
 
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+static int __remove_pg_mapping(struct rb_root *root, const struct ceph_pg *pgid)
 {
 	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
 
 	if (pg) {
-		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
+		dout("__remove_pg_mapping %lld.%x %p\n", pgid->pool, pgid->seed,
 		     pg);
 		rb_erase(&pg->node, root);
 		kfree(pg);
 		return 0;
 	}
-	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
+	dout("__remove_pg_mapping %lld.%x dne\n", pgid->pool, pgid->seed);
 	return -ENOENT;
 }
 
@@ -1034,7 +1034,7 @@ static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
 
 		ceph_decode_32_safe(p, end, len, e_inval);
 
-		ret = __remove_pg_mapping(&map->pg_temp, pgid);
+		ret = __remove_pg_mapping(&map->pg_temp, &pgid);
 		BUG_ON(!incremental && ret != -ENOENT);
 
 		if (!incremental || len > 0) {
@@ -1095,7 +1095,7 @@ static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
 
 		ceph_decode_32_safe(p, end, osd, e_inval);
 
-		ret = __remove_pg_mapping(&map->primary_temp, pgid);
+		ret = __remove_pg_mapping(&map->primary_temp, &pgid);
 		BUG_ON(!incremental && ret != -ENOENT);
 
 		if (!incremental || osd != (u32)-1) {
@@ -2226,7 +2226,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	ceph_osds_init(temp);
 
 	/* pg_temp? */
-	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	pg = __lookup_pg_mapping(&osdmap->pg_temp, &pgid);
 	if (pg) {
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
@@ -2249,7 +2249,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	}
 
 	/* primary_temp? */
-	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+	pg = __lookup_pg_mapping(&osdmap->primary_temp, &pgid);
 	if (pg)
 		temp->primary = pg->primary_temp.osd;
 }
-- 
cgit v1.2.3-59-g8ed1b


From a303bb0e58345fe9f7ab2f82b90266f2b5036058 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:17 +0200
Subject: libceph: introduce and switch to decode_pg_mapping()

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 150 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 83 insertions(+), 67 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 06baf6bd40e4..41b380ac68ac 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -434,6 +434,25 @@ int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
 	return 0;
 }
 
+static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
+{
+	struct ceph_pg_mapping *pg;
+
+	pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
+	if (!pg)
+		return NULL;
+
+	RB_CLEAR_NODE(&pg->node);
+	return pg;
+}
+
+static void free_pg_mapping(struct ceph_pg_mapping *pg)
+{
+	WARN_ON(!RB_EMPTY_NODE(&pg->node));
+
+	kfree(pg);
+}
+
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds) and primary_temp (explicit primary setting)
@@ -1017,47 +1036,36 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
 	return __decode_pools(p, end, map, true);
 }
 
-static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
-			    bool incremental)
+typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
+
+static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
+			     decode_mapping_fn_t fn, bool incremental)
 {
 	u32 n;
 
+	WARN_ON(!incremental && !fn);
+
 	ceph_decode_32_safe(p, end, n, e_inval);
 	while (n--) {
+		struct ceph_pg_mapping *pg;
 		struct ceph_pg pgid;
-		u32 len, i;
 		int ret;
 
 		ret = ceph_decode_pgid(p, end, &pgid);
 		if (ret)
 			return ret;
 
-		ceph_decode_32_safe(p, end, len, e_inval);
-
-		ret = __remove_pg_mapping(&map->pg_temp, &pgid);
-		BUG_ON(!incremental && ret != -ENOENT);
-
-		if (!incremental || len > 0) {
-			struct ceph_pg_mapping *pg;
+		ret = __remove_pg_mapping(mapping_root, &pgid);
+		WARN_ON(!incremental && ret != -ENOENT);
 
-			ceph_decode_need(p, end, len*sizeof(u32), e_inval);
+		if (fn) {
+			pg = fn(p, end, incremental);
+			if (IS_ERR(pg))
+				return PTR_ERR(pg);
 
-			if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
-				return -EINVAL;
-
-			pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
-			if (!pg)
-				return -ENOMEM;
-
-			pg->pgid = pgid;
-			pg->pg_temp.len = len;
-			for (i = 0; i < len; i++)
-				pg->pg_temp.osds[i] = ceph_decode_32(p);
-
-			ret = __insert_pg_mapping(pg, &map->pg_temp);
-			if (ret) {
-				kfree(pg);
-				return ret;
+			if (pg) {
+				pg->pgid = pgid; /* struct */
+				__insert_pg_mapping(pg, mapping_root);
 			}
 		}
 	}
@@ -1068,69 +1076,77 @@ e_inval:
 	return -EINVAL;
 }
 
+static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
+						bool incremental)
+{
+	struct ceph_pg_mapping *pg;
+	u32 len, i;
+
+	ceph_decode_32_safe(p, end, len, e_inval);
+	if (len == 0 && incremental)
+		return NULL;	/* new_pg_temp: [] to remove */
+	if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
+		return ERR_PTR(-EINVAL);
+
+	ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+	pg = alloc_pg_mapping(len * sizeof(u32));
+	if (!pg)
+		return ERR_PTR(-ENOMEM);
+
+	pg->pg_temp.len = len;
+	for (i = 0; i < len; i++)
+		pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+	return pg;
+
+e_inval:
+	return ERR_PTR(-EINVAL);
+}
+
 static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
 {
-	return __decode_pg_temp(p, end, map, false);
+	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+				 false);
 }
 
 static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
 {
-	return __decode_pg_temp(p, end, map, true);
+	return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+				 true);
 }
 
-static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
-				 bool incremental)
+static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
+						     bool incremental)
 {
-	u32 n;
-
-	ceph_decode_32_safe(p, end, n, e_inval);
-	while (n--) {
-		struct ceph_pg pgid;
-		u32 osd;
-		int ret;
-
-		ret = ceph_decode_pgid(p, end, &pgid);
-		if (ret)
-			return ret;
-
-		ceph_decode_32_safe(p, end, osd, e_inval);
-
-		ret = __remove_pg_mapping(&map->primary_temp, &pgid);
-		BUG_ON(!incremental && ret != -ENOENT);
-
-		if (!incremental || osd != (u32)-1) {
-			struct ceph_pg_mapping *pg;
-
-			pg = kzalloc(sizeof(*pg), GFP_NOFS);
-			if (!pg)
-				return -ENOMEM;
+	struct ceph_pg_mapping *pg;
+	u32 osd;
 
-			pg->pgid = pgid;
-			pg->primary_temp.osd = osd;
+	ceph_decode_32_safe(p, end, osd, e_inval);
+	if (osd == (u32)-1 && incremental)
+		return NULL;	/* new_primary_temp: -1 to remove */
 
-			ret = __insert_pg_mapping(pg, &map->primary_temp);
-			if (ret) {
-				kfree(pg);
-				return ret;
-			}
-		}
-	}
+	pg = alloc_pg_mapping(0);
+	if (!pg)
+		return ERR_PTR(-ENOMEM);
 
-	return 0;
+	pg->primary_temp.osd = osd;
+	return pg;
 
 e_inval:
-	return -EINVAL;
+	return ERR_PTR(-EINVAL);
 }
 
 static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
 {
-	return __decode_primary_temp(p, end, map, false);
+	return decode_pg_mapping(p, end, &map->primary_temp,
+				 __decode_primary_temp, false);
 }
 
 static int decode_new_primary_temp(void **p, void *end,
 				   struct ceph_osdmap *map)
 {
-	return __decode_primary_temp(p, end, map, true);
+	return decode_pg_mapping(p, end, &map->primary_temp,
+				 __decode_primary_temp, true);
 }
 
 u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
-- 
cgit v1.2.3-59-g8ed1b


From ab75144be08cfc1d80f49e9c37970fcadb1215a2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:17 +0200
Subject: libceph: kill __{insert,lookup,remove}_pg_mapping()

Switch to DEFINE_RB_FUNCS2-generated {insert,lookup,erase}_pg_mapping().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 87 ++++++++++---------------------------------------------
 1 file changed, 15 insertions(+), 72 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 41b380ac68ac..423747714017 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -457,69 +457,8 @@ static void free_pg_mapping(struct ceph_pg_mapping *pg)
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds) and primary_temp (explicit primary setting)
  */
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-			       struct rb_root *root)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_mapping *pg = NULL;
-	int c;
-
-	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
-	while (*p) {
-		parent = *p;
-		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		c = ceph_pg_compare(&new->pgid, &pg->pgid);
-		if (c < 0)
-			p = &(*p)->rb_left;
-		else if (c > 0)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-						   const struct ceph_pg *pgid)
-{
-	struct rb_node *n = root->rb_node;
-	struct ceph_pg_mapping *pg;
-	int c;
-
-	while (n) {
-		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = ceph_pg_compare(pgid, &pg->pgid);
-		if (c < 0) {
-			n = n->rb_left;
-		} else if (c > 0) {
-			n = n->rb_right;
-		} else {
-			dout("__lookup_pg_mapping %lld.%x got %p\n",
-			     pgid->pool, pgid->seed, pg);
-			return pg;
-		}
-	}
-	return NULL;
-}
-
-static int __remove_pg_mapping(struct rb_root *root, const struct ceph_pg *pgid)
-{
-	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
-
-	if (pg) {
-		dout("__remove_pg_mapping %lld.%x %p\n", pgid->pool, pgid->seed,
-		     pg);
-		rb_erase(&pg->node, root);
-		kfree(pg);
-		return 0;
-	}
-	dout("__remove_pg_mapping %lld.%x dne\n", pgid->pool, pgid->seed);
-	return -ENOENT;
-}
+DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
+		 RB_BYPTR, const struct ceph_pg *, node)
 
 /*
  * rbtree of pg pool info
@@ -829,15 +768,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 		struct ceph_pg_mapping *pg =
 			rb_entry(rb_first(&map->pg_temp),
 				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->pg_temp);
-		kfree(pg);
+		erase_pg_mapping(&map->pg_temp, pg);
+		free_pg_mapping(pg);
 	}
 	while (!RB_EMPTY_ROOT(&map->primary_temp)) {
 		struct ceph_pg_mapping *pg =
 			rb_entry(rb_first(&map->primary_temp),
 				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->primary_temp);
-		kfree(pg);
+		erase_pg_mapping(&map->primary_temp, pg);
+		free_pg_mapping(pg);
 	}
 	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
 		struct ceph_pg_pool_info *pi =
@@ -1055,8 +994,12 @@ static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
 		if (ret)
 			return ret;
 
-		ret = __remove_pg_mapping(mapping_root, &pgid);
-		WARN_ON(!incremental && ret != -ENOENT);
+		pg = lookup_pg_mapping(mapping_root, &pgid);
+		if (pg) {
+			WARN_ON(!incremental);
+			erase_pg_mapping(mapping_root, pg);
+			free_pg_mapping(pg);
+		}
 
 		if (fn) {
 			pg = fn(p, end, incremental);
@@ -1065,7 +1008,7 @@ static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
 
 			if (pg) {
 				pg->pgid = pgid; /* struct */
-				__insert_pg_mapping(pg, mapping_root);
+				insert_pg_mapping(mapping_root, pg);
 			}
 		}
 	}
@@ -2242,7 +2185,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	ceph_osds_init(temp);
 
 	/* pg_temp? */
-	pg = __lookup_pg_mapping(&osdmap->pg_temp, &pgid);
+	pg = lookup_pg_mapping(&osdmap->pg_temp, &pgid);
 	if (pg) {
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
@@ -2265,7 +2208,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	}
 
 	/* primary_temp? */
-	pg = __lookup_pg_mapping(&osdmap->primary_temp, &pgid);
+	pg = lookup_pg_mapping(&osdmap->primary_temp, &pgid);
 	if (pg)
 		temp->primary = pg->primary_temp.osd;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 278b1d709c6acc6f7d138fed775c76695b068e43 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:17 +0200
Subject: libceph: ceph_decode_skip_* helpers

Some of these won't be as efficient as they could be (e.g.
ceph_decode_skip_set(... 32 ...) could advance by len * sizeof(u32)
once instead of advancing by sizeof(u32) len times), but that's fine
and not worth a bunch of extra macro code.

Replace skip_name_map() with ceph_decode_skip_map as an example.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/decode.h | 60 +++++++++++++++++++++++++++++++++++++++++++++
 net/ceph/osdmap.c           | 25 +++----------------
 2 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index f990f2cc907a..14af9b70d301 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -132,6 +132,66 @@ bad:
 	return ERR_PTR(-ERANGE);
 }
 
+/*
+ * skip helpers
+ */
+#define ceph_decode_skip_n(p, end, n, bad)			\
+	do {							\
+		ceph_decode_need(p, end, n, bad);		\
+                *p += n;					\
+	} while (0)
+
+#define ceph_decode_skip_64(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u64), bad)
+
+#define ceph_decode_skip_32(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u32), bad)
+
+#define ceph_decode_skip_16(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u16), bad)
+
+#define ceph_decode_skip_8(p, end, bad)				\
+ceph_decode_skip_n(p, end, sizeof(u8), bad)
+
+#define ceph_decode_skip_string(p, end, bad)			\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		ceph_decode_skip_n(p, end, len, bad);		\
+	} while (0)
+
+#define ceph_decode_skip_set(p, end, type, bad)			\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--)					\
+			ceph_decode_skip_##type(p, end, bad);	\
+	} while (0)
+
+#define ceph_decode_skip_map(p, end, ktype, vtype, bad)		\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--) {					\
+			ceph_decode_skip_##ktype(p, end, bad);	\
+			ceph_decode_skip_##vtype(p, end, bad);	\
+		}						\
+	} while (0)
+
+#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--) {					\
+			ceph_decode_skip_##ktype1(p, end, bad);	\
+			ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
+		}						\
+	} while (0)
+
 /*
  * struct ceph_timespec <-> struct timespec
  */
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 423747714017..f6d561edd511 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -138,21 +138,6 @@ bad:
 	return -EINVAL;
 }
 
-static int skip_name_map(void **p, void *end)
-{
-        int len;
-        ceph_decode_32_safe(p, end, len ,bad);
-        while (len--) {
-                int strlen;
-                *p += sizeof(u32);
-                ceph_decode_32_safe(p, end, strlen, bad);
-                *p += strlen;
-}
-        return 0;
-bad:
-        return -EINVAL;
-}
-
 static void crush_finalize(struct crush_map *c)
 {
 	__s32 b;
@@ -187,7 +172,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	void **p = &pbyval;
 	void *start = pbyval;
 	u32 magic;
-	u32 num_name_maps;
 
 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -353,12 +337,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		}
 	}
 
-	/* ignore trailing name maps. */
-        for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
-                err = skip_name_map(p, end);
-                if (err < 0)
-                        goto done;
-        }
+	ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
+	ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
 
         /* tunables */
         ceph_decode_need(p, end, 3*sizeof(u32), done);
-- 
cgit v1.2.3-59-g8ed1b


From 6f428df47dae2c8ea31fd4c0c74a12a8a5ac2d1d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:18 +0200
Subject: libceph: pg_upmap[_items] infrastructure

pg_temp and pg_upmap encodings are the same (PG -> array of osds),
except for the incremental remove: it's an empty mapping in new_pg_temp
for pg_temp and a separate old_pg_upmap set for pg_upmap.  (This isn't
to allow for empty pg_upmap mappings -- apparently, pg_temp just wasn't
looked at as an example for pg_upmap encoding.)

Reuse __decode_pg_temp() for decoding pg_upmap and new_pg_upmap.
__decode_pg_temp() stores into pg_temp union member, but since pg_upmap
union member is identical, reading through pg_upmap later is OK.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  10 +++-
 net/ceph/debugfs.c          |  23 ++++++++
 net/ceph/osdmap.c           | 135 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 164 insertions(+), 4 deletions(-)

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index fe6d189bdd30..c612cff81f5c 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -143,10 +143,14 @@ struct ceph_pg_mapping {
 		struct {
 			int len;
 			int osds[];
-		} pg_temp;
+		} pg_temp, pg_upmap;
 		struct {
 			int osd;
 		} primary_temp;
+		struct {
+			int len;
+			int from_to[][2];
+		} pg_upmap_items;
 	};
 };
 
@@ -165,6 +169,10 @@ struct ceph_osdmap {
 	struct rb_root pg_temp;
 	struct rb_root primary_temp;
 
+	/* remap (post-CRUSH, pre-up) */
+	struct rb_root pg_upmap;	/* PG := raw set */
+	struct rb_root pg_upmap_items;	/* from -> to within raw set */
+
 	u32 *osd_primary_affinity;
 
 	struct rb_root pg_pools;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 017f15c575f8..4f57d5bcaba2 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p)
 		seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
 			   pg->pgid.seed, pg->primary_temp.osd);
 	}
+	for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(n, struct ceph_pg_mapping, node);
+
+		seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
+			   pg->pgid.seed);
+		for (i = 0; i < pg->pg_upmap.len; i++)
+			seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+				   pg->pg_upmap.osds[i]);
+		seq_printf(s, "]\n");
+	}
+	for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(n, struct ceph_pg_mapping, node);
+
+		seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
+			   pg->pgid.seed);
+		for (i = 0; i < pg->pg_upmap_items.len; i++)
+			seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
+				   pg->pg_upmap_items.from_to[i][0],
+				   pg->pg_upmap_items.from_to[i][1]);
+		seq_printf(s, "]\n");
+	}
 
 	up_read(&osdc->lock);
 	return 0;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f6d561edd511..a3f60d0bfd13 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -735,6 +735,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
 	map->pool_max = -1;
 	map->pg_temp = RB_ROOT;
 	map->primary_temp = RB_ROOT;
+	map->pg_upmap = RB_ROOT;
+	map->pg_upmap_items = RB_ROOT;
 	mutex_init(&map->crush_workspace_mutex);
 
 	return map;
@@ -759,6 +761,20 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 		erase_pg_mapping(&map->primary_temp, pg);
 		free_pg_mapping(pg);
 	}
+	while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_upmap),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_upmap);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_upmap_items),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_upmap_items);
+		kfree(pg);
+	}
 	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
 		struct ceph_pg_pool_info *pi =
 			rb_entry(rb_first(&map->pg_pools),
@@ -1161,6 +1177,75 @@ e_inval:
 	return -EINVAL;
 }
 
+static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
+						 bool __unused)
+{
+	return __decode_pg_temp(p, end, false);
+}
+
+static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+				 false);
+}
+
+static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+				 true);
+}
+
+static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
+}
+
+static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
+						       bool __unused)
+{
+	struct ceph_pg_mapping *pg;
+	u32 len, i;
+
+	ceph_decode_32_safe(p, end, len, e_inval);
+	if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
+		return ERR_PTR(-EINVAL);
+
+	ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
+	pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO);
+	if (!pg)
+		return ERR_PTR(-ENOMEM);
+
+	pg->pg_upmap_items.len = len;
+	for (i = 0; i < len; i++) {
+		pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
+		pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
+	}
+
+	return pg;
+
+e_inval:
+	return ERR_PTR(-EINVAL);
+}
+
+static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap_items,
+				 __decode_pg_upmap_items, false);
+}
+
+static int decode_new_pg_upmap_items(void **p, void *end,
+				     struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap_items,
+				 __decode_pg_upmap_items, true);
+}
+
+static int decode_old_pg_upmap_items(void **p, void *end,
+				     struct ceph_osdmap *map)
+{
+	return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
+}
+
 /*
  * decode a full map.
  */
@@ -1250,9 +1335,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 		if (err)
 			goto bad;
 	} else {
-		/* XXX can this happen? */
-		kfree(map->osd_primary_affinity);
-		map->osd_primary_affinity = NULL;
+		WARN_ON(map->osd_primary_affinity);
 	}
 
 	/* crush */
@@ -1261,6 +1344,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 	if (err)
 		goto bad;
 
+	*p += len;
+	if (struct_v >= 3) {
+		/* erasure_code_profiles */
+		ceph_decode_skip_map_of_map(p, end, string, string, string,
+					    bad);
+	}
+
+	if (struct_v >= 4) {
+		err = decode_pg_upmap(p, end, map);
+		if (err)
+			goto bad;
+
+		err = decode_pg_upmap_items(p, end, map);
+		if (err)
+			goto bad;
+	} else {
+		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
+		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
+	}
+
 	/* ignore the rest */
 	*p = end;
 
@@ -1520,6 +1623,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			goto bad;
 	}
 
+	if (struct_v >= 3) {
+		/* new_erasure_code_profiles */
+		ceph_decode_skip_map_of_map(p, end, string, string, string,
+					    bad);
+		/* old_erasure_code_profiles */
+		ceph_decode_skip_set(p, end, string, bad);
+	}
+
+	if (struct_v >= 4) {
+		err = decode_new_pg_upmap(p, end, map);
+		if (err)
+			goto bad;
+
+		err = decode_old_pg_upmap(p, end, map);
+		if (err)
+			goto bad;
+
+		err = decode_new_pg_upmap_items(p, end, map);
+		if (err)
+			goto bad;
+
+		err = decode_old_pg_upmap_items(p, end, map);
+		if (err)
+			goto bad;
+	}
+
 	/* ignore the rest */
 	*p = end;
 
-- 
cgit v1.2.3-59-g8ed1b


From 463bb8da5042c165bf50ae2688d251c5af26f3cf Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:18 +0200
Subject: libceph: compute actual pgid in ceph_pg_to_up_acting_osds()

Move raw_pg_to_pg() call out of get_temp_osds() and into
ceph_pg_to_up_acting_osds(), for upcoming apply_upmap().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index a3f60d0bfd13..245c8025ab44 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2284,18 +2284,16 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap,
  */
 static void get_temp_osds(struct ceph_osdmap *osdmap,
 			  struct ceph_pg_pool_info *pi,
-			  const struct ceph_pg *raw_pgid,
+			  const struct ceph_pg *pgid,
 			  struct ceph_osds *temp)
 {
-	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
 	int i;
 
-	raw_pg_to_pg(pi, raw_pgid, &pgid);
 	ceph_osds_init(temp);
 
 	/* pg_temp? */
-	pg = lookup_pg_mapping(&osdmap->pg_temp, &pgid);
+	pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
@@ -2318,7 +2316,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	}
 
 	/* primary_temp? */
-	pg = lookup_pg_mapping(&osdmap->primary_temp, &pgid);
+	pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
 		temp->primary = pg->primary_temp.osd;
 }
@@ -2336,14 +2334,16 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting)
 {
+	struct ceph_pg pgid;
 	u32 pps;
 
 	WARN_ON(pi->id != raw_pgid->pool);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
 
 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
 	raw_to_up_osds(osdmap, pi, up);
 	apply_primary_affinity(osdmap, pi, pps, up);
-	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	get_temp_osds(osdmap, pi, &pgid, acting);
 	if (!acting->size) {
 		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
 		acting->size = up->size;
-- 
cgit v1.2.3-59-g8ed1b


From 1c2e7b451b889bead46cef410a737d1767cd6f0b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 21 Jun 2017 17:27:18 +0200
Subject: libceph: apply_upmap()

Previously, pg_to_raw_osds() didn't filter for existent OSDs because
raw_to_up_osds() would filter for "up" ("up" is predicated on "exists")
and raw_to_up_osds() was called directly after pg_to_raw_osds().  Now,
with apply_upmap() call in there, nonexistent OSDs in pg_to_raw_osds()
output can affect apply_upmap().  Introduce remove_nonexistent_osds()
to deal with that.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osdmap.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 2 deletions(-)

diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 245c8025ab44..93baa69407c5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2117,9 +2117,37 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 	return r;
 }
 
+static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
+				    struct ceph_pg_pool_info *pi,
+				    struct ceph_osds *set)
+{
+	int i;
+
+	if (ceph_can_shift_osds(pi)) {
+		int removed = 0;
+
+		/* shift left */
+		for (i = 0; i < set->size; i++) {
+			if (!ceph_osd_exists(osdmap, set->osds[i])) {
+				removed++;
+				continue;
+			}
+			if (removed)
+				set->osds[i - removed] = set->osds[i];
+		}
+		set->size -= removed;
+	} else {
+		/* set dne devices to NONE */
+		for (i = 0; i < set->size; i++) {
+			if (!ceph_osd_exists(osdmap, set->osds[i]))
+				set->osds[i] = CRUSH_ITEM_NONE;
+		}
+	}
+}
+
 /*
- * Calculate raw set (CRUSH output) for given PG.  The result may
- * contain nonexistent OSDs.  ->primary is undefined for a raw set.
+ * Calculate raw set (CRUSH output) for given PG and filter out
+ * nonexistent OSDs.  ->primary is undefined for a raw set.
  *
  * Placement seed (CRUSH input) is returned through @ppps.
  */
@@ -2162,6 +2190,70 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
 	}
 
 	raw->size = len;
+	remove_nonexistent_osds(osdmap, pi, raw);
+}
+
+/* apply pg_upmap[_items] mappings */
+static void apply_upmap(struct ceph_osdmap *osdmap,
+			const struct ceph_pg *pgid,
+			struct ceph_osds *raw)
+{
+	struct ceph_pg_mapping *pg;
+	int i, j;
+
+	pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
+	if (pg) {
+		/* make sure targets aren't marked out */
+		for (i = 0; i < pg->pg_upmap.len; i++) {
+			int osd = pg->pg_upmap.osds[i];
+
+			if (osd != CRUSH_ITEM_NONE &&
+			    osd < osdmap->max_osd &&
+			    osdmap->osd_weight[osd] == 0) {
+				/* reject/ignore explicit mapping */
+				return;
+			}
+		}
+		for (i = 0; i < pg->pg_upmap.len; i++)
+			raw->osds[i] = pg->pg_upmap.osds[i];
+		raw->size = pg->pg_upmap.len;
+		return;
+	}
+
+	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
+	if (pg) {
+		/*
+		 * Note: this approach does not allow a bidirectional swap,
+		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+		 */
+		for (i = 0; i < pg->pg_upmap_items.len; i++) {
+			int from = pg->pg_upmap_items.from_to[i][0];
+			int to = pg->pg_upmap_items.from_to[i][1];
+			int pos = -1;
+			bool exists = false;
+
+			/* make sure replacement doesn't already appear */
+			for (j = 0; j < raw->size; j++) {
+				int osd = raw->osds[j];
+
+				if (osd == to) {
+					exists = true;
+					break;
+				}
+				/* ignore mapping if target is marked out */
+				if (osd == from && pos < 0 &&
+				    !(to != CRUSH_ITEM_NONE &&
+				      to < osdmap->max_osd &&
+				      osdmap->osd_weight[to] == 0)) {
+					pos = j;
+				}
+			}
+			if (!exists && pos >= 0) {
+				raw->osds[pos] = to;
+				return;
+			}
+		}
+	}
 }
 
 /*
@@ -2341,6 +2433,7 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
 	raw_pg_to_pg(pi, raw_pgid, &pgid);
 
 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	apply_upmap(osdmap, &pgid, up);
 	raw_to_up_osds(osdmap, pi, up);
 	apply_primary_affinity(osdmap, pi, pps, up);
 	get_temp_osds(osdmap, pi, &pgid, acting);
-- 
cgit v1.2.3-59-g8ed1b


From 069f3222ca96acfe8c59937e98c401bda5475b48 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 22 Jun 2017 19:44:05 +0200
Subject: crush: implement weight and id overrides for straw2

bucket_straw2_choose needs to use weights that may be different from
weight_items. For instance to compensate for an uneven distribution
caused by a low number of values. Or to fix the probability biais
introduced by conditional probabilities (see
http://tracker.ceph.com/issues/15653 for more information).

We introduce a weight_set for each straw2 bucket to set the desired
weight for a given item at a given position. The weight of a given item
when picking the first replica (first position) may be different from
the weight the second replica (second position). For instance the weight
matrix for a given bucket containing items 3, 7 and 13 could be as
follows:

          position 0   position 1

item 3     0x10000      0x100000
item 7     0x40000       0x10000
item 13    0x40000       0x10000

When crush_do_rule picks the first of two replicas (position 0), item 7,
3 are four times more likely to be choosen by bucket_straw2_choose than
item 13. When choosing the second replica (position 1), item 3 is ten
times more likely to be choosen than item 7, 13.

By default the weight_set of each bucket exactly matches the content of
item_weights for each position to ensure backward compatibility.

bucket_straw2_choose compares items by using their id. The same ids are
also used to index buckets and they must be unique. For each item in a
bucket an array of ids can be provided for placement purposes and they
are used instead of the ids. If no replacement ids are provided, the
legacy behavior is preserved.

Reflects ceph.git commit 19537a450fd5c5a0bb8b7830947507a76db2ceca.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h  | 58 ++++++++++++++++++++++++++++++++++
 include/linux/crush/mapper.h |  9 +++---
 net/ceph/crush/mapper.c      | 74 +++++++++++++++++++++++++++++++++-----------
 net/ceph/osdmap.c            |  2 +-
 4 files changed, 119 insertions(+), 24 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index fbecbd089d75..d8676e56fa23 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -137,6 +137,64 @@ struct crush_bucket {
 
 };
 
+/** @ingroup API
+ *
+ * Replacement weights for each item in a bucket. The size of the
+ * array must be exactly the size of the straw2 bucket, just as the
+ * item_weights array.
+ *
+ */
+struct crush_weight_set {
+	__u32 *weights; /*!< 16.16 fixed point weights
+                             in the same order as items */
+	__u32 size;     /*!< size of the __weights__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for a given straw2 bucket, for
+ * placement purposes.
+ *
+ * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
+ * replacement weights found at __weight_set[N]__ are used instead of
+ * the weights from __item_weights__. If __N__ is greater than
+ * __weight_set_size__, the weights found at __weight_set_size-1__ are
+ * used instead. For instance if __weight_set__ is:
+ *
+ *    [ [ 0x10000, 0x20000 ],   // position 0
+ *      [ 0x20000, 0x40000 ] ]  // position 1
+ *
+ * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
+ * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * etc.
+ *
+ */
+struct crush_choose_arg {
+	__s32 *ids;            /*!< values to use instead of items */
+	__u32 ids_size;        /*!< size of the __ids__ array */
+	struct crush_weight_set *weight_set; /*!< weight replacements for
+                                                  a given position */
+	__u32 weight_set_size; /*!< size of the __weight_set__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for each bucket in the crushmap. The
+ * __size__ of the __args__ array must be exactly the same as the
+ * __map->max_buckets__.
+ *
+ * The __crush_choose_arg__ at index N will be used when choosing
+ * an item from the bucket __map->buckets[N]__ bucket, provided it
+ * is a straw2 bucket.
+ *
+ */
+struct crush_choose_arg_map {
+	struct crush_choose_arg *args; /*!< replacement for each bucket
+                                            in the crushmap */
+	__u32 size;                    /*!< size of the __args__ array */
+};
+
 struct crush_bucket_uniform {
 	struct crush_bucket h;
 	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index c95e19e1ff11..141edabb947e 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -11,11 +11,10 @@
 #include "crush.h"
 
 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
-extern int crush_do_rule(const struct crush_map *map,
-			 int ruleno,
-			 int x, int *result, int result_max,
-			 const __u32 *weights, int weight_max,
-			 void *cwin);
+int crush_do_rule(const struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  const __u32 *weight, int weight_max,
+		  void *cwin, const struct crush_choose_arg *choose_args);
 
 /*
  * Returns the exact amount of workspace that will need to be used
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b5cd8c21bfdf..0b2646a9cc50 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
+static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+				     const struct crush_choose_arg *arg,
+				     int position)
+{
+	if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+		return bucket->item_weights;
+
+	if (position >= arg->weight_set_size)
+		position = arg->weight_set_size - 1;
+	return arg->weight_set[position].weights;
+}
+
+static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+				 const struct crush_choose_arg *arg)
+{
+	if (!arg || !arg->ids)
+		return bucket->h.items;
+
+	return arg->ids;
+}
+
 static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
-				int x, int r)
+				int x, int r,
+				const struct crush_choose_arg *arg,
+				int position)
 {
 	unsigned int i, high = 0;
 	unsigned int u;
-	unsigned int w;
 	__s64 ln, draw, high_draw = 0;
+	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
+	__s32 *ids = get_choose_arg_ids(bucket, arg);
 
 	for (i = 0; i < bucket->h.size; i++) {
-		w = bucket->item_weights[i];
-		if (w) {
-			u = crush_hash32_3(bucket->h.hash, x,
-					   bucket->h.items[i], r);
+		dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
+		if (weights[i]) {
+			u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
 			u &= 0xffff;
 
 			/*
@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 			 * weight means a larger (less negative) value
 			 * for draw.
 			 */
-			draw = div64_s64(ln, w);
+			draw = div64_s64(ln, weights[i]);
 		} else {
 			draw = S64_MIN;
 		}
@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 
 static int crush_bucket_choose(const struct crush_bucket *in,
 			       struct crush_work_bucket *work,
-			       int x, int r)
+			       int x, int r,
+			       const struct crush_choose_arg *arg,
+			       int position)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 	BUG_ON(in->size == 0);
@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
 	case CRUSH_BUCKET_STRAW2:
 		return bucket_straw2_choose(
 			(const struct crush_bucket_straw2 *)in,
-			x, r);
+			x, r, arg, position);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       unsigned int vary_r,
 			       unsigned int stable,
 			       int *out2,
-			       int parent_r)
+			       int parent_r,
+			       const struct crush_choose_arg *choose_args)
 {
 	int rep;
 	unsigned int ftotal, flocal;
@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map,
 				else
 					item = crush_bucket_choose(
 						in, work->work[-1-in->id],
-						x, r);
+						x, r,
+						(choose_args ?
+						 &choose_args[-1-in->id] : 0),
+						outpos);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					skip_rep = 1;
@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map,
 							    vary_r,
 							    stable,
 							    NULL,
-							    sub_r) <= outpos)
+							    sub_r,
+							    choose_args) <= outpos)
 							/* didn't get leaf */
 							reject = 1;
 					} else {
@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map,
 			       unsigned int recurse_tries,
 			       int recurse_to_leaf,
 			       int *out2,
-			       int parent_r)
+			       int parent_r,
+			       const struct crush_choose_arg *choose_args)
 {
 	const struct crush_bucket *in = bucket;
 	int endpos = outpos + left;
@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map,
 
 				item = crush_bucket_choose(
 					in, work->work[-1-in->id],
-					x, r);
+					x, r,
+					(choose_args ?
+					 &choose_args[-1-in->id] : 0),
+					outpos);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					out[rep] = CRUSH_ITEM_NONE;
@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map,
 							x, 1, numrep, 0,
 							out2, rep,
 							recurse_tries, 0,
-							0, NULL, r);
+							0, NULL, r,
+							choose_args);
 						if (out2[rep] == CRUSH_ITEM_NONE) {
 							/* placed nothing; no leaf */
 							break;
@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v)
  * @weight: weight vector (for map leaves)
  * @weight_max: size of weight vector
  * @cwin: pointer to at least crush_work_size() bytes of memory
+ * @choose_args: weights and ids for each known bucket
  */
 int crush_do_rule(const struct crush_map *map,
 		  int ruleno, int x, int *result, int result_max,
 		  const __u32 *weight, int weight_max,
-		  void *cwin)
+		  void *cwin, const struct crush_choose_arg *choose_args)
 {
 	int result_len;
 	struct crush_work *cw = cwin;
@@ -1013,7 +1049,8 @@ int crush_do_rule(const struct crush_map *map,
 						vary_r,
 						stable,
 						c+osize,
-						0);
+						0,
+						choose_args);
 				} else {
 					out_size = ((numrep < (result_max-osize)) ?
 						    numrep : (result_max-osize));
@@ -1030,7 +1067,8 @@ int crush_do_rule(const struct crush_map *map,
 						   choose_leaf_tries : 1,
 						recurse_to_leaf,
 						c+osize,
-						0);
+						0,
+						choose_args);
 					osize += out_size;
 				}
 			}
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 93baa69407c5..9da0ee61aca5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2111,7 +2111,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
 	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_workspace);
+			  weight, weight_max, map->crush_workspace, NULL);
 	mutex_unlock(&map->crush_workspace_mutex);
 
 	return r;
-- 
cgit v1.2.3-59-g8ed1b


From 5cf9c4a9959b6273675310d14a834ef14fbca37c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 22 Jun 2017 19:44:05 +0200
Subject: libceph, crush: per-pool crush_choose_arg_map for crush_do_rule()

If there is no crush_choose_arg_map for a given pool, a NULL pointer is
passed to preserve existing crush_do_rule() behavior.

Reflects ceph.git commits 55fb91d64071552ea1bc65ab4ea84d3c8b73ab4b,
                          dbe36e08be00c6519a8c89718dd47b0219c20516.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h |   8 ++
 net/ceph/crush/crush.c      |   3 +
 net/ceph/osdmap.c           | 200 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 208 insertions(+), 3 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index d8676e56fa23..92e165d417a6 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -2,6 +2,7 @@
 #define CEPH_CRUSH_CRUSH_H
 
 #ifdef __KERNEL__
+# include <linux/rbtree.h>
 # include <linux/types.h>
 #else
 # include "crush_compat.h"
@@ -190,6 +191,10 @@ struct crush_choose_arg {
  *
  */
 struct crush_choose_arg_map {
+#ifdef __KERNEL__
+	struct rb_node node;
+	u64 choose_args_index;
+#endif
 	struct crush_choose_arg *args; /*!< replacement for each bucket
                                             in the crushmap */
 	__u32 size;                    /*!< size of the __args__ array */
@@ -294,6 +299,9 @@ struct crush_map {
 	__u32 allowed_bucket_algs;
 
 	__u32 *choose_tries;
+#else
+	/* CrushWrapper::choose_args */
+	struct rb_root choose_args;
 #endif
 };
 
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 5bf94c04f645..4b428f46a8ca 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -1,6 +1,7 @@
 #ifdef __KERNEL__
 # include <linux/slab.h>
 # include <linux/crush/crush.h>
+void clear_choose_args(struct crush_map *c);
 #else
 # include "crush_compat.h"
 # include "crush.h"
@@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map)
 
 #ifndef __KERNEL__
 	kfree(map->choose_tries);
+#else
+	clear_choose_args(map);
 #endif
 	kfree(map);
 }
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9da0ee61aca5..f630d1072299 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -138,6 +138,177 @@ bad:
 	return -EINVAL;
 }
 
+static struct crush_choose_arg_map *alloc_choose_arg_map(void)
+{
+	struct crush_choose_arg_map *arg_map;
+
+	arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
+	if (!arg_map)
+		return NULL;
+
+	RB_CLEAR_NODE(&arg_map->node);
+	return arg_map;
+}
+
+static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
+{
+	if (arg_map) {
+		int i, j;
+
+		WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
+
+		for (i = 0; i < arg_map->size; i++) {
+			struct crush_choose_arg *arg = &arg_map->args[i];
+
+			for (j = 0; j < arg->weight_set_size; j++)
+				kfree(arg->weight_set[j].weights);
+			kfree(arg->weight_set);
+			kfree(arg->ids);
+		}
+		kfree(arg_map->args);
+		kfree(arg_map);
+	}
+}
+
+DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
+		node);
+
+void clear_choose_args(struct crush_map *c)
+{
+	while (!RB_EMPTY_ROOT(&c->choose_args)) {
+		struct crush_choose_arg_map *arg_map =
+		    rb_entry(rb_first(&c->choose_args),
+			     struct crush_choose_arg_map, node);
+
+		erase_choose_arg_map(&c->choose_args, arg_map);
+		free_choose_arg_map(arg_map);
+	}
+}
+
+static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
+{
+	u32 *a = NULL;
+	u32 len;
+	int ret;
+
+	ceph_decode_32_safe(p, end, len, e_inval);
+	if (len) {
+		u32 i;
+
+		a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
+		if (!a) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+		for (i = 0; i < len; i++)
+			a[i] = ceph_decode_32(p);
+	}
+
+	*plen = len;
+	return a;
+
+e_inval:
+	ret = -EINVAL;
+fail:
+	kfree(a);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Assumes @arg is zero-initialized.
+ */
+static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
+{
+	int ret;
+
+	ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
+	if (arg->weight_set_size) {
+		u32 i;
+
+		arg->weight_set = kmalloc_array(arg->weight_set_size,
+						sizeof(*arg->weight_set),
+						GFP_NOIO);
+		if (!arg->weight_set)
+			return -ENOMEM;
+
+		for (i = 0; i < arg->weight_set_size; i++) {
+			struct crush_weight_set *w = &arg->weight_set[i];
+
+			w->weights = decode_array_32_alloc(p, end, &w->size);
+			if (IS_ERR(w->weights)) {
+				ret = PTR_ERR(w->weights);
+				w->weights = NULL;
+				return ret;
+			}
+		}
+	}
+
+	arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
+	if (IS_ERR(arg->ids)) {
+		ret = PTR_ERR(arg->ids);
+		arg->ids = NULL;
+		return ret;
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int decode_choose_args(void **p, void *end, struct crush_map *c)
+{
+	struct crush_choose_arg_map *arg_map = NULL;
+	u32 num_choose_arg_maps, num_buckets;
+	int ret;
+
+	ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
+	while (num_choose_arg_maps--) {
+		arg_map = alloc_choose_arg_map();
+		if (!arg_map) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		ceph_decode_64_safe(p, end, arg_map->choose_args_index,
+				    e_inval);
+		arg_map->size = c->max_buckets;
+		arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
+					GFP_NOIO);
+		if (!arg_map->args) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		ceph_decode_32_safe(p, end, num_buckets, e_inval);
+		while (num_buckets--) {
+			struct crush_choose_arg *arg;
+			u32 bucket_index;
+
+			ceph_decode_32_safe(p, end, bucket_index, e_inval);
+			if (bucket_index >= arg_map->size)
+				goto e_inval;
+
+			arg = &arg_map->args[bucket_index];
+			ret = decode_choose_arg(p, end, arg);
+			if (ret)
+				goto fail;
+		}
+
+		insert_choose_arg_map(&c->choose_args, arg_map);
+	}
+
+	return 0;
+
+e_inval:
+	ret = -EINVAL;
+fail:
+	free_choose_arg_map(arg_map);
+	return ret;
+}
+
 static void crush_finalize(struct crush_map *c)
 {
 	__s32 b;
@@ -179,6 +350,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	c->choose_args = RB_ROOT;
+
         /* set tunables to default values */
         c->choose_local_tries = 2;
         c->choose_local_fallback_tries = 5;
@@ -372,6 +545,21 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	dout("crush decode tunable chooseleaf_stable = %d\n",
 	     c->chooseleaf_stable);
 
+	if (*p != end) {
+		/* class_map */
+		ceph_decode_skip_map(p, end, 32, 32, bad);
+		/* class_name */
+		ceph_decode_skip_map(p, end, 32, string, bad);
+		/* class_bucket */
+		ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
+	}
+
+	if (*p != end) {
+		err = decode_choose_args(p, end, c);
+		if (err)
+			goto bad;
+	}
+
 done:
 	crush_finalize(c);
 	dout("crush_decode success\n");
@@ -2103,15 +2291,21 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
 
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
-		    const __u32 *weight, int weight_max)
+		    const __u32 *weight, int weight_max,
+		    u64 choose_args_index)
 {
+	struct crush_choose_arg_map *arg_map;
 	int r;
 
 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
+	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+					choose_args_index);
+
 	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_workspace, NULL);
+			  weight, weight_max, map->crush_workspace,
+			  arg_map ? arg_map->args : NULL);
 	mutex_unlock(&map->crush_workspace_mutex);
 
 	return r;
@@ -2181,7 +2375,7 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
 	}
 
 	len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
-		       osdmap->osd_weight, osdmap->max_osd);
+		       osdmap->osd_weight, osdmap->max_osd, pi->id);
 	if (len < 0) {
 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
 		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
-- 
cgit v1.2.3-59-g8ed1b


From b88ed8d84fbd1e652cc7a1f6e03550d2b4edf653 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 22 Jun 2017 19:44:05 +0200
Subject: crush: crush_init_workspace starts with struct crush_work

It is not just a pointer to crush_work, it is the whole structure.
That is not a problem since it only contains a pointer. But it will
be a problem if new data members are added to crush_work.

Reflects ceph.git commit ee957dd431bfbeb6dadaf77764db8e0757417328.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/crush/mapper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 0b2646a9cc50..9887fce04219 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -858,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v)
 	 * set the pointer first and then reserve the space for it to
 	 * point to by incrementing the point.
 	 */
-	v += sizeof(struct crush_work *);
+	v += sizeof(struct crush_work);
 	w->work = v;
 	v += map->max_buckets * sizeof(struct crush_work_bucket *);
 	for (b = 0; b < map->max_buckets; ++b) {
-- 
cgit v1.2.3-59-g8ed1b


From 9eebe45c091e2dff22d4bd87360a624303148ed1 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 22 Jun 2017 19:44:05 +0200
Subject: crush: remove an obsolete comment

Reflects ceph.git commit dca1ae1e0a6b02029c3a7f9dec4114972be26d50.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/crush/mapper.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 9887fce04219..746b145bfd11 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -1004,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map,
 
 			for (i = 0; i < wsize; i++) {
 				int bno;
-				/*
-				 * see CRUSH_N, CRUSH_N_MINUS macros.
-				 * basically, numrep <= 0 means relative to
-				 * the provided result_max
-				 */
 				numrep = curstep->arg1;
 				if (numrep <= 0) {
 					numrep += result_max;
-- 
cgit v1.2.3-59-g8ed1b


From 0bb05da2ec57163b7a25efef001ed8f52b18b070 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 22 Jun 2017 19:44:06 +0200
Subject: libceph: osd_state is 32 bits wide in luminous

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osdmap.h |  4 ++--
 net/ceph/debugfs.c          |  2 +-
 net/ceph/osdmap.c           | 29 ++++++++++++++++++++---------
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index c612cff81f5c..a0996cb9faed 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -162,7 +162,7 @@ struct ceph_osdmap {
 	u32 flags;         /* CEPH_OSDMAP_* */
 
 	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-	u8 *osd_state;     /* CEPH_OSD_* */
+	u32 *osd_state;    /* CEPH_OSD_* */
 	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
 	struct ceph_entity_addr *osd_addr;
 
@@ -203,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
 	return !ceph_osd_is_up(map, osd);
 }
 
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
+char *ceph_osdmap_state_str(char *str, int len, u32 state);
 extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
 
 static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 4f57d5bcaba2..fa5233e0d01c 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 	}
 	for (i = 0; i < map->max_osd; i++) {
 		struct ceph_entity_addr *addr = &map->osd_addr[i];
-		int state = map->osd_state[i];
+		u32 state = map->osd_state[i];
 		char sb[64];
 
 		seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f630d1072299..864789c5974e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -11,7 +11,7 @@
 #include <linux/crush/hash.h>
 #include <linux/crush/mapper.h>
 
-char *ceph_osdmap_state_str(char *str, int len, int state)
+char *ceph_osdmap_state_str(char *str, int len, u32 state)
 {
 	if (!len)
 		return str;
@@ -984,7 +984,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
  */
 static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 {
-	u8 *state;
+	u32 *state;
 	u32 *weight;
 	struct ceph_entity_addr *addr;
 	int i;
@@ -1484,13 +1484,21 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 
 	/* osd_state, osd_weight, osd_addrs->client_addr */
 	ceph_decode_need(p, end, 3*sizeof(u32) +
-			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+			 map->max_osd*((struct_v >= 5 ? sizeof(u32) :
+							sizeof(u8)) +
+				       sizeof(*map->osd_weight) +
 				       sizeof(*map->osd_addr)), e_inval);
 
 	if (ceph_decode_32(p) != map->max_osd)
 		goto e_inval;
 
-	ceph_decode_copy(p, map->osd_state, map->max_osd);
+	if (struct_v >= 5) {
+		for (i = 0; i < map->max_osd; i++)
+			map->osd_state[i] = ceph_decode_32(p);
+	} else {
+		for (i = 0; i < map->max_osd; i++)
+			map->osd_state[i] = ceph_decode_8(p);
+	}
 
 	if (ceph_decode_32(p) != map->max_osd)
 		goto e_inval;
@@ -1598,7 +1606,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  *     new_up_client: { osd=6, addr=... } # set osd_state and addr
  *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
  */
-static int decode_new_up_state_weight(void **p, void *end,
+static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 				      struct ceph_osdmap *map)
 {
 	void *new_up_client;
@@ -1614,7 +1622,7 @@ static int decode_new_up_state_weight(void **p, void *end,
 
 	new_state = *p;
 	ceph_decode_32_safe(p, end, len, e_inval);
-	len *= sizeof(u32) + sizeof(u8);
+	len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
 	ceph_decode_need(p, end, len, e_inval);
 	*p += len;
 
@@ -1650,11 +1658,14 @@ static int decode_new_up_state_weight(void **p, void *end,
 	len = ceph_decode_32(p);
 	while (len--) {
 		s32 osd;
-		u8 xorstate;
+		u32 xorstate;
 		int ret;
 
 		osd = ceph_decode_32(p);
-		xorstate = ceph_decode_8(p);
+		if (struct_v >= 5)
+			xorstate = ceph_decode_32(p);
+		else
+			xorstate = ceph_decode_8(p);
 		if (xorstate == 0)
 			xorstate = CEPH_OSD_UP;
 		BUG_ON(osd >= map->max_osd);
@@ -1788,7 +1799,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	}
 
 	/* new_up_client, new_state, new_weight */
-	err = decode_new_up_state_weight(p, end, map);
+	err = decode_new_up_state_weight(p, end, struct_v, map);
 	if (err)
 		goto bad;
 
-- 
cgit v1.2.3-59-g8ed1b


From 33e9c8dbfbcef8e4cda8e43a445e692ab7e0d8c0 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 26 Jun 2017 12:05:55 +0200
Subject: libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS

All four SERVER_LUMINOUS feature bits are implemented, switch it on!

NEW_OSDOP_ENCODING doesn't mean much for the client (it signifies
support for MOSDOp v6) but needs to be enabled in order to get the
latest (currently v25) pg_pool_t.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Acked-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/ceph_features.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 78a58770e6e9..f0f6c537b64c 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -184,6 +184,11 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_PGPOOL3 |			\
 	 CEPH_FEATURE_OSDENC |			\
 	 CEPH_FEATURE_CRUSH_TUNABLES |		\
+	 CEPH_FEATURE_SERVER_LUMINOUS |		\
+	 CEPH_FEATURE_RESEND_ON_SPLIT |		\
+	 CEPH_FEATURE_RADOS_BACKOFF |		\
+	 CEPH_FEATURE_OSDMAP_PG_UPMAP |		\
+	 CEPH_FEATURE_CRUSH_CHOOSE_ARGS |	\
 	 CEPH_FEATURE_MSG_AUTH |		\
 	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
 	 CEPH_FEATURE_REPLY_CREATE_INODE |	\
@@ -199,6 +204,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_MSGR_KEEPALIVE2 |		\
 	 CEPH_FEATURE_OSD_POOLRESEND |		\
 	 CEPH_FEATURE_CRUSH_V4 |		\
+	 CEPH_FEATURE_NEW_OSDOP_ENCODING |	\
 	 CEPH_FEATURE_SERVER_JEWEL |		\
 	 CEPH_FEATURE_MON_STATEFUL_SUB |	\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
-- 
cgit v1.2.3-59-g8ed1b