From 3ed97d6345a36a0a61e6af62ad8a66ca40f1aa2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 26 Apr 2016 15:05:29 +0200 Subject: libceph: make ceph_osdc_put_request() accept NULL Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4801571f51cb..3e61fc8bb371 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1099,8 +1099,7 @@ release_pvec_pages: mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1824,10 +1823,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; -- cgit v1.2.3-59-g8ed1b From 13d1ad16d05eebb4db977eb955716b9da2c19fbd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Apr 2016 14:15:51 +0200 Subject: libceph: move message allocation out of ceph_osdc_alloc_request() The size of ->r_request and ->r_reply messages depends on the size of the object name (ceph_object_id), while the size of ceph_osd_request is fixed. Move message allocation into a separate function that would have to be called after ceph_object_id and ceph_object_locator (which is also going to become variable in size with RADOS namespaces) have been filled in: req = ceph_osdc_alloc_request(...); <fill in req->r_base_oid> <fill in req->r_base_oloc> ceph_osdc_alloc_messages(req); Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 18 ++++++++- fs/ceph/addr.c | 8 ++++ fs/ceph/file.c | 7 ++++ include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 88 +++++++++++++++++++++++------------------ 5 files changed, 82 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c3089f32a392..bda4deade82e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1954,7 +1954,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; @@ -1967,7 +1967,14 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } /* @@ -2003,7 +2010,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, false, GFP_NOIO); if (!osd_req) - return NULL; /* ENOMEM */ + goto fail; osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; osd_req->r_callback = rbd_osd_req_callback; @@ -2012,7 +2019,14 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) + goto fail; + return osd_req; + +fail: + ceph_osdc_put_request(osd_req); + return NULL; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index
3e61fc8bb371..6fee7e0b8931 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1762,6 +1762,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) "%llx.00000000", ci->i_vino.ino); rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { @@ -1775,6 +1779,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_base_oloc.pool = pool; wr_req->r_base_oid = rd_req->r_base_oid; + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; + /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a79f9269831e..5d46d106bbb7 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -717,6 +717,13 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_base_oloc = orig_req->r_base_oloc; req->r_base_oid = orig_req->r_base_oid; + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } + req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c42..66a1fcd5bff7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -322,6 +322,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, struct ceph_snap_context *snapc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccb9539dc780..d66dacc9d0d4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -369,8 +369,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, gfp_t gfp_flags) { struct ceph_osd_request *req; - struct ceph_msg *msg; - size_t msg_size; if (use_mempool) { BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); @@ -407,53 +405,59 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req->r_base_oloc.pool = -1; req->r_target_oloc.pool = -1; - msg_size = OSD_OPREPLY_FRONT_LEN; - if (num_ops > CEPH_OSD_SLAB_OPS) { - /* ceph_osd_op and rval */ - msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * - (sizeof(struct ceph_osd_op) + 4); - } + dout("%s req %p\n", __func__, req); + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, - gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_msg *msg; + int msg_size; + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 4 + req->r_base_oid.name_len; /* oid */ + msg_size += 2 + req->r_num_ops * 
sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ - msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); msg_size += 4; /* retry_attempt */ - /* create request message; allow space for oid */ - if (use_mempool) + if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + if (!msg) + return -ENOMEM; memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; - return req; + /* create reply message */ + msg_size = OSD_OPREPLY_FRONT_LEN; + if (req->r_num_ops > CEPH_OSD_SLAB_OPS) { + /* ceph_osd_op and rval */ + msg_size += (req->r_num_ops - CEPH_OSD_SLAB_OPS) * + (sizeof(struct ceph_osd_op) + 4); + } + + if (req->r_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + if (!msg) + return -ENOMEM; + + req->r_reply = msg; + + return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_request); +EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -828,17 +832,17 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); - if (!req) - return ERR_PTR(-ENOMEM); + if (!req) { + r = -ENOMEM; + goto fail; + } req->r_flags = flags; /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); - if (r < 0) { - ceph_osdc_put_request(req); - return ERR_PTR(r); - } + if (r) + goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); @@ -864,7 +868,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, "%llx.%08llx", vino.ino, objnum); req->r_base_oid.name_len = strlen(req->r_base_oid.name); + r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (r) + goto fail; + return req; + +fail: + ceph_osdc_put_request(req); + return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); -- cgit v1.2.3-59-g8ed1b From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 19:54:20 +0200 Subject: libceph: variable-sized ceph_object_id Currently ceph_object_id can hold object names of up to 100 (CEPH_MAX_OID_NAME_LEN) characters. This is enough for all use cases, except one - long rbd image names: - a format 1 header is named "<imgname>.rbd" - an object that points to a format 2 header is named "rbd_id.<imgname>" We operate on these potentially long-named objects during rbd map, and, for format 1 images, during header refresh. (A format 2 header name is a small system-generated string.) Lift this 100 character limit by making ceph_object_id be able to point to an externally-allocated string. Apart from being able to work with almost arbitrarily-long named objects, this allows us to reduce the size of ceph_object_id from >100 bytes to 64 bytes.
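For illustration, a minimal sketch of how the new helpers are meant to be used (ino, objnum and image_name are stand-in variables, not part of the patch): short names are formatted into the inline buffer, long ones spill into an externally-allocated one.

    struct ceph_object_id oid;

    ceph_oid_init(&oid);            /* oid.name points at oid.inline_name */

    /* cephfs-style names always fit inline; ceph_oid_printf()
     * BUGs if the result doesn't fit */
    ceph_oid_printf(&oid, "%llx.%08llx", ino, objnum);
    ceph_oid_destroy(&oid);         /* no-op for inline names */

    ceph_oid_init(&oid);
    /* a long rbd header name may not fit and gets kmalloc'ed */
    if (ceph_oid_aprintf(&oid, GFP_NOIO, "rbd_id.%s", image_name))
        return -ENOMEM;
    ceph_oid_destroy(&oid);         /* kfree()s the external buffer */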
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 +++- fs/ceph/addr.c | 6 +-- fs/ceph/file.c | 2 +- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 62 ++++++++++++++++++------------ net/ceph/debugfs.c | 2 +- net/ceph/osd_client.c | 16 +++++--- net/ceph/osdmap.c | 93 ++++++++++++++++++++++++++++++++++++++++++++- 8 files changed, 150 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bda4deade82e..3bf93a2a20f0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1965,7 +1965,9 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; @@ -2017,7 +2019,9 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_priv = obj_request; osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", + obj_request->object_name)) + goto fail; if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) goto fail; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6fee7e0b8931..6f28dd9bacb2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1758,9 +1758,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); if (err) @@ -1777,7 +1775,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); if (err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5d46d106bbb7..9d470397e249 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -715,7 +715,7 @@ static void ceph_aio_retry_work(struct work_struct *work) CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); if (ret) { diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..db296709784a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -213,7 +213,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a96..777a29412706 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -64,11 +64,47 @@ struct ceph_object_locator { 
*/ #define CEPH_MAX_OID_NAME_LEN 100 +/* + * 51-char inline_name is long enough for all cephfs and all but one + * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be + * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all + * other rbd requests fit into inline_name. + * + * Makes ceph_object_id 64 bytes on 64-bit. + */ +#define CEPH_OID_INLINE_LEN 52 + +/* + * Both inline and external buffers have space for a NUL-terminator, + * which is carried around. It's not required though - RADOS object + * names don't have to be NUL-terminated and may contain NULs. + */ struct ceph_object_id { - char name[CEPH_MAX_OID_NAME_LEN]; + char *name; + char inline_name[CEPH_OID_INLINE_LEN]; int name_len; }; +static inline void ceph_oid_init(struct ceph_object_id *oid) +{ + oid->name = oid->inline_name; + oid->name_len = 0; +} + +static inline bool ceph_oid_empty(const struct ceph_object_id *oid) +{ + return oid->name == oid->inline_name && !oid->name_len; +} + +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src); +__printf(2, 3) +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); +__printf(3, 4) +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...); +void ceph_oid_destroy(struct ceph_object_id *oid); + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -113,30 +149,6 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline void ceph_oid_set_name(struct ceph_object_id *oid, - const char *name) -{ - int len; - - len = strlen(name); - if (len > sizeof(oid->name)) { - WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", - name, len, sizeof(oid->name)); - len = sizeof(oid->name); - } - - memcpy(oid->name, name, len); - oid->name_len = len; -} - -static inline void ceph_oid_copy(struct ceph_object_id *dest, - struct ceph_object_id *src) -{ - BUG_ON(src->name_len > sizeof(dest->name)); - memcpy(dest->name, src->name, src->name_len); - dest->name_len = src->name_len; -} - static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index b902fbc7863e..6f8413293d15 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -161,7 +161,7 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ?
req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_base_oid.name_len, + seq_printf(s, "%*pE", req->r_base_oid.name_len, req->r_base_oid.name); if (req->r_reassert_version.epoch) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 75e27bd3d372..95910aed8e2e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -334,7 +334,10 @@ static void ceph_osdc_release_request(struct kref *kref) for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); + ceph_oid_destroy(&req->r_base_oid); + ceph_oid_destroy(&req->r_target_oid); ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) @@ -401,7 +404,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + ceph_oid_init(&req->r_base_oid); req->r_base_oloc.pool = -1; + ceph_oid_init(&req->r_target_oid); req->r_target_oloc.pool = -1; dout("%s req %p\n", __func__, req); @@ -415,6 +420,8 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) struct ceph_msg *msg; int msg_size; + WARN_ON(ceph_oid_empty(&req->r_base_oid)); + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ @@ -859,10 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - - snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), - "%llx.%08llx", vino.ino, objnum); - req->r_base_oid.name_len = strlen(req->r_base_oid.name); + ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) @@ -1410,7 +1414,7 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, req->r_target_oloc = req->r_base_oloc; /* struct */ need_check_tiering = true; } - if (req->r_target_oid.name_len == 0) { + if (ceph_oid_empty(&req->r_target_oid)) { ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); need_check_tiering = true; } @@ -2501,7 +2505,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, /* oid */ ceph_encode_32(&p, req->r_base_oid.name_len); memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + dout("oid %*pE len %d\n", req->r_base_oid.name_len, req->r_base_oid.name, req->r_base_oid.name_len); p += req->r_base_oid.name_len; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8cf33..4668b871ca47 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1381,8 +1381,99 @@ bad: return ERR_PTR(err); } +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src) +{ + WARN_ON(!ceph_oid_empty(dest)); + + if (src->name != src->inline_name) { + /* very rare, see ceph_object_id definition */ + dest->name = kmalloc(src->name_len + 1, + GFP_NOIO | __GFP_NOFAIL); + } + memcpy(dest->name, src->name, src->name_len + 1); + dest->name_len = src->name_len; +} +EXPORT_SYMBOL(ceph_oid_copy); +static __printf(2, 0) +int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) +{ + int len; + + WARN_ON(!ceph_oid_empty(oid)); + + len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); + if (len >= sizeof(oid->inline_name)) + return len; + + oid->name_len = len; + return 0; +} + +/* + * If oid doesn't fit into inline buffer, BUG. 
+ */ +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + BUG_ON(oid_printf_vargs(oid, fmt, ap)); + va_end(ap); +} +EXPORT_SYMBOL(ceph_oid_printf); + +static __printf(3, 0) +int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, va_list ap) +{ + va_list aq; + int len; + + va_copy(aq, ap); + len = oid_printf_vargs(oid, fmt, aq); + va_end(aq); + + if (len) { + char *external_name; + + external_name = kmalloc(len + 1, gfp); + if (!external_name) + return -ENOMEM; + + oid->name = external_name; + WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); + oid->name_len = len; + } + + return 0; +} + +/* + * If oid doesn't fit into inline buffer, allocate. + */ +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = oid_aprintf_vargs(oid, gfp, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL(ceph_oid_aprintf); + +void ceph_oid_destroy(struct ceph_object_id *oid) +{ + if (oid->name != oid->inline_name) + kfree(oid->name); +} +EXPORT_SYMBOL(ceph_oid_destroy); /* * calculate file layout from given offset, length. @@ -1474,7 +1565,7 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); - dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, pg_out->pool, pg_out->seed); return 0; } -- cgit v1.2.3-59-g8ed1b From fcd00b68bbe2bf5606cb45c2cd4a250a390bcc1f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: DEFINE_RB_FUNCS macro Given struct foo { u64 id; struct rb_node bar_node; }; generate insert_bar(), erase_bar() and lookup_bar() functions with DEFINE_RB_FUNCS(bar, struct foo, id, bar_node) The key is assumed to be an integer (u64, int, etc), compared with < and >. nodefld has to be initialized with RB_CLEAR_NODE(). Start using it for MDS, MON and OSD requests and OSD sessions. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 54 ++++++------------------ include/linux/ceph/libceph.h | 57 ++++++++++++++++++++++++++ net/ceph/mon_client.c | 52 ++++-------------------- net/ceph/osd_client.c | 97 +++++--------------------------------------- 4 files changed, 88 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..cff85af425d4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -567,51 +567,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. 
*/ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +602,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +635,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -1722,6 +1693,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2386,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2576,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926e..690985daad1c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } +/* + * These are not meant to be generic - an integer key is assumed. 
+ */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +static void insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + \ + parent = *n; \ + if (t->keyfld < cur->keyfld) \ + n = &(*n)->rb_left; \ + else if (t->keyfld > cur->keyfld) \ + n = &(*n)->rb_right; \ + else \ + BUG(); \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +static type *lookup_##name(struct rb_root *root, \ + typeof(((type *)0)->keyfld) key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + \ + if (key < cur->keyfld) \ + n = n->rb_left; \ + else if (key > cur->keyfld) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) + extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 3dfafdad92aa..a426a4b03e75 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -478,45 +478,7 @@ out: /* * generic requests (currently statfs, mon_get_version) */ -static struct ceph_mon_generic_request *__lookup_generic_req( - struct ceph_mon_client *monc, u64 tid) -{ - struct ceph_mon_generic_request *req; - struct rb_node *n = monc->generic_request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mon_generic_request, node); - if (tid < req->tid) - n = n->rb_left; - else if (tid > req->tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static void __insert_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *new) -{ - struct rb_node **p = &monc->generic_request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mon_generic_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mon_generic_request, node); - if (new->tid < req->tid) - p = &(*p)->rb_left; - else if (new->tid > req->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->generic_request_tree); -} +DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node) static void release_generic_request(struct kref *kref) { @@ -551,7 +513,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg *m; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (!req) { dout("get_generic_reply %lld dne\n", tid); *skip = 1; @@ -578,14 +540,14 @@ static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, /* register request */ req->tid = tid != 0 ? 
tid : ++monc->last_tid; req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); + insert_generic_request(&monc->generic_request_tree, req); ceph_con_send(&monc->con, ceph_msg_get(req->request)); mutex_unlock(&monc->mutex); err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); + erase_generic_request(&monc->generic_request_tree, req); if (!err) err = req->result; @@ -619,7 +581,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (req) { *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; @@ -651,6 +613,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = buf; init_completion(&req->completion); @@ -696,7 +659,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, goto bad; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, handle); + req = lookup_generic_request(&monc->generic_request_tree, handle); if (req) { *(u64 *)req->buf = ceph_decode_64(&p); req->result = 0; @@ -732,6 +695,7 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, return -ENOMEM; kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); req->buf = newest; init_completion(&req->completion); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index c423e11d6857..8256051ed88f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -875,45 +875,7 @@ EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. 
*/ -static void __insert_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *new) -{ - struct rb_node **p = &osdc->requests.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_osd_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &osdc->requests); -} - -static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else - return req; - } - return NULL; -} +DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) static struct ceph_osd_request * __lookup_request_ge(struct ceph_osd_client *osdc, @@ -1101,6 +1063,8 @@ static void put_osd(struct ceph_osd *osd) } } +DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) + /* * remove an osd from our map */ @@ -1111,8 +1075,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_linger_requests)); list_del_init(&osd->o_osd_lru); - rb_erase(&osd->o_node, &osdc->osds); - RB_CLEAR_NODE(&osd->o_node); + erase_osd(&osdc->osds, osd); } static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) @@ -1188,45 +1151,6 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) -{ - struct rb_node **p = &osdc->osds.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd *osd = NULL; - - dout("__insert_osd %p osd%d\n", new, new->o_osd); - while (*p) { - parent = *p; - osd = rb_entry(parent, struct ceph_osd, o_node); - if (new->o_osd < osd->o_osd) - p = &(*p)->rb_left; - else if (new->o_osd > osd->o_osd) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->o_node, parent, p); - rb_insert_color(&new->o_node, &osdc->osds); -} - -static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) -{ - struct ceph_osd *osd; - struct rb_node *n = osdc->osds.rb_node; - - while (n) { - osd = rb_entry(n, struct ceph_osd, o_node); - if (o < osd->o_osd) - n = n->rb_left; - else if (o > osd->o_osd) - n = n->rb_right; - else - return osd; - } - return NULL; -} - static void __schedule_osd_timeout(struct ceph_osd_client *osdc) { schedule_delayed_work(&osdc->timeout_work, @@ -1248,7 +1172,7 @@ static void __register_request(struct ceph_osd_client *osdc, req->r_tid = ++osdc->last_tid; req->r_request->hdr.tid = cpu_to_le64(req->r_tid); dout("__register_request %p tid %lld\n", req, req->r_tid); - __insert_request(osdc, req); + insert_request(&osdc->requests, req); ceph_osdc_get_request(req); osdc->num_requests++; if (osdc->num_requests == 1) { @@ -1270,8 +1194,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, } dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &osdc->requests); - RB_CLEAR_NODE(&req->r_node); + erase_request(&osdc->requests, req); osdc->num_requests--; if (req->r_osd) { @@ -1482,7 +1405,7 @@ static int __map_request(struct ceph_osd_client *osdc, req->r_osd = NULL; } - req->r_osd = __lookup_osd(osdc, o); + req->r_osd = lookup_osd(&osdc->osds, o); if 
(!req->r_osd && o >= 0) { err = -ENOMEM; req->r_osd = create_osd(osdc, o); @@ -1492,7 +1415,7 @@ static int __map_request(struct ceph_osd_client *osdc, } dout("map_request osd %p is osd%d\n", req->r_osd, o); - __insert_osd(osdc, req->r_osd); + insert_osd(&osdc->osds, req->r_osd); ceph_con_open(&req->r_osd->o_con, CEPH_ENTITY_TYPE_OSD, o, @@ -1822,7 +1745,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) /* lookup */ down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); goto bad_mutex; @@ -2880,7 +2803,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + req = lookup_request(&osdc->requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); -- cgit v1.2.3-59-g8ed1b From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: rename ceph_oloc_oid_to_pg() Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg(). Emphasise that returned is raw PG and return -ENOENT instead of -EIO if the pool doesn't exist. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 9 ++++----- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 31 ++++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index db296709784a..cca7fff22725 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -215,7 +215,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ce7a41a182d4..b70440c05b49 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8256051ed88f..cb9f1953f5fb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1324,8 +1324,8 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, - &req->r_target_oid, pg_out); + return ceph_object_locator_to_pg(osdmap, &req->r_target_oid, + &req->r_target_oloc, pg_out); } static void __enqueue_request(struct ceph_osd_request *req) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9a0cc072a909..6267839cb246 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1545,30 +1545,31 
@@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * Calculate mapping of a (oloc, oid) pair to a PG. Should only be - * called with target's (oloc, oid), since tiering isn't taken into - * account. + * Map an object into a PG. + * + * Should only be called with target_oid and target_oloc (as opposed to + * base_oid and base_oloc), since tiering isn't taken into account. */ -int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out) +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) - return -EIO; + return -ENOENT; - pg_out->pool = oloc->pool; - pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); - dout("%s %*pE pgid %llu.%x\n", __func__, oid->name_len, oid->name, - pg_out->pool, pg_out->seed); + dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len, + oid->name, raw_pgid->pool, raw_pgid->seed); return 0; } -EXPORT_SYMBOL(ceph_oloc_oid_to_pg); +EXPORT_SYMBOL(ceph_object_locator_to_pg); static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, -- cgit v1.2.3-59-g8ed1b From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: rename ceph_calc_pg_primary() Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to emphasise that it returns acting primary. Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 4 ++-- net/ceph/osdmap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index cca7fff22725..1831ad6cf066 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -221,7 +221,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 942189d311e0..3fd978a1639b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f5fc8fc63879..656384a8fd1e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1896,13 +1896,14 @@ out: } /* - * Return primary osd for given pgid, or -1 if none. + * Return acting primary for given PG, or -1 if none. 
*/ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid) { struct ceph_osds up, acting; - ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); return acting.primary; } -EXPORT_SYMBOL(ceph_calc_pg_primary); +EXPORT_SYMBOL(ceph_pg_to_acting_primary); -- cgit v1.2.3-59-g8ed1b From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: introduce ceph_osd_request_target, calc_target() Introduce ceph_osd_request_target, containing all mapping-related fields of ceph_osd_request and calc_target() for calculating mappings and populating it. Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- fs/ceph/file.c | 2 +- include/linux/ceph/osd_client.h | 23 ++++++ include/linux/ceph/osdmap.h | 34 +++++++++ include/linux/ceph/rados.h | 5 ++ net/ceph/osd_client.c | 157 +++++++++++++++++++++++++++++++++++++++- net/ceph/osdmap.c | 121 +++++++++++++++++++++++++++++++ 7 files changed, 340 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6f28dd9bacb2..c5d75486823b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d470397e249..36b4a41dfa67 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); ret = ceph_osdc_alloc_messages(req, GFP_NOFS); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df183..48806ee4488d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d2..420bb7968b25 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct 
ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d33..f28ed864e682 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0ff400a56cd6..cff3a7e29233 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -298,6 +298,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, } } +/* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + /* * requests */ @@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +static bool __pool_full(struct ceph_pg_pool_info *pi) +{ + return pi->flags & CEPH_POOL_FLAG_FULL; +} + /* * Returns whether a request should be blocked from being sent * based on the current osdmap and osd_client settings. 
@@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Calculate mapping of a request to a PG. Takes tiering into account. */ @@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, &req->r_target_oloc, pg_out); } +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } + + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } + + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; + } + + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; 
+ need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + static void __enqueue_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) redir.oloc.pool = -1; } - if (redir.oloc.pool != -1) { + if (!ceph_oloc_empty(&redir.oloc)) { dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - req->r_target_oloc = redir.oloc; /* struct */ + ceph_oloc_copy(&req->r_target_oloc, &redir.oloc); /* * Start redirect requests with nofail=true. If diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 66c3ebead92f..7d4a5b43085e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid) } EXPORT_SYMBOL(ceph_oid_destroy); +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ @@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) 
*/ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} + /* * calculate file layout from given offset, length. * fill in correct oid, logical length, and object extent -- cgit v1.2.3-59-g8ed1b From bb873b539154ab51893430b4ad6ba4051775276a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:29:52 +0200 Subject: libceph: switch to calc_target(), part 2 The crux of this is getting rid of ceph_osdc_build_request(), so that MOSDOp can be encoded not before but after calc_target() calculates the actual target. Encoding now happens within ceph_osdc_start_request(). Also nuked is the accompanying bunch of pointers into the encoded buffer that was used to update fields on each send - instead, the entire front is re-encoded. If we want to support target->name_len != base->name_len in the future, there is no other way, because oid is surrounded by other fields in the encoded buffer. Encoding OSD ops and adding data items to the request message were mixed together in osd_req_encode_op(). While we want to re-encode OSD ops, we don't want to add duplicate data items to the message when resending, so all calls to ceph_osdc_msg_data_add() are factored out into a new setup_request_data().
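In caller terms, the conversion has the following shape - a before/after sketch distilled from the hunks below, with osdc, req, off, snapc and mtime as stand-ins:

    /* before: the message front is encoded at build time from
     * passed-in values */
    ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, &mtime);
    ret = ceph_osdc_start_request(osdc, req, false);

    /* after: values are stashed on the request and the front is
     * (re)encoded inside ceph_osdc_start_request(), once calc_target()
     * has picked the actual target */
    req->r_mtime = mtime;
    req->r_data_offset = off;
    ret = ceph_osdc_start_request(osdc, req, false);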
img_request->snapc : NULL; - ceph_osdc_build_request(osd_req, obj_request->offset, - snapc, CEPH_NOSNAP, &mtime); + osd_req->r_mtime = CURRENT_TIME; + osd_req->r_data_offset = obj_request->offset; } /* diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c5d75486823b..59b3c3fbd3bd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -1063,10 +1061,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1614,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1652,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1790,12 +1785,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 36b4a41dfa67..52e4b72dd5de 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -727,8 +727,8 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -882,14 +882,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -1074,9 +1072,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1532,9 +1528,7 @@ static int ceph_zero_partial_object(struct inode *inode, 
goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03bf9d9e1517..67a37d98e0ca 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -104,7 +104,7 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; @@ -162,14 +162,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - struct ceph_eversion *r_request_reassert_version; - int r_result; int r_got_reply; int r_linger; @@ -180,16 +172,22 @@ struct ceph_osd_request { struct completion r_completion, r_safe_completion; ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ - struct ceph_snap_context *r_snapc; /* snap context for writes */ + /* internal */ + unsigned long r_stamp; /* jiffies, send or check time */ + int r_attempts; + struct ceph_eversion r_replay_version; /* aka reassert_version */ + u32 r_last_force_resend; struct ceph_osd_req_op r_ops[]; }; @@ -334,11 +332,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec *mtime); - extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f28ed864e682..28740a58f32c 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -394,6 +394,13 @@ enum { CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 0c11ab5f8c30..6d3ff713edeb 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -145,6 +145,43 @@ static int monc_show(struct seq_file *s, void *p) return 0; } +static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) +{ + int i; + 
+ seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); + for (i = 0; i < t->up.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); + seq_printf(s, "]/%d\t[", t->up.primary); + for (i = 0; i < t->acting.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); + seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, + t->target_oid.name_len, t->target_oid.name, t->flags); + if (t->paused) + seq_puts(s, "\tP"); +} + +static void dump_request(struct seq_file *s, struct ceph_osd_request *req) +{ + int i; + + seq_printf(s, "%llu\t", req->r_tid); + dump_target(s, &req->r_t); + + seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, + le32_to_cpu(req->r_replay_version.epoch), + le64_to_cpu(req->r_replay_version.version)); + + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), + ceph_osd_op_name(op->op)); + } + + seq_putc(s, '\n'); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -154,32 +191,10 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; - unsigned int i; - int opcode; req = rb_entry(p, struct ceph_osd_request, r_node); - seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - req->r_t.pgid.pool, req->r_t.pgid.seed); - - seq_printf(s, "%*pE", req->r_base_oid.name_len, - req->r_base_oid.name); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < req->r_num_ops; i++) { - opcode = req->r_ops[i].op; - seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), - ceph_osd_op_name(opcode)); - } - - seq_printf(s, "\n"); + dump_request(s, req); } mutex_unlock(&osdc->request_mutex); return 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 013101598c41..8a008f083283 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -34,8 +34,6 @@ static void __unregister_request(struct ceph_osd_client *osdc, static void __unregister_linger_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); static void __enqueue_request(struct ceph_osd_request *req); -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); /* * Implement client access to distributed object storage cluster. 
@@ -209,6 +207,8 @@ void osd_req_op_cls_request_data_pagelist( osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pagelist_init(osd_data, pagelist); + osd_req->r_ops[which].cls.indata_len += pagelist->length; + osd_req->r_ops[which].indata_len += pagelist->length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); @@ -221,6 +221,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); + osd_req->r_ops[which].cls.indata_len += length; + osd_req->r_ops[which].indata_len += length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); @@ -610,8 +612,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->cls.argc = 0; /* currently unused */ - op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -709,16 +709,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, } } -static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, unsigned int which) +static u32 osd_req_encode_op(struct ceph_osd_op *dst, + const struct ceph_osd_req_op *src) { - struct ceph_osd_req_op *src; - struct ceph_osd_data *osd_data; - u64 request_data_len = 0; - u64 data_length; - - BUG_ON(which >= req->r_num_ops); - src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -727,49 +720,23 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, switch (src->op) { case CEPH_OSD_OP_STAT: - osd_data = &src->raw_data_in; - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - ceph_osdc_msg_data_add(req->r_request, osd_data); - else - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - osd_data = &src->cls.request_info; - ceph_osdc_msg_data_add(req->r_request, osd_data); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = osd_data->pagelist->length; - - osd_data = &src->cls.request_data; - data_length = ceph_osd_data_length(osd_data); - if (data_length) { - BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); - dst->cls.indata_len = cpu_to_le32(data_length); - ceph_osdc_msg_data_add(req->r_request, osd_data); - src->indata_len += data_length; - request_data_len += data_length; - } - osd_data = &src->cls.response_data; - ceph_osdc_msg_data_add(req->r_reply, osd_data); + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; case CEPH_OSD_OP_STARTSYNC: break; @@ -791,9 +758,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; - osd_data = 
&src->xattr.osd_data; - ceph_osdc_msg_data_add(req->r_request, osd_data); - request_data_len = osd_data->pagelist->length; break; case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: @@ -810,7 +774,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->indata_len); - return request_data_len; + return src->indata_len; } /* @@ -852,8 +816,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, goto fail; } - req->r_flags = flags; - /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); if (r) @@ -877,9 +839,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_flags = flags; req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + req->r_snapid = vino.snap; + if (flags & CEPH_OSD_FLAG_WRITE) + req->r_data_offset = off; + r = ceph_osdc_alloc_messages(req, GFP_NOFS); if (r) goto fail; @@ -1509,37 +1476,173 @@ out: return err; } -/* - * caller should hold map_sem (for read) and request_mutex - */ -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void setup_request_data(struct ceph_osd_request *req, + struct ceph_msg *msg) { - void *p; + u32 data_len = 0; + int i; + + if (!list_empty(&msg->data)) + return; - dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags, - req->r_t.pgid.pool, req->r_t.pgid.seed); + WARN_ON(msg->data_length); + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + WARN_ON(op->indata_len != op->extent.length); + ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + WARN_ON(op->indata_len != op->xattr.name_len + + op->xattr.value_len); + ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + ceph_osdc_msg_data_add(req->r_reply, + &op->raw_data_in); + break; + case CEPH_OSD_OP_READ: + ceph_osdc_msg_data_add(req->r_reply, + &op->extent.osd_data); + break; + + /* both */ + case CEPH_OSD_OP_CALL: + WARN_ON(op->indata_len != op->cls.class_len + + op->cls.method_len + + op->cls.indata_len); + ceph_osdc_msg_data_add(msg, &op->cls.request_info); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(msg, &op->cls.request_data); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(req->r_reply, + &op->cls.response_data); + break; + } + + data_len += op->indata_len; + } - /* fill in message content that changes each time we send it */ - put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); - put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_t.target_oloc.pool, req->r_request_pool); - p = req->r_request_pgid; + WARN_ON(data_len != msg->data_length); +} + +static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + u32 data_len = 0; + int i; + + if (req->r_flags & CEPH_OSD_FLAG_WRITE) { + /* snapshots aren't writeable */ + WARN_ON(req->r_snapid != CEPH_NOSNAP); + } else { + WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || + req->r_data_offset || req->r_snapc); + } + + setup_request_data(req, msg); + + 
ceph_encode_32(&p, 1); /* client_inc, always 1 */ + ceph_encode_32(&p, req->r_osdc->osdmap->epoch); + ceph_encode_32(&p, req->r_flags); + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); + /* aka reassert_version */ + memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); + p += sizeof(req->r_replay_version); + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + ceph_encode_64(&p, req->r_t.target_oloc.pool); + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + /* pgid */ + ceph_encode_8(&p, 1); ceph_encode_64(&p, req->r_t.pgid.pool); ceph_encode_32(&p, req->r_t.pgid.seed); - put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ - memcpy(req->r_request_reassert_version, &req->r_reassert_version, - sizeof(req->r_reassert_version)); + ceph_encode_32(&p, -1); /* preferred */ - req->r_stamp = jiffies; - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + /* oid */ + ceph_encode_32(&p, req->r_t.target_oid.name_len); + memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); + p += req->r_t.target_oid.name_len; - ceph_msg_get(req->r_request); /* send consumes a ref */ + /* ops, can imply data */ + ceph_encode_16(&p, req->r_num_ops); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(p, &req->r_ops[i]); + p += sizeof(struct ceph_osd_op); + } - req->r_sent = req->r_osd->o_incarnation; + ceph_encode_64(&p, req->r_snapid); /* snapid */ + if (req->r_snapc) { + ceph_encode_64(&p, req->r_snapc->seq); + ceph_encode_32(&p, req->r_snapc->num_snaps); + for (i = 0; i < req->r_snapc->num_snaps; i++) + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } else { + ceph_encode_64(&p, 0); /* snap_seq */ + ceph_encode_32(&p, 0); /* snaps len */ + } + + ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + msg->hdr.data_len = cpu_to_le32(data_len); + /* + * The header "data_off" is a hint to the receiver allowing it + * to align received data into its buffers such that there's no + * need to re-copy it before writing it to disk (direct I/O). + */ + msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - ceph_con_send(&req->r_osd->o_con, req->r_request); + dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__, + req, req->r_t.target_oid.name_len, req->r_t.target_oid.name, + req->r_t.target_oid.name_len, msg->front.iov_len, data_len); +} + +/* + * @req has to be assigned a tid and registered. 
+ */ +static void send_request(struct ceph_osd_request *req) +{ + struct ceph_osd *osd = req->r_osd; + + WARN_ON(osd->o_osd != req->r_t.osd); + + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + if (req->r_attempts) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + else + WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); + + encode_request(req, req->r_request); + + dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, + req->r_t.osd, req->r_flags, req->r_attempts); + + req->r_t.paused = false; + req->r_stamp = jiffies; + req->r_attempts++; + + req->r_sent = osd->o_incarnation; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } /* @@ -1550,8 +1653,10 @@ static void __send_queued(struct ceph_osd_client *osdc) struct ceph_osd_request *req, *tmp; dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) - __send_request(osdc, req); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + send_request(req); + } } /* @@ -1915,8 +2020,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_result = bytes; /* in case this is a write and we need to replay, */ - req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); - req->r_reassert_version.version = cpu_to_le64(reassert_version); + req->r_replay_version.epoch = cpu_to_le32(reassert_epoch); + req->r_replay_version.version = cpu_to_le64(reassert_version); req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { @@ -2432,105 +2537,6 @@ bad: pr_err("osdc handle_watch_notify corrupt msg\n"); } -/* - * build new request AND message - * - */ -void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) -{ - struct ceph_msg *msg = req->r_request; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - unsigned int i; - - req->r_snapid = snap_id; - WARN_ON(snapc != req->r_snapc); - - /* encode request */ - msg->hdr.version = cpu_to_le16(4); - - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ - - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ - - /* oid */ - ceph_encode_32(&p, req->r_base_oid.name_len); - memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid %*pE len %d\n", req->r_base_oid.name_len, - req->r_base_oid.name, req->r_base_oid.name_len); - p += req->r_base_oid.name_len; - - /* ops--can imply data */ - ceph_encode_16(&p, (u16)req->r_num_ops); - data_len = 0; - for (i = 0; i < req->r_num_ops; i++) { - data_len += osd_req_encode_op(req, p, i); - p += sizeof(struct ceph_osd_op); - } - - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? 
req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < req->r_snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } - } - - req->r_request_attempts = p; - p += 4; - - /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) { - u16 data_off; - - /* - * The header "data_off" is a hint to the receiver - * allowing it to align received data into its - * buffers such that there's no need to re-copy - * it before writing it to disk (direct I/O). - */ - data_off = (u16) (off & 0xffff); - req->r_request->hdr.data_off = cpu_to_le16(data_off); - } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - - dout("build_request msg_size was %d\n", (int)msg_size); -} -EXPORT_SYMBOL(ceph_osdc_build_request); - /* * Register request, send initial attempt. */ @@ -2749,15 +2755,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - rc = ceph_osdc_start_request(osdc, req, false); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2783,7 +2786,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, @@ -2797,8 +2799,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); - + req->r_mtime = *mtime; rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); -- cgit v1.2.3-59-g8ed1b From 85e084feb47349d62989efe1713a8723af95f4ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: libceph: drop msg argument from ceph_osdc_callback_t finish_read(), its only user, uses it to get to hdr.data_len, which is what ->r_result is set to on success. This gains us the ability to safely call callbacks from contexts other than reply, e.g. map check. 
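
As a rough stand-alone illustration of the new callback shape (the
struct, the helper and all scaffolding below are invented stand-ins,
not libceph code):

    #include <stdio.h>

    /* toy stand-in for struct ceph_osd_request */
    struct osd_request {
            int r_result;                     /* <0 error, >=0 data_len */
            void (*r_callback)(struct osd_request *);  /* no msg argument */
    };

    /* finish_read-style user: everything it needs is on the request */
    static void finish_read(struct osd_request *req)
    {
            int rc = req->r_result <= 0 ? req->r_result : 0;
            int bytes = req->r_result >= 0 ? req->r_result : 0;

            printf("finish_read: rc=%d bytes=%d\n", rc, bytes);
    }

    /* a non-reply context (e.g. a map check) can now fail a request
     * without having an MOSDOpReply message in hand */
    static void complete_without_reply(struct osd_request *req, int err)
    {
            req->r_result = err;
            if (req->r_callback)
                    req->r_callback(req);
    }

    int main(void)
    {
            struct osd_request req = { .r_callback = finish_read };

            req.r_result = 4096;    /* success: r_result carries data_len */
            req.r_callback(&req);

            complete_without_reply(&req, -2);       /* e.g. -ENOENT */
            return 0;
    }
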
Signed-off-by: Ilya Dryomov
---
 drivers/block/rbd.c             | 5 ++---
 fs/ceph/addr.c                  | 9 ++++-----
 fs/ceph/file.c                  | 7 +++----
 include/linux/ceph/osd_client.h | 3 +--
 net/ceph/osd_client.c           | 4 ++--
 5 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0e598916e048..82b03aa509e6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1828,13 +1828,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 		obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 59b3c3fbd3bd..a11756a39471 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -598,8 +598,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
 */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 52e4b72dd5de..e75fd0b028e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -616,8 +616,7 @@ static void ceph_aio_complete(struct inode *inode,
 	kfree(aio_req);
 }
 
-static void ceph_aio_complete_req(struct ceph_osd_request *req,
-				  struct ceph_msg *msg)
+static void ceph_aio_complete_req(struct ceph_osd_request *req)
 {
 	int rc = req->r_result;
 	struct inode *inode = req->r_inode;
@@ -740,7 +739,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
 out:
 	if (ret < 0) {
 		req->r_result = ret;
-		ceph_aio_complete_req(req, NULL);
+		ceph_aio_complete_req(req);
 	}
 
 	ceph_put_snap_context(snapc);
@@ -961,7 +960,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 						  req, false);
 		if (ret < 0) {
 			req->r_result = ret;
-			ceph_aio_complete_req(req, NULL);
+			ceph_aio_complete_req(req);
 		}
 	}
 	return -EIOCBQUEUED;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 67a37d98e0ca..3bebd60e7f9f 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,8 +20,7 @@ struct ceph_osd_client;
 /*
  * completion callback for async writepages
  */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-				     struct ceph_msg *);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
 
 #define CEPH_HOMELESS_OSD	-1
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8a008f083283..2a30c0bb3045 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2048,7 +2048,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
 			req->r_unsafe_callback(req, true);
 		if (req->r_callback)
-			req->r_callback(req, msg);
+			req->r_callback(req);
 		else
 			complete_all(&req->r_completion);
 	}
@@ -2072,7 +2072,7 @@ bad_put:
 	req->r_result = -EIO;
 	__unregister_request(osdc, req);
 	if (req->r_callback)
-		req->r_callback(req, msg);
+		req->r_callback(req);
 	else
 		complete_all(&req->r_completion);
 	complete_request(req);
-- 
cgit v1.2.3-59-g8ed1b


From fe5da05e979830b43b115d8a18ead521d507c783 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Thu, 28 Apr 2016 16:07:24 +0200
Subject: libceph: redo callbacks and factor out MOSDOpReply decoding

If you specify ACK | ONDISK and set ->r_unsafe_callback, both
->r_callback and ->r_unsafe_callback(true) are called on ack.  This is
very confusing.  Redo this so that only one of them is called:

    ->r_unsafe_callback(true), on ack
    ->r_unsafe_callback(false), on commit

or

    ->r_callback, on ack|commit

Decode everything in decode_MOSDOpReply() to reduce clutter.
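
The resulting dispatch can be modelled with a small stand-alone
sketch; the flag value, struct layout and helpers below are invented
stand-ins (only the done_request() predicate mirrors the patch), so
treat this as an approximation rather than the kernel logic:

    #include <stdbool.h>
    #include <stdio.h>

    #define FLAG_ONDISK 0x20    /* stand-in for CEPH_OSD_FLAG_ONDISK */

    struct reply { int result; unsigned int flags; };
    struct request { unsigned int flags; bool has_unsafe_cb; bool acked; };

    /* mirrors done_request(): done if the reply is an error, is safe
     * (ONDISK), or we never asked for a safe reply in the first place */
    static bool done_request(const struct request *req, const struct reply *m)
    {
            return m->result < 0 ||
                   (m->flags & FLAG_ONDISK) ||
                   !(req->flags & FLAG_ONDISK);
    }

    static void dispatch(struct request *req, const struct reply *m)
    {
            if (!done_request(req, m)) {
                    puts("->r_unsafe_callback(true)");  /* ack, commit pending */
                    req->acked = true;
            } else if (req->acked && req->has_unsafe_cb) {
                    puts("->r_unsafe_callback(false)"); /* commit after ack */
            } else {
                    puts("->r_callback");               /* single final reply */
            }
    }

    int main(void)
    {
            struct request req = { .flags = FLAG_ONDISK, .has_unsafe_cb = true };
            struct reply ack = { 0, 0 };
            struct reply commit = { 0, FLAG_ONDISK };

            dispatch(&req, &ack);       /* unsafe callback on ack */
            dispatch(&req, &commit);    /* safe callback on commit */
            return 0;
    }

Exactly one of the three branches fires per reply, which is the point
of the rework.
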
Signed-off-by: Ilya Dryomov
---
 fs/ceph/addr.c                  |   3 +-
 fs/ceph/file.c                  |   2 +
 include/linux/ceph/osd_client.h |   5 +-
 net/ceph/osd_client.c           | 362 +++++++++++++++++++++++-----------------
 4 files changed, 215 insertions(+), 157 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a11756a39471..f47418477629 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1765,8 +1765,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
 	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
 	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e75fd0b028e9..30fd49eb25b4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -770,6 +770,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 		list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes);
 		spin_unlock(&ci->i_unsafe_lock);
+
+		complete_all(&req->r_completion);
 	} else {
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_item);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 3bebd60e7f9f..2415dc0cb008 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -162,13 +162,14 @@ struct ceph_osd_request {
 	unsigned int		r_num_ops;
 
 	int               r_result;
-	int               r_got_reply;
+	bool              r_got_reply;
 	int		  r_linger;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
 	bool              r_mempool;
-	struct completion r_completion, r_safe_completion;
+	struct completion r_completion;
+	struct completion r_safe_completion;  /* fsync waiter */
 	ceph_osdc_callback_t r_callback;
 	ceph_osdc_unsafe_callback_t r_unsafe_callback;
 	struct list_head  r_unsafe_item;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2a30c0bb3045..baf2844b00d6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1693,6 +1693,14 @@ static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
 	return 0;
 }
 
+static void __complete_request(struct ceph_osd_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(req);
+	else
+		complete_all(&req->r_completion);
+}
+
 /*
  * Timeout callback, called every N seconds when 1 or more osd
  * requests has been active for more than N seconds.  When this
@@ -1875,107 +1883,76 @@ e_inval:
 	goto out;
 }
 
-static void complete_request(struct ceph_osd_request *req)
-{
-	complete_all(&req->r_safe_completion);  /* fsync waiter */
-}
+struct MOSDOpReply {
+	struct ceph_pg pgid;
+	u64 flags;
+	int result;
+	u32 epoch;
+	int num_ops;
+	u32 outdata_len[CEPH_OSD_MAX_OPS];
+	s32 rval[CEPH_OSD_MAX_OPS];
+	int retry_attempt;
+	struct ceph_eversion replay_version;
+	u64 user_version;
+	struct ceph_request_redirect redirect;
+};
 
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
 {
-	void *p, *end;
-	struct ceph_osd_request *req;
-	struct ceph_request_redirect redir;
-	u64 tid;
-	int object_len;
-	unsigned int numops;
-	int payload_len, flags;
-	s32 result;
-	s32 retry_attempt;
-	struct ceph_pg pg;
-	int err;
-	u32 reassert_epoch;
-	u64 reassert_version;
-	u32 osdmap_epoch;
-	int already_completed;
-	u32 bytes;
+	void *p = msg->front.iov_base;
+	void *const end = p + msg->front.iov_len;
+	u16 version = le16_to_cpu(msg->hdr.version);
+	struct ceph_eversion bad_replay_version;
 	u8 decode_redir;
-	unsigned int i;
-
-	tid = le64_to_cpu(msg->hdr.tid);
-	dout("handle_reply %p tid %llu\n", msg, tid);
-
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
+	u32 len;
+	int ret;
+	int i;
 
-	ceph_decode_need(&p, end, 4, bad);
-	object_len = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, object_len, bad);
-	p += object_len;
+	ceph_decode_32_safe(&p, end, len, e_inval);
+	ceph_decode_need(&p, end, len, e_inval);
+	p += len; /* skip oid */
 
-	err = ceph_decode_pgid(&p, end, &pg);
-	if (err)
-		goto bad;
+	ret = ceph_decode_pgid(&p, end, &m->pgid);
+	if (ret)
		return ret;
 
-	ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad);
-	flags = ceph_decode_64(&p);
-	result = ceph_decode_32(&p);
-	reassert_epoch = ceph_decode_32(&p);
-	reassert_version = ceph_decode_64(&p);
-	osdmap_epoch = ceph_decode_32(&p);
+	ceph_decode_64_safe(&p, end, m->flags, e_inval);
+	ceph_decode_32_safe(&p, end, m->result, e_inval);
+	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
+	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
+	p += sizeof(bad_replay_version);
+	ceph_decode_32_safe(&p, end, m->epoch, e_inval);
 
-	/* lookup */
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	req = lookup_request(&osdc->requests, tid);
-	if (req == NULL) {
-		dout("handle_reply tid %llu dne\n", tid);
-		goto bad_mutex;
-	}
-	ceph_osdc_get_request(req);
+	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
+	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
+		goto e_inval;
 
-	dout("handle_reply %p tid %llu req %p result %d\n", msg, tid,
-	     req, result);
-
-	ceph_decode_need(&p, end, 4, bad_put);
-	numops = ceph_decode_32(&p);
-	if (numops > CEPH_OSD_MAX_OPS)
-		goto bad_put;
-	if (numops != req->r_num_ops)
-		goto bad_put;
-	payload_len = 0;
-	ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
-	for (i = 0; i < numops; i++) {
+	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
+			 e_inval);
+	for (i = 0; i < m->num_ops; i++) {
 		struct ceph_osd_op *op = p;
-		int len;
 
-		len = le32_to_cpu(op->payload_len);
-		req->r_ops[i].outdata_len = len;
-		dout(" op %d has %d bytes\n", i, len);
-		payload_len += len;
+		m->outdata_len[i] = le32_to_cpu(op->payload_len);
 		p += sizeof(*op);
 	}
-	bytes = le32_to_cpu(msg->hdr.data_len);
-	if (payload_len != bytes) {
-		pr_warn("sum of op payload lens %d != data_len %d\n",
-			payload_len, bytes);
-		goto bad_put;
-	}
 
-	ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
-	retry_attempt = ceph_decode_32(&p);
-	for (i = 0; i < numops; i++)
-		req->r_ops[i].rval = ceph_decode_32(&p);
+	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
+	for (i = 0; i < m->num_ops; i++)
+		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
 
-	if (le16_to_cpu(msg->hdr.version) >= 6) {
-		p += 8 + 4; /* skip replay_version */
-		p += 8; /* skip user_version */
+	if (version >= 5) {
+		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
+		memcpy(&m->replay_version, p, sizeof(m->replay_version));
+		p += sizeof(m->replay_version);
+		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
+	} else {
+		m->replay_version = bad_replay_version; /* struct */
+		m->user_version = le64_to_cpu(m->replay_version.version);
+	}
 
-		if (le16_to_cpu(msg->hdr.version) >= 7)
-			ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+	if (version >= 6) {
+		if (version >= 7)
+			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
 		else
 			decode_redir = 1;
 	} else {
@@ -1983,19 +1960,96 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	}
 
 	if (decode_redir) {
-		err = ceph_redirect_decode(&p, end, &redir);
-		if (err)
-			goto bad_put;
+		ret = ceph_redirect_decode(&p, end, &m->redirect);
+		if (ret)
+			return ret;
 	} else {
-		redir.oloc.pool = -1;
+		ceph_oloc_init(&m->redirect.oloc);
 	}
 
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+/*
+ * We are done with @req if
+ * - @m is a safe reply, or
+ * - @m is an unsafe reply and we didn't want a safe one
+ */
+static bool done_request(const struct ceph_osd_request *req,
+			 const struct MOSDOpReply *m)
+{
+	return (m->result < 0 ||
+		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
+		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * ->r_unsafe_callback is set?	yes			no
+ *
+ * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
+ * any or needed/got safe)	r_safe_completion	r_safe_completion
+ *
+ * first reply is unsafe	r_unsafe_cb(true)	(nothing)
+ *
+ * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
+ *				r_safe_completion	r_safe_completion
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	struct ceph_osd_request *req;
+	struct MOSDOpReply m;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+	u32 data_len = 0;
+	bool already_acked;
+	int ret;
+	int i;
+
+	dout("%s msg %p tid %llu\n", __func__, msg, tid);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	req = lookup_request(&osdc->requests, tid);
+	if (!req) {
+		dout("%s no tid %llu\n", __func__, tid);
+		goto out_unlock;
+	}
+	ceph_osdc_get_request(req);
+
+	ret = decode_MOSDOpReply(msg, &m);
+	if (ret) {
+		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
+		       req->r_tid, ret);
+		ceph_msg_dump(msg);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
+	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
+	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
+	     le64_to_cpu(m.replay_version.version), m.user_version);
+
+	if (m.retry_attempt >= 0) {
+		if (m.retry_attempt != req->r_attempts - 1) {
+			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
+			     req, req->r_tid, m.retry_attempt,
+			     req->r_attempts - 1);
+			goto out_put;
+		}
+	} else {
+		WARN_ON(1); /* MOSDOpReply v4 is assumed */
+	}
+
+	if (!ceph_oloc_empty(&m.redirect.oloc)) {
+		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
+		     m.redirect.oloc.pool);
 		__unregister_request(osdc, req);
 
-		ceph_oloc_copy(&req->r_t.target_oloc, &redir.oloc);
+		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
 
 		/*
		 * Start redirect requests with nofail=true.  If
@@ -2005,85 +2059,85 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		 * successfully.  In the future we might want to follow
 		 * original request's nofail setting here.
 		 */
-		err = __ceph_osdc_start_request(osdc, req, true);
-		BUG_ON(err);
+		ret = __ceph_osdc_start_request(osdc, req, true);
+		BUG_ON(ret);
 
-		goto out_unlock;
+		goto out_put;
 	}
 
-	already_completed = req->r_got_reply;
-	if (!req->r_got_reply) {
-		req->r_result = result;
-		dout("handle_reply result %d bytes %d\n", req->r_result,
-		     bytes);
-		if (req->r_result == 0)
-			req->r_result = bytes;
-
-		/* in case this is a write and we need to replay, */
-		req->r_replay_version.epoch = cpu_to_le32(reassert_epoch);
-		req->r_replay_version.version = cpu_to_le64(reassert_version);
-
-		req->r_got_reply = 1;
-	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-		dout("handle_reply tid %llu dup ack\n", tid);
-		goto out_unlock;
+	if (m.num_ops != req->r_num_ops) {
+		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
+		       req->r_num_ops, req->r_tid);
+		goto fail_request;
 	}
-
-	dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
-		__register_linger_request(osdc, req);
-
-	/* either this is a read, or we got the safe response */
-	if (result < 0 ||
-	    (flags & CEPH_OSD_FLAG_ONDISK) ||
-	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+	for (i = 0; i < req->r_num_ops; i++) {
+		dout(" req %p tid %llu op %d rval %d len %u\n", req,
+		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
+		req->r_ops[i].rval = m.rval[i];
+		req->r_ops[i].outdata_len = m.outdata_len[i];
+		data_len += m.outdata_len[i];
+	}
+	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
+		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
+		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
+		goto fail_request;
+	}
+	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
+	     req, req->r_tid, req->r_got_reply, m.result, data_len);
+
+	already_acked = req->r_got_reply;
+	if (!already_acked) {
+		req->r_result = m.result ?: data_len;
+		req->r_replay_version = m.replay_version; /* struct */
+		req->r_got_reply = true;
+	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
+		dout("req %p tid %llu dup ack\n", req, req->r_tid);
+		goto out_put;
+	}
+
+	if (done_request(req, &m)) {
 		__unregister_request(osdc, req);
+		if (req->r_linger) {
+			WARN_ON(req->r_unsafe_callback);
+			__register_linger_request(osdc, req);
+		}
+	}
 
 	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
 
-	if (!already_completed) {
-		if (req->r_unsafe_callback &&
-		    result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK))
-			req->r_unsafe_callback(req, true);
-		if (req->r_callback)
-			req->r_callback(req);
-		else
-			complete_all(&req->r_completion);
-	}
-
-	if (flags & CEPH_OSD_FLAG_ONDISK) {
-		if (req->r_unsafe_callback && already_completed)
+	if (done_request(req, &m)) {
+		if (already_acked && req->r_unsafe_callback) {
+			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
 			req->r_unsafe_callback(req, false);
-		complete_request(req);
+		} else {
+			dout("req %p tid %llu cb\n", req, req->r_tid);
+			__complete_request(req);
+		}
+	} else {
+		if (req->r_unsafe_callback) {
+			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
+			req->r_unsafe_callback(req, true);
+		} else {
+			WARN_ON(1);
+		}
 	}
+	if (m.flags & CEPH_OSD_FLAG_ONDISK)
+		complete_all(&req->r_safe_completion);
 
-out:
-	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
 	ceph_osdc_put_request(req);
 	return;
-out_unlock:
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
-	goto out;
 
-bad_put:
+fail_request:
 	req->r_result = -EIO;
 	__unregister_request(osdc, req);
-	if (req->r_callback)
-		req->r_callback(req);
-	else
-		complete_all(&req->r_completion);
-	complete_request(req);
+	__complete_request(req);
+	complete_all(&req->r_safe_completion);
+out_put:
 	ceph_osdc_put_request(req);
-bad_mutex:
+out_unlock:
 	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
-bad:
-	pr_err("corrupt osd_op_reply got %d %d\n",
-	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
-	ceph_msg_dump(msg);
 }
 
 static void reset_changed_osds(struct ceph_osd_client *osdc)
@@ -2591,7 +2645,9 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 	if (rc < 0) {
 		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
 		ceph_osdc_cancel_request(req);
-		complete_request(req);
+
+		/* kludge - need to to wake ceph_osdc_sync() */
+		complete_all(&req->r_safe_completion);
 		return rc;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5aea3dcd50215fa9563270251ad7323e2f2490ee Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Thu, 28 Apr 2016 16:07:26 +0200
Subject: libceph: a major OSD client update

This is a major sync up, up to ~Jewel.  The highlights are:

- per-session request trees (vs a global per-client tree)
- per-session locking (vs a global per-client rwlock)
- homeless OSD session
- no ad-hoc global per-client lists
- support for pool quotas
- foundation for watch/notify v2 support
- foundation for map check (pool deletion detection) support

The switchover is incomplete: lingering requests can be set up and
torn down but aren't ever reestablished.  This functionality is
restored with the introduction of the new lingering infrastructure
(ceph_osd_linger_request, linger_work, etc) in a later commit.

Signed-off-by: Ilya Dryomov
---
 fs/ceph/ioctl.c                 |    8 +-
 fs/ceph/xattr.c                 |    8 +-
 include/linux/ceph/osd_client.h |   18 +-
 net/ceph/debugfs.c              |   34 +-
 net/ceph/osd_client.c           | 1164 +++++++++++++++++++--------------------
 5 files changed, 602 insertions(+), 630 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 1831ad6cf066..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	if (copy_from_user(&dl, arg, sizeof(dl)))
 		return -EFAULT;
 
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0) {
-		up_read(&osdc->map_sem);
+		up_read(&osdc->lock);
 		return -EIO;
 	}
 	dl.file_offset -= dl.object_offset;
@@ -217,7 +217,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
 	if (r < 0) {
-		up_read(&osdc->map_sem);
+		up_read(&osdc->lock);
 		return r;
 	}
 
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	} else {
 		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
 	}
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 
 	/* send result back to user */
 	if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9410abdef3ce..5afabc4bf4c7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -75,7 +75,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	char buf[128];
 
 	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
 	if (pool_name) {
 		size_t len = strlen(pool_name);
@@ -107,7 +107,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 			ret = -ERANGE;
 		}
 	}
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 	return ret;
 }
 
@@ -141,13 +141,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
 	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
 	const char *pool_name;
 
-	down_read(&osdc->map_sem);
+	down_read(&osdc->lock);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
 	if (pool_name)
 		ret = snprintf(val, size, "%s", pool_name);
 	else
 		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
-	up_read(&osdc->map_sem);
+	up_read(&osdc->lock);
 	return ret;
 }
 
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 486d681694c4..342f22f1f040 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -33,12 +33,13 @@ struct ceph_osd {
 	int o_incarnation;
 	struct rb_node o_node;
 	struct ceph_connection o_con;
-	struct list_head o_requests;
+	struct rb_root o_requests;
 	struct list_head o_linger_requests;
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
 	struct list_head o_keepalive_item;
+	struct mutex lock;
 };
 
 #define CEPH_OSD_SLAB_OPS	2
@@ -144,8 +145,6 @@ struct ceph_osd_request_target {
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
 	struct rb_node  r_node;
-	struct list_head r_req_lru_item;
-	struct list_head r_osd_item;
 	struct list_head r_linger_item;
 	struct list_head r_linger_osd_item;
 	struct ceph_osd *r_osd;
@@ -219,19 +218,16 @@ struct ceph_osd_client {
 	struct ceph_client     *client;
 
 	struct ceph_osdmap     *osdmap;       /* current map */
-	struct rw_semaphore    map_sem;
+	struct rw_semaphore    lock;
 
-	struct mutex           request_mutex;
 	struct rb_root         osds;          /* osds */
 	struct list_head       osd_lru;       /* idle osds */
 	spinlock_t             osd_lru_lock;
-	u64                    last_tid;      /* tid of last request */
-	struct rb_root         requests;      /* pending requests */
-	struct list_head       req_lru;       /* in-flight lru */
-	struct list_head       req_unsent;    /* unsent/need-resend queue */
-	struct list_head       req_notarget;  /* map to no osd */
 	struct list_head       req_linger;    /* lingering requests */
-	int                    num_requests;
+	struct ceph_osd        homeless_osd;
+	atomic64_t             last_tid;      /* tid of last request */
+	atomic_t               num_requests;
+	atomic_t               num_homeless;
 	struct delayed_work    timeout_work;
 	struct delayed_work    osds_timeout_work;
 #ifdef CONFIG_DEBUG_FS
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 6d3ff713edeb..61dbd9de4650 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -182,21 +182,39 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
 	seq_putc(s, '\n');
 }
 
+static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		dump_request(s, req);
+	}
+
+	mutex_unlock(&osd->lock);
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
 	struct ceph_osd_client *osdc = &client->osdc;
-	struct rb_node *p;
-
-	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-		struct ceph_osd_request *req;
+	struct rb_node *n;
 
-		req = rb_entry(p, struct ceph_osd_request, r_node);
+	down_read(&osdc->lock);
+	seq_printf(s, "REQUESTS %d homeless %d\n",
+		   atomic_read(&osdc->num_requests),
+		   atomic_read(&osdc->num_homeless));
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 
-		dump_request(s, req);
+		dump_requests(s, osd);
 	}
-	mutex_unlock(&osdc->request_mutex);
+	dump_requests(s, &osdc->homeless_osd);
+
+	up_read(&osdc->lock);
 	return 0;
 }
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d1c8e06f1261..4c856c87b1a9 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -25,16 +25,6 @@ static struct kmem_cache *ceph_osd_request_cache;
 
 static const struct ceph_connection_operations osd_con_ops;
 
-static void __send_queued(struct ceph_osd_client *osdc);
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req);
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req);
-static void __unregister_linger_request(struct ceph_osd_client *osdc,
-					struct ceph_osd_request *req);
-static void __enqueue_request(struct ceph_osd_request *req);
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -53,6 +43,43 @@ static void __enqueue_request(struct ceph_osd_request *req);
  * channel with an OSD is reset.
 */
 
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
+
+#if 1
+static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
+{
+	bool wrlocked = true;
+
+	if (unlikely(down_read_trylock(sem))) {
+		wrlocked = false;
+		up_read(sem);
+	}
+
+	return wrlocked;
+}
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_locked(&osdc->lock));
+}
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
+{
+	WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
+}
+static inline void verify_osd_locked(struct ceph_osd *osd)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+
+	WARN_ON(!(mutex_is_locked(&osd->lock) &&
+		  rwsem_is_locked(&osdc->lock)) &&
+		!rwsem_is_wrlocked(&osdc->lock));
+}
+#else
+static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
+static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
+static inline void verify_osd_locked(struct ceph_osd *osd) { }
+#endif
+
 /*
  * calculate the mapping of a file extent onto an object, and fill out the
  * request accordingly.  shorten extent as necessary if it crosses an
@@ -336,18 +363,14 @@ static void ceph_osdc_release_request(struct kref *kref)
 	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
 	     req->r_request, req->r_reply);
 	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
-	WARN_ON(!list_empty(&req->r_req_lru_item));
-	WARN_ON(!list_empty(&req->r_osd_item));
 	WARN_ON(!list_empty(&req->r_linger_item));
 	WARN_ON(!list_empty(&req->r_linger_osd_item));
 	WARN_ON(req->r_osd);
 
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
-	if (req->r_reply) {
-		ceph_msg_revoke_incoming(req->r_reply);
+	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	}
 
 	for (which = 0; which < req->r_num_ops; which++)
 		osd_req_op_data_release(req, which);
@@ -418,8 +441,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd_item);
-	INIT_LIST_HEAD(&req->r_req_lru_item);
-	INIT_LIST_HEAD(&req->r_osd_item);
 
 	target_init(&req->r_t);
 
@@ -869,141 +890,11 @@ static bool osd_homeless(struct ceph_osd *osd)
 	return osd->o_osd == CEPH_HOMELESS_OSD;
 }
 
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-		    u64 tid)
-{
-	struct ceph_osd_request *req;
-	struct rb_node *n = osdc->requests.rb_node;
-
-	while (n) {
-		req = rb_entry(n, struct ceph_osd_request, r_node);
-		if (tid < req->r_tid) {
-			if (!n->rb_left)
-				return req;
-			n = n->rb_left;
-		} else if (tid > req->r_tid) {
-			n = n->rb_right;
-		} else {
-			return req;
-		}
-	}
-	return NULL;
-}
-
-static void __kick_linger_request(struct ceph_osd_request *req)
-{
-	struct ceph_osd_client *osdc = req->r_osdc;
-	struct ceph_osd *osd = req->r_osd;
-
-	/*
-	 * Linger requests need to be resent with a new tid to avoid
-	 * the dup op detection logic on the OSDs.  Achieve this with
-	 * a re-register dance instead of open-coding.
-	 */
-	ceph_osdc_get_request(req);
-	if (!list_empty(&req->r_linger_item))
-		__unregister_linger_request(osdc, req);
-	else
-		__unregister_request(osdc, req);
-	__register_request(osdc, req);
-	ceph_osdc_put_request(req);
-
-	/*
-	 * Unless request has been registered as both normal and
-	 * lingering, __unregister{,_linger}_request clears r_osd.
-	 * However, here we need to preserve r_osd to make sure we
-	 * requeue on the same OSD.
-	 */
-	WARN_ON(req->r_osd || !osd);
-	req->r_osd = osd;
-
-	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
-	__enqueue_request(req);
-}
-
-/*
- * Resubmit requests pending on the given osd.
- */
-static void __kick_osd_requests(struct ceph_osd_client *osdc,
-				struct ceph_osd *osd)
-{
-	struct ceph_osd_request *req, *nreq;
-	LIST_HEAD(resend);
-	LIST_HEAD(resend_linger);
-	int err;
-
-	dout("%s osd%d\n", __func__, osd->o_osd);
-	err = __reset_osd(osdc, osd);
-	if (err)
-		return;
-
-	/*
-	 * Build up a list of requests to resend by traversing the
-	 * osd's list of requests.  Requests for a given object are
-	 * sent in tid order, and that is also the order they're
-	 * kept on this list.  Therefore all requests that are in
-	 * flight will be found first, followed by all requests that
-	 * have not yet been sent.  And to resend requests while
-	 * preserving this order we will want to put any sent
-	 * requests back on the front of the osd client's unsent
-	 * list.
-	 *
-	 * So we build a separate ordered list of already-sent
-	 * requests for the affected osd and splice it onto the
-	 * front of the osd client's unsent list.  Once we've seen a
-	 * request that has not yet been sent we're done.  Those
-	 * requests are already sitting right where they belong.
-	 */
-	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-		if (!req->r_sent)
-			break;
-
-		if (!req->r_linger) {
-			dout("%s requeueing %p tid %llu\n", __func__, req,
-			     req->r_tid);
-			list_move_tail(&req->r_req_lru_item, &resend);
-			req->r_flags |= CEPH_OSD_FLAG_RETRY;
-		} else {
-			list_move_tail(&req->r_req_lru_item, &resend_linger);
-		}
-	}
-	list_splice(&resend, &osdc->req_unsent);
-
-	/*
-	 * Both registered and not yet registered linger requests are
-	 * enqueued with a new tid on the same OSD.  We add/move them
-	 * to req_unsent/o_requests at the end to keep things in tid
-	 * order.
-	 */
-	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
-				 r_linger_osd_item) {
-		WARN_ON(!list_empty(&req->r_req_lru_item));
-		__kick_linger_request(req);
-	}
-
-	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
-		__kick_linger_request(req);
-}
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
+static bool osd_registered(struct ceph_osd *osd)
 {
-	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc;
+	verify_osdc_locked(osd->o_osdc);
 
-	if (!osd)
-		return;
-	dout("osd_reset osd%d\n", osd->o_osd);
-	osdc = osd->o_osdc;
-	down_read(&osdc->map_sem);
-	mutex_lock(&osdc->request_mutex);
-	__kick_osd_requests(osdc, osd);
-	__send_queued(osdc);
-	mutex_unlock(&osdc->request_mutex);
-	up_read(&osdc->map_sem);
+	return !RB_EMPTY_NODE(&osd->o_node);
 }
 
 /*
@@ -1013,17 +904,18 @@ static void osd_init(struct ceph_osd *osd)
 {
 	atomic_set(&osd->o_ref, 1);
 	RB_CLEAR_NODE(&osd->o_node);
-	INIT_LIST_HEAD(&osd->o_requests);
+	osd->o_requests = RB_ROOT;
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	osd->o_incarnation = 1;
+	mutex_init(&osd->lock);
 }
 
 static void osd_cleanup(struct ceph_osd *osd)
 {
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
-	WARN_ON(!list_empty(&osd->o_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
 	WARN_ON(!list_empty(&osd->o_linger_requests));
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
@@ -1077,30 +969,6 @@ static void put_osd(struct ceph_osd *osd)
 
 DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
 
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-	WARN_ON(!list_empty(&osd->o_requests));
-	WARN_ON(!list_empty(&osd->o_linger_requests));
-
-	list_del_init(&osd->o_osd_lru);
-	erase_osd(&osdc->osds, osd);
-}
-
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
-
-	if (!RB_EMPTY_NODE(&osd->o_node)) {
-		ceph_con_close(&osd->o_con);
-		__remove_osd(osdc, osd);
-		put_osd(osd);
-	}
-}
-
 static void __move_osd_to_lru(struct ceph_osd *osd)
 {
 	struct ceph_osd_client *osdc = osd->o_osdc;
@@ -1117,7 +985,7 @@ static void __move_osd_to_lru(struct ceph_osd *osd)
 
 static void maybe_move_osd_to_lru(struct ceph_osd *osd)
 {
-	if (list_empty(&osd->o_requests) &&
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests))
 		__move_osd_to_lru(osd);
 }
@@ -1134,30 +1002,64 @@ static void __remove_osd_from_lru(struct ceph_osd *osd)
 	spin_unlock(&osdc->osd_lru_lock);
 }
 
+/*
+ * Close the connection and assign any leftover requests to the
+ * homeless session.
+ */
+static void close_osd(struct ceph_osd *osd)
+{
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct rb_node *n;
+
+	verify_osdc_wrlocked(osdc);
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	ceph_con_close(&osd->o_con);
+
+	for (n = rb_first(&osd->o_requests); n; ) {
+		struct ceph_osd_request *req =
+		    rb_entry(n, struct ceph_osd_request, r_node);
+
+		n = rb_next(n); /* unlink_request() */
+
+		dout(" reassigning req %p tid %llu\n", req, req->r_tid);
+		unlink_request(osd, req);
+		link_request(&osdc->homeless_osd, req);
+	}
+
+	__remove_osd_from_lru(osd);
+	erase_osd(&osdc->osds, osd);
+	put_osd(osd);
+}
+
 /*
  * reset osd connect
 */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int reopen_osd(struct ceph_osd *osd)
 {
 	struct ceph_entity_addr *peer_addr;
 
-	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-	if (list_empty(&osd->o_requests) &&
+	dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
+
+	if (RB_EMPTY_ROOT(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
-		remove_osd(osdc, osd);
+		close_osd(osd);
 		return -ENODEV;
 	}
 
-	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
+	peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
 	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
 	    !ceph_con_opened(&osd->o_con)) {
-		struct ceph_osd_request *req;
+		struct rb_node *n;
 
 		dout("osd addr hasn't changed and connection never opened, "
 		     "letting msgr retry\n");
 		/* touch each r_stamp for handle_timeout()'s benfit */
-		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+		for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
+			struct ceph_osd_request *req =
+			    rb_entry(n, struct ceph_osd_request, r_node);
 			req->r_stamp = jiffies;
+		}
 
 		return -EAGAIN;
 	}
@@ -1169,73 +1071,84 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	return 0;
 }
 
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void __register_request(struct ceph_osd_client *osdc,
-			       struct ceph_osd_request *req)
-{
-	req->r_tid = ++osdc->last_tid;
-	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-	dout("__register_request %p tid %lld\n", req, req->r_tid);
-	insert_request(&osdc->requests, req);
-	ceph_osdc_get_request(req);
-	osdc->num_requests++;
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-				 struct ceph_osd_request *req)
+static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
					  bool wrlocked)
 {
-	if (RB_EMPTY_NODE(&req->r_node)) {
-		dout("__unregister_request %p tid %lld not registered\n",
-		     req, req->r_tid);
-		return;
-	}
+	struct ceph_osd *osd;
 
-	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	erase_request(&osdc->requests, req);
-	osdc->num_requests--;
+	if (wrlocked)
+		verify_osdc_wrlocked(osdc);
+	else
+		verify_osdc_locked(osdc);
 
-	if (req->r_osd) {
-		/* make sure the original request isn't in flight. */
-		ceph_msg_revoke(req->r_request);
+	if (o != CEPH_HOMELESS_OSD)
+		osd = lookup_osd(&osdc->osds, o);
+	else
+		osd = &osdc->homeless_osd;
+	if (!osd) {
+		if (!wrlocked)
+			return ERR_PTR(-EAGAIN);
 
-		list_del_init(&req->r_osd_item);
-		maybe_move_osd_to_lru(req->r_osd);
-		if (list_empty(&req->r_linger_osd_item))
-			req->r_osd = NULL;
+		osd = create_osd(osdc, o);
+		insert_osd(&osdc->osds, osd);
+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+			      &osdc->osdmap->osd_addr[osd->o_osd]);
 	}
 
-	list_del_init(&req->r_req_lru_item);
-	ceph_osdc_put_request(req);
+	dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
+	return osd;
 }
 
 /*
- * Cancel a previously queued request message
+ * Create request <-> OSD session relation.
+ *
+ * @req has to be assigned a tid, @osd may be homeless.
 */
-static void __cancel_request(struct ceph_osd_request *req)
+static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
 {
-	if (req->r_sent && req->r_osd) {
-		ceph_msg_revoke(req->r_request);
-		req->r_sent = 0;
-	}
+	verify_osd_locked(osd);
+	WARN_ON(!req->r_tid || req->r_osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	if (!osd_homeless(osd))
+		__remove_osd_from_lru(osd);
+	else
+		atomic_inc(&osd->o_osdc->num_homeless);
+
+	get_osd(osd);
+	insert_request(&osd->o_requests, req);
+	req->r_osd = osd;
 }
 
-static void __register_linger_request(struct ceph_osd_client *osdc,
+static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
+{
+	verify_osd_locked(osd);
+	WARN_ON(req->r_osd != osd);
+	dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
+	     req, req->r_tid);
+
+	req->r_osd = NULL;
+	erase_request(&osd->o_requests, req);
+	put_osd(osd);
+
+	if (!osd_homeless(osd))
+		maybe_move_osd_to_lru(osd);
+	else
+		atomic_dec(&osd->o_osdc->num_homeless);
+}
+
+static void __register_linger_request(struct ceph_osd *osd,
 				      struct ceph_osd_request *req)
 {
 	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
 	WARN_ON(!req->r_linger);
 
 	ceph_osdc_get_request(req);
-	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	if (req->r_osd)
-		list_add_tail(&req->r_linger_osd_item,
-			      &req->r_osd->o_linger_requests);
+	list_add_tail(&req->r_linger_item, &osd->o_osdc->req_linger);
+	list_add_tail(&req->r_linger_osd_item, &osd->o_linger_requests);
+	__remove_osd_from_lru(osd);
+	req->r_osd = osd;
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -1255,7 +1168,7 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 	if (req->r_osd) {
 		list_del_init(&req->r_linger_osd_item);
 		maybe_move_osd_to_lru(req->r_osd);
-		if (list_empty(&req->r_osd_item))
+		if (RB_EMPTY_ROOT(&req->r_osd->o_requests))
 			req->r_osd = NULL;
 	}
 	ceph_osdc_put_request(req);
@@ -1291,11 +1204,20 @@ static bool have_pool_full(struct ceph_osd_client *osdc)
 	return false;
 }
 
+static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
+{
+	struct ceph_pg_pool_info *pi;
+
+	pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
+	if (!pi)
+		return false;
+
+	return __pool_full(pi);
+}
+
 /*
  * Returns whether a request should be blocked from being sent
  * based on the current osdmap and osd_client settings.
- *
- * Caller should hold map_sem for read.
*/ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, @@ -1421,87 +1343,6 @@ out: return ct_res; } -static void __enqueue_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - - dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); - } else { - list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); - } -} - -/* - * Pick an osd (the first 'up' osd in the pg), allocate the osd struct - * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. Move the request to the appropriate list - * (unsent, homeless) or leave on in-flight lru. - * - * Return 0 if unchanged, 1 if changed, or negative on error. - * - * Caller should hold map_sem for read and request_mutex. - */ -static int __map_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, int force_resend) -{ - enum calc_target_result ct_res; - int err; - - dout("map_request %p tid %lld\n", req, req->r_tid); - - ct_res = calc_target(osdc, &req->r_t, NULL, force_resend); - switch (ct_res) { - case CALC_TARGET_POOL_DNE: - list_move(&req->r_req_lru_item, &osdc->req_notarget); - return -EIO; - case CALC_TARGET_NO_ACTION: - return 0; /* no change */ - default: - BUG_ON(ct_res != CALC_TARGET_NEED_RESEND); - } - - dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd, - req->r_osd ? req->r_osd->o_osd : -1); - - if (req->r_osd) { - __cancel_request(req); - list_del_init(&req->r_osd_item); - list_del_init(&req->r_linger_osd_item); - req->r_osd = NULL; - } - - req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd); - if (!req->r_osd && req->r_t.osd >= 0) { - err = -ENOMEM; - req->r_osd = create_osd(osdc, req->r_t.osd); - if (!req->r_osd) { - list_move(&req->r_req_lru_item, &osdc->req_notarget); - goto out; - } - - dout("map_request osd %p is osd%d\n", req->r_osd, - req->r_osd->o_osd); - insert_osd(&osdc->osds, req->r_osd); - - ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd, - &osdc->osdmap->osd_addr[req->r_osd->o_osd]); - } - - __enqueue_request(req); - err = 1; /* osd or pg changed */ - -out: - return err; -} - static void setup_request_data(struct ceph_osd_request *req, struct ceph_msg *msg) { @@ -1648,8 +1489,16 @@ static void send_request(struct ceph_osd_request *req) { struct ceph_osd *osd = req->r_osd; + verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); + /* + * We may have a previously queued request message hanging + * around. Cancel it to avoid corrupting the msgr. + */ + if (req->r_sent) + ceph_msg_revoke(req->r_request); + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; if (req->r_attempts) req->r_flags |= CEPH_OSD_FLAG_RETRY; @@ -1671,24 +1520,11 @@ static void send_request(struct ceph_osd_request *req) ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); } -/* - * Send any requests in the queue (req_unsent). 
- */ -static void __send_queued(struct ceph_osd_client *osdc) -{ - struct ceph_osd_request *req, *tmp; - - dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); - send_request(req); - } -} - static void maybe_request_map(struct ceph_osd_client *osdc) { bool continuous = false; + verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || @@ -1705,38 +1541,121 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } -/* - * Caller should hold map_sem for read and request_mutex. - */ -static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { - int rc; + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd; + bool need_send = false; + bool promoted = false; - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - return rc; + WARN_ON(req->r_tid || req->r_got_reply); + dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); + +again: + calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); + if (IS_ERR(osd)) { + WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); + goto promote; } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + dout("req %p pausewr\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + dout("req %p pauserd\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | + CEPH_OSD_FLAG_FULL_FORCE)) && + (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.base_oloc.pool))) { + dout("req %p full/pool_full\n", req); + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if (!osd_homeless(osd)) { + need_send = true; } else { - __send_queued(osdc); + maybe_request_map(osdc); } - return 0; + mutex_lock(&osd->lock); + /* + * Assign the tid atomically with send_request() to protect + * multiple writes to the same object from racing with each + * other, resulting in out of order ops on the OSDs. 
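The comment above deserves a concrete interleaving. If the tid were taken outside osd->lock, two writes to the same object could be queued out of order; this trace is illustrative only, not code from the patch:

    /*
     *   CPU0 (write A)                    CPU1 (write B)
     *   r_tid = 41  <- atomic64_inc
     *                                     r_tid = 42  <- atomic64_inc
     *                                     ceph_con_send(msg for tid 42)
     *   ceph_con_send(msg for tid 41)
     *
     * The OSD sees tid 42 before tid 41 and may apply B before A.
     * Holding osd->lock across both the tid assignment and
     * send_request() makes the pair atomic per session.
     */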
+ */ + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(osd, req); + if (need_send) + send_request(req); + mutex_unlock(&osd->lock); + + if (promoted) + downgrade_write(&osdc->lock); + return; + +promote: + up_read(&osdc->lock); + down_write(&osdc->lock); + wrlocked = true; + promoted = true; + goto again; +} + +static void account_request(struct ceph_osd_request *req) +{ + unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + WARN_ON(req->r_flags & mask); + req->r_flags |= CEPH_OSD_FLAG_ACK; + } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) + WARN_ON(!(req->r_flags & mask)); + else + WARN_ON(1); + + WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + atomic_inc(&req->r_osdc->num_requests); +} + +static void submit_request(struct ceph_osd_request *req, bool wrlocked) +{ + ceph_osdc_get_request(req); + account_request(req); + __submit_request(req, wrlocked); +} + +static void __finish_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; + + verify_osd_locked(osd); + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + unlink_request(osd, req); + atomic_dec(&osdc->num_requests); + + /* + * If an OSD has failed or returned and a request has been sent + * twice, it's possible to get a reply and end up here while the + * request message is queued for delivery. We will ignore the + * reply, so not a big deal, but better to try and catch it. + */ + ceph_msg_revoke(req->r_request); + ceph_msg_revoke_incoming(req->r_reply); +} + +static void finish_request(struct ceph_osd_request *req) +{ + __finish_request(req); + ceph_osdc_put_request(req); } static void __complete_request(struct ceph_osd_request *req) @@ -1747,6 +1666,13 @@ static void __complete_request(struct ceph_osd_request *req) complete_all(&req->r_completion); } +static void cancel_request(struct ceph_osd_request *req) +{ + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + finish_request(req); +} + /* * Timeout callback, called every N seconds. When 1 or more OSD * requests has been active for more than N seconds, we send a keepalive @@ -1758,44 +1684,49 @@ static void handle_timeout(struct work_struct *work) struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_options *opts = osdc->client->options; - struct ceph_osd_request *req; - struct ceph_osd *osd; - struct list_head slow_osds; - dout("timeout\n"); - down_read(&osdc->map_sem); - - ceph_monc_request_next_osdmap(&osdc->client->monc); + unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; + LIST_HEAD(slow_osds); + struct rb_node *n, *p; - mutex_lock(&osdc->request_mutex); + dout("%s osdc %p\n", __func__, osdc); + down_write(&osdc->lock); /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). 
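One building block of the rewritten timeout scan below is the cutoff comparison: a request is laggy when its r_stamp is older than jiffies minus the keepalive timeout. time_before() is the standard wraparound-safe jiffies helper; an equivalent illustrative definition (not the kernel's exact macro) is:

    /* true when a is chronologically before b; the subtraction is
     * evaluated as signed, so it stays correct across jiffies wraparound */
    static inline bool illustrative_time_before(unsigned long a,
                                                unsigned long b)
    {
            return (long)(a - b) < 0;
    }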
*/ - INIT_LIST_HEAD(&slow_osds); - list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, - req->r_stamp + opts->osd_keepalive_timeout)) - break; + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + bool found = false; + + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (time_before(req->r_stamp, cutoff)) { + dout(" req %p tid %llu on osd%d is laggy\n", + req, req->r_tid, osd->o_osd); + found = true; + } + } - osd = req->r_osd; - BUG_ON(!osd); - dout(" tid %llu is slow, will send keepalive on osd%d\n", - req->r_tid, osd->o_osd); - list_move_tail(&osd->o_keepalive_item, &slow_osds); + if (found) + list_move_tail(&osd->o_keepalive_item, &slow_osds); } + + if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) + maybe_request_map(osdc); + while (!list_empty(&slow_osds)) { - osd = list_entry(slow_osds.next, struct ceph_osd, - o_keepalive_item); + struct ceph_osd *osd = list_first_entry(&slow_osds, + struct ceph_osd, + o_keepalive_item); list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - + up_write(&osdc->lock); schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); } @@ -1809,18 +1740,17 @@ static void handle_osds_timeout(struct work_struct *work) struct ceph_osd *osd, *nosd; dout("%s osdc %p\n", __func__, osdc); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - + down_write(&osdc->lock); list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; - remove_osd(osdc, osd); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } @@ -2045,8 +1975,9 @@ static bool done_request(const struct ceph_osd_request *req, * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, * r_safe_completion r_safe_completion */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) { + struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_osd_request *req; struct MOSDOpReply m; u64 tid = le64_to_cpu(msg->hdr.tid); @@ -2057,14 +1988,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("%s msg %p tid %llu\n", __func__, msg, tid); - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { - dout("%s no tid %llu\n", __func__, tid); - goto out_unlock; + dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); + goto out_unlock_session; } - ceph_osdc_get_request(req); ret = decode_MOSDOpReply(msg, &m); if (ret) { @@ -2083,7 +2019,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", req, req->r_tid, m.retry_attempt, 
req->r_attempts - 1); - goto out_put; + goto out_unlock_session; } } else { WARN_ON(1); /* MOSDOpReply v4 is assumed */ @@ -2092,22 +2028,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (!ceph_oloc_empty(&m.redirect.oloc)) { dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, m.redirect.oloc.pool); - __unregister_request(osdc, req); + unlink_request(osd, req); + mutex_unlock(&osd->lock); ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); - - /* - * Start redirect requests with nofail=true. If - * mapping fails, request will end up on the notarget - * list, waiting for the new osdmap (which can take - * a while), even though the original request mapped - * successfully. In the future we might want to follow - * original request's nofail setting here. - */ - ret = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(ret); - - goto out_put; + req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; } if (m.num_ops != req->r_num_ops) { @@ -2137,19 +2065,19 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) req->r_got_reply = true; } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { dout("req %p tid %llu dup ack\n", req, req->r_tid); - goto out_put; + goto out_unlock_session; } if (done_request(req, &m)) { - __unregister_request(osdc, req); + __finish_request(req); if (req->r_linger) { WARN_ON(req->r_unsafe_callback); - __register_linger_request(osdc, req); + __register_linger_request(osd, req); } } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); if (done_request(req, &m)) { if (already_acked && req->r_unsafe_callback) { @@ -2175,14 +2103,13 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) fail_request: req->r_result = -EIO; - __unregister_request(osdc, req); + __finish_request(req); __complete_request(req); complete_all(&req->r_safe_completion); -out_put: - ceph_osdc_put_request(req); -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); } static void set_pool_was_full(struct ceph_osd_client *osdc) @@ -2197,126 +2124,66 @@ static void set_pool_was_full(struct ceph_osd_client *osdc) } } -static void reset_changed_osds(struct ceph_osd_client *osdc) +static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) { - struct rb_node *p, *n; + struct ceph_pg_pool_info *pi; - dout("%s %p\n", __func__, osdc); - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } + return pi->was_full && !__pool_full(pi); } /* - * Requeue requests whose mapping to an OSD has changed. If requests map to - * no osd, request a new map. - * - * Caller should hold map_sem for read. + * Requeue requests whose mapping to an OSD has changed. 
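A small but load-bearing detail in the scan_requests()/kick_requests() pair below: the staging set is an rb-tree rather than a list. Requests are keyed by tid, so draining the need_resend tree in the second phase resends them in original submission order. Assuming the same DEFINE_RB_FUNCS generator used for OSDs in this file, the request helpers would come from an invocation of this shape:

    /* generates insert_request(), lookup_request() and erase_request(),
     * ordered by r_tid (assumed; mirrors DEFINE_RB_FUNCS(osd, ...)) */
    DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)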
*/ -static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, - bool force_resend_writes) +static void scan_requests(struct ceph_osd *osd, + bool force_resend, + bool cleared_full, + bool check_pool_cleared_full, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { - struct ceph_osd_request *req, *nreq; - struct rb_node *p; - int needmap = 0; - int err; - bool force_resend_req; - - dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", - force_resend_writes ? " (force resend writes)" : ""); - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; ) { - req = rb_entry(p, struct ceph_osd_request, r_node); - p = rb_next(p); - - /* - * For linger requests that have not yet been - * registered, move them to the linger list; they'll - * be sent to the osd in the loop below. Unregister - * the request before re-registering it as a linger - * request to ensure the __map_request() below - * will decide it needs to be sent. - */ - if (req->r_linger && list_empty(&req->r_linger_item)) { - dout("%p tid %llu restart on osd%d\n", - req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - ceph_osdc_get_request(req); - __unregister_request(osdc, req); - __register_linger_request(osdc, req); - ceph_osdc_put_request(req); - continue; - } - - force_resend_req = force_resend || - (force_resend_writes && - req->r_flags & CEPH_OSD_FLAG_WRITE); - err = __map_request(osdc, req, force_resend_req); - if (err < 0) - continue; /* error */ - if (req->r_osd == NULL) { - dout("%p tid %llu maps to no osd\n", req, req->r_tid); - needmap++; /* request a newer map */ - } else if (err > 0) { - if (!req->r_linger) { - dout("%p tid %llu requeued on osd%d\n", req, - req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } - } - } - - list_for_each_entry_safe(req, nreq, &osdc->req_linger, - r_linger_item) { - dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - - err = __map_request(osdc, req, - force_resend || force_resend_writes); - dout("__map_request returned %d\n", err); - if (err < 0) - continue; /* hrm! */ - if (req->r_osd == NULL || err > 0) { - if (req->r_osd == NULL) { - dout("lingering %p tid %llu maps to no osd\n", - req, req->r_tid); - /* - * A homeless lingering request makes - * no sense, as it's job is to keep - * a particular OSD connection open. - * Request a newer map and kick the - * request, knowing that it won't be - * resent until we actually get a map - * that can tell us where to send it. - */ - needmap++; - } - - dout("kicking lingering %p tid %llu osd%d\n", req, - req->r_tid, req->r_osd ? 
req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + bool force_resend_writes; + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + enum calc_target_result ct_res; + + n = rb_next(n); /* unlink_request() */ + + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + ct_res = calc_target(osdc, &req->r_t, + &req->r_last_force_resend, false); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, req->r_t.base_oloc.pool)); + if (!force_resend && + (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || + !force_resend_writes)) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + unlink_request(osd, req); + insert_request(need_resend, req); + break; + case CALC_TARGET_POOL_DNE: + break; } } - reset_changed_osds(osdc); - mutex_unlock(&osdc->request_mutex); - - if (needmap) { - dout("%d requests for down osds, need new map\n", needmap); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } } static int handle_one_map(struct ceph_osd_client *osdc, - void *p, void *end, bool incremental) + void *p, void *end, bool incremental, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { struct ceph_osdmap *newmap; struct rb_node *n; @@ -2362,11 +2229,51 @@ static int handle_one_map(struct ceph_osd_client *osdc, } was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); + scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, + need_resend, need_resend_linger); + + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); /* close_osd() */ + + scan_requests(osd, skipped_map, was_full, true, need_resend, + need_resend_linger); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, osd->o_osd), + sizeof(struct ceph_entity_addr))) + close_osd(osd); + } return 0; } +static void kick_requests(struct ceph_osd_client *osdc, + struct rb_root *need_resend, + struct list_head *need_resend_linger) +{ + struct rb_node *n; + + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + struct ceph_osd *osd; + + n = rb_next(n); + erase_request(need_resend, req); /* before link_request() */ + + WARN_ON(req->r_osd); + calc_target(osdc, &req->r_t, NULL, false); + osd = lookup_create_osd(osdc, req->r_t.osd, true); + link_request(osd, req); + if (!req->r_linger) { + if (!osd_homeless(osd) && !req->r_t.paused) + send_request(req); + } + } +} + /* * Process updated osd map. 
* @@ -2381,13 +2288,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 nr_maps, maplen; u32 epoch; struct ceph_fsid fsid; + struct rb_root need_resend = RB_ROOT; + LIST_HEAD(need_resend_linger); bool handled_incremental = false; bool was_pauserd, was_pausewr; bool pauserd, pausewr; int err; dout("%s have %u\n", __func__, osdc->osdmap->epoch); - down_write(&osdc->map_sem); + down_write(&osdc->lock); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); @@ -2412,7 +2321,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, true); + err = handle_one_map(osdc, p, p + maplen, true, + &need_resend, &need_resend_linger); if (err) goto bad; handled_incremental = true; @@ -2443,7 +2353,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) osdc->osdmap->epoch); } else { dout("taking full map %u len %d\n", epoch, maplen); - err = handle_one_map(osdc, p, p + maplen, false); + err = handle_one_map(osdc, p, p + maplen, false, + &need_resend, &need_resend_linger); if (err) goto bad; } @@ -2464,20 +2375,60 @@ done: if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); - mutex_lock(&osdc->request_mutex); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); + kick_requests(osdc, &need_resend, &need_resend_linger); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); - up_write(&osdc->map_sem); + up_write(&osdc->lock); wake_up_all(&osdc->client->auth_wq); return; bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); - up_write(&osdc->map_sem); + up_write(&osdc->lock); +} + +/* + * Resubmit requests pending on the given osd. + */ +static void kick_osd_requests(struct ceph_osd *osd) +{ + struct rb_node *n; + + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + if (!req->r_linger) { + if (!req->r_t.paused) + send_request(req); + } + } +} + +/* + * If the osd connection drops, we need to resubmit all requests. 
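osd_fault() below keys its behaviour off reopen_osd()'s return value; condensing the contract from reopen_osd()'s definition earlier in this patch:

    /* reopen_osd() return values as consumed by osd_fault():
     *   -ENODEV  session had no requests at all; it was closed outright
     *   -EAGAIN  peer address unchanged and connection never opened;
     *            let the messenger retry by itself
     *        0   connection was re-established; caller resends requests
     */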
+ */ +static void osd_fault(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + down_write(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock; + } + + if (!reopen_osd(osd)) + kick_osd_requests(osd); + maybe_request_map(osdc); + +out_unlock: + up_write(&osdc->lock); } /* @@ -2680,17 +2631,11 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc; - - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - - rc = __ceph_osdc_start_request(osdc, req, nofail); + down_read(&osdc->lock); + submit_request(req, false); + up_read(&osdc->lock); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - - return rc; + return 0; } EXPORT_SYMBOL(ceph_osdc_start_request); @@ -2703,13 +2648,12 @@ void ceph_osdc_cancel_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); if (req->r_linger) __unregister_linger_request(osdc, req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); - - dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); + if (req->r_osd) + cancel_request(req); + up_write(&osdc->lock); } EXPORT_SYMBOL(ceph_osdc_cancel_request); @@ -2744,32 +2688,40 @@ EXPORT_SYMBOL(ceph_osdc_wait_request); */ void ceph_osdc_sync(struct ceph_osd_client *osdc) { - struct ceph_osd_request *req; - u64 last_tid, next_tid = 0; + struct rb_node *n, *p; + u64 last_tid = atomic64_read(&osdc->last_tid); - mutex_lock(&osdc->request_mutex); - last_tid = osdc->last_tid; - while (1) { - req = __lookup_request_ge(osdc, next_tid); - if (!req) - break; - if (req->r_tid > last_tid) - break; +again: + down_read(&osdc->lock); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + mutex_lock(&osd->lock); + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_tid > last_tid) + break; + + if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) + continue; - next_tid = req->r_tid + 1; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) - continue; + ceph_osdc_get_request(req); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); + dout("%s waiting on req %p tid %llu last_tid %llu\n", + __func__, req, req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + goto again; + } - ceph_osdc_get_request(req); - mutex_unlock(&osdc->request_mutex); - dout("sync waiting on tid %llu (last is %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&osdc->request_mutex); - ceph_osdc_put_request(req); + mutex_unlock(&osd->lock); } - mutex_unlock(&osdc->request_mutex); - dout("sync done (thru tid %llu)\n", last_tid); + + up_read(&osdc->lock); + dout("%s done last_tid %llu\n", __func__, last_tid); } EXPORT_SYMBOL(ceph_osdc_sync); @@ -2793,18 +2745,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - init_rwsem(&osdc->map_sem); - mutex_init(&osdc->request_mutex); - osdc->last_tid = 0; + init_rwsem(&osdc->lock); osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); spin_lock_init(&osdc->osd_lru_lock); - osdc->requests = RB_ROOT; - INIT_LIST_HEAD(&osdc->req_lru); - 
INIT_LIST_HEAD(&osdc->req_unsent); - INIT_LIST_HEAD(&osdc->req_notarget); INIT_LIST_HEAD(&osdc->req_linger); - osdc->num_requests = 0; + osd_init(&osdc->homeless_osd); + osdc->homeless_osd.o_osdc = osdc; + osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); spin_lock_init(&osdc->event_lock); @@ -2861,13 +2809,19 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); - mutex_lock(&osdc->request_mutex); + down_write(&osdc->lock); while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); - remove_osd(osdc, osd); + close_osd(osd); } - mutex_unlock(&osdc->request_mutex); + up_write(&osdc->lock); + WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + osd_cleanup(&osdc->homeless_osd); + + WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(atomic_read(&osdc->num_requests)); + WARN_ON(atomic_read(&osdc->num_homeless)); ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); @@ -2982,19 +2936,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup); static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(msg->hdr.type); - if (!osd) - goto out; - osdc = osd->o_osdc; - switch (type) { case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg); + handle_reply(osd, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -3004,7 +2954,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } -out: + ceph_msg_put(msg); } @@ -3019,21 +2969,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; - struct ceph_msg *m; + struct ceph_msg *m = NULL; struct ceph_osd_request *req; int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); - u64 tid; + u64 tid = le64_to_cpu(hdr->tid); - tid = le64_to_cpu(hdr->tid); - mutex_lock(&osdc->request_mutex); - req = lookup_request(&osdc->requests, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); + *skip = 1; + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); - m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } ceph_msg_revoke_incoming(req->r_reply); @@ -3045,7 +3001,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) - goto out; + goto out_unlock_session; ceph_msg_put(req->r_reply); req->r_reply = m; } @@ -3056,14 +3012,16 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply->data_length); m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); -out: - mutex_unlock(&osdc->request_mutex); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); return m; } @@ 
-3083,8 +3041,8 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: - pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, - osd->o_osd); + pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, + osd->o_osd, type); *skip = 1; return NULL; } @@ -3188,5 +3146,5 @@ static const struct ceph_connection_operations osd_con_ops = { .alloc_msg = alloc_msg, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_reset, + .fault = osd_fault, }; -- cgit v1.2.3-59-g8ed1b From 235a09821c2bc71d9d07f12217ce2ac00db99eba Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 30 Mar 2016 17:18:34 +0800 Subject: ceph: multiple filesystem support To access non-default filesystem, we just need to subscribe to mdsmap. and add a new mount option for mds namespace id. Signed-off-by: Yan, Zheng [idryomov@gmail.com: switch to a new libceph API] Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 9 +++++++++ fs/ceph/super.h | 1 + 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c2..452770dc9fc1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -367,6 +372,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". 
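For context on the option wiring above, a usage sketch: selecting a non-default filesystem is just an extra mount option, e.g. via mount(2) from userspace. The monitor address, entity name and namespace id here are all placeholders:

    #include <sys/mount.h>

    /* illustrative only: mount a cephfs with MDS namespace id 2 */
    static int mount_nondefault_fs(void)
    {
            return mount("192.168.1.1:6789:/", "/mnt/cephfs", "ceph", 0,
                         "name=admin,mds_namespace=2");
    }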
@@ -457,6 +463,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -530,6 +538,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e705c4d612d7..db2200f5ba80 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below -- cgit v1.2.3-59-g8ed1b From d463a43d69f4af85887671d76182437775fd1631 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 31 Mar 2016 15:53:01 +0800 Subject: ceph: CEPH_FEATURE_MDSENC support Signed-off-by: Yan, Zheng --- fs/ceph/mdsmap.c | 43 ++++++++++++++++++++++++++++++++++--------- fs/ceph/super.c | 5 ++--- 2 files changed, 36 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v= ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct 
ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 452770dc9fc1..d714ab20ad24 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -519,9 +519,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; -- cgit v1.2.3-59-g8ed1b From 77310320c299b0dc050037ff8fc29fd1861fb005 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 8 Apr 2016 15:27:16 +0800 Subject: ceph: renew caps for read/write if mds session got killed. When mds session gets killed, read/write operation may hang. Client waits for Frw caps, but mds does not know what caps client wants. To recover this, client sends an open request to mds. The request will tell mds what caps client wants. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 43 ++++++++++++++++++++++++++++++++---------- fs/ceph/file.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 6 +++++- fs/ceph/super.h | 2 ++ 4 files changed, 93 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..fab93c66d879 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2412,12 +2412,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == 
-ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -3226,6 +3247,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 30fd49eb25b4..996e9ec81e4e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,6 +191,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) return ret; } +/* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? 
err : 0; +} + /* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cff85af425d4..1e5965d17cb1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1133,6 +1133,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); if (!n) @@ -1181,7 +1183,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (drop) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index db2200f5ba80..5fef3a6397db 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -470,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -932,6 +933,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, -- cgit v1.2.3-59-g8ed1b From 6c93df5db628e710697c43bc1bd78a786549a548 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Apr 2016 13:56:12 +0800 Subject: ceph: don't call truncate_pagecache in ceph_writepages_start truncate_pagecache() may decrease inode's reference. This can cause deadlock if inode's last reference is dropped and iput_final() wants to evict the inode. (evict() calls inode_wait_for_writeback(), which waits for ceph_writepages_start() to return). The fix is use work thead to truncate dirty pages. Also add 'forced umount' check to ceph_update_writeable_page(), which prevents new pages getting dirty. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 14 ++++++++++++-- fs/ceph/inode.c | 11 +++++++++++ fs/ceph/mds_client.c | 22 +++++++++++++++------- 3 files changed, 38 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f47418477629..d52e3bcfda7c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -715,8 +715,11 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! 
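The deadlock described in the changelog above is easiest to see as a call chain; this is a condensed summary, not code:

    /* ceph_writepages_start()
     *   -> truncate_pagecache()          may drop the last inode reference
     *        -> iput_final() -> evict()
     *             -> inode_wait_for_writeback()   blocks until
     *                ceph_writepages_start() returns: deadlock.
     * Hence the truncate is deferred to the i_pg_inv_work worker
     * (ceph_invalidate_work()), and the writeback path merely marks the
     * mapping with -EIO. */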
*/ } @@ -1127,6 +1130,7 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; @@ -1135,6 +1139,12 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index edfade037738..b906e02cddad 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1623,10 +1623,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1e5965d17cb1..cbe6c0afdadc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1120,9 +1120,11 @@ out: static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1130,11 +1132,14 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; + while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); if (!n) @@ -1156,7 +1161,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1166,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1185,6 +1190,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); if (drop) iput(inode); return 0; @@ -1195,12 +1202,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct 
ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; -- cgit v1.2.3-59-g8ed1b From aeda081c5eb3d7668b64e2dc9bfc0bf30ef53632 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 18 Apr 2016 16:51:37 +0800 Subject: ceph: don't show symlink target in debugfs/mdsc symlink target is useless for debug and can be very long. It's annoying to show it in debugfs/mdsc. Signed-off-by: Yan, Zheng --- fs/ceph/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); -- cgit v1.2.3-59-g8ed1b From 3f38495409b613071021fca86629df7ae81820ad Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 21 Apr 2016 11:09:55 +0800 Subject: ceph: report mount root in session metadata Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 4 +++- fs/ceph/super.c | 33 +++++++++++++++++++-------------- fs/ceph/super.h | 1 + 3 files changed, 23 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index cbe6c0afdadc..047f723bdbe6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -839,12 +839,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? 
: "/"}, {NULL, NULL} }; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d714ab20ad24..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -302,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -333,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -386,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -401,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -793,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. 
*/ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -823,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -943,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -952,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -995,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5fef3a6397db..0ea86406f463 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -70,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { -- cgit v1.2.3-59-g8ed1b From 04303d8ad06985bed4ea4f46018fc8f55a0962a8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 21 Apr 2016 12:11:54 +0800 Subject: ceph: use CEPH_MDS_OP_RMXATTR request to remove xattr Setxattr with NULL value and XATTR_REPLACE flag should be equivalent to removexattr. But current MDS does not support deleting vxattrs through MDS_OP_SETXATTR request. The workaround is sending MDS_OP_RMXATTR request if setxattr actually removes xattr.
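To make the stated equivalence concrete, here is a minimal userspace sketch (an illustration added alongside this patch description, not part of the patch; set_or_remove_xattr is a hypothetical helper, while setxattr(2), removexattr(2) and XATTR_REPLACE come from the standard <sys/xattr.h>):

    #include <stddef.h>
    #include <sys/xattr.h>

    /* A NULL value together with XATTR_REPLACE means "remove the
     * attribute"; after this patch the ceph client sends that case
     * as a CEPH_MDS_OP_RMXATTR request instead of MDS_OP_SETXATTR. */
    static int set_or_remove_xattr(const char *path, const char *name,
                                   const void *value, size_t size, int flags)
    {
            if (!value && (flags & XATTR_REPLACE))
                    return removexattr(path, name);
            return setxattr(path, name, value, size, flags);
    }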
Signed-off-by: Yan, Zheng --- fs/ceph/xattr.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5afabc4bf4c7..426c83cc683a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -886,6 +886,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -899,20 +900,21 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -920,8 +922,11 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); -- cgit v1.2.3-59-g8ed1b From c530cd24c23818af8da35fe556f9a7754211e50a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 17:43:35 +0800 Subject: ceph: search cache position for dcache readdir use binary search to find cache index that corresponds to readdir position. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 129 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4fb2bbc2a272..cdea450c5902 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -110,6 +110,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex.
*/ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -129,62 +173,57 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linear search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } + + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex.
*/ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && @@ -217,10 +256,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; -- cgit v1.2.3-59-g8ed1b From 1cd42a429174689c0df1c37b642654a1ab4d1506 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 15:58:32 +0800 Subject: ceph: remove unnecessary checks in __dcache_readdir we never add snapdir and the hidden .ceph dir into readdir cache Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index cdea450c5902..734508be1526 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -228,8 +228,6 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } -- cgit v1.2.3-59-g8ed1b From a78600e7c4fb47fb5ef34265456b731fde27a9c3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:32:34 +0800 Subject: ceph: simplify 'offset in frag' don't distinguish leftmost frag from other frags. always use 2 as first entry's offset. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 12 +++--------- fs/ceph/inode.c | 5 +---- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 734508be1526..31a2cdf39210 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -414,10 +414,7 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; + fi->next_offset = 2; } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], @@ -468,7 +465,7 @@ more: /* more frags? */ if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); - off = 0; + off = 2; ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; @@ -511,10 +508,7 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. 
*/ fi->flags &= ~CEPH_F_ATEND; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b906e02cddad..51ce5ce59f04 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1405,10 +1405,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { -- cgit v1.2.3-59-g8ed1b From 2a5beea3f1b6544d6c72ea220e860a2eda2f9104 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 09:37:39 +0800 Subject: ceph: define struct for dir entry in readdir reply This avoids defining multiple arrays for entries in readdir reply Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 27 ++++++++++++--------------- fs/ceph/inode.c | 21 +++++++++++---------- fs/ceph/mds_client.c | 42 +++++++++++++++++++----------------------- fs/ceph/mds_client.h | 12 ++++++++---- 4 files changed, 50 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 31a2cdf39210..68530acea2c8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -416,9 +416,9 @@ more: fi->last_name = NULL; fi->next_offset = 2; } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + err = note_last_dentry(fi, rde->name, rde->name_len, fi->next_offset + rinfo->dir_nr); if (err) return err; @@ -431,24 +431,21 @@ more: ctx->pos = ceph_make_fpos(frag, off); while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (off - fi->offset); struct ceph_vino vino; ino_t ino; dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + rde->name_len, rde->name, &rde->inode.in); + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 51ce5ce59f04..40d081d7028e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1308,12 +1308,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1321,7 +1322,7 @@ static int 
readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { @@ -1433,14 +1434,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; + dname.name = rde->name; + dname.len = rde->name_len; dname.hash = full_name_hash(dname.name, dname.len); - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); @@ -1486,7 +1488,7 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1522,8 +1524,7 @@ retry_lookup: di = dn->d_fsdata; di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 047f723bdbe6..6220d3caf7ab 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -186,12 +186,9 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,19 +196,19 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = *p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; i++; @@ -345,9 +342,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -1656,8 +1653,7 @@ 
int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1668,14 +1664,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba5..0b84f9c0afa3 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,13 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +80,8 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ -- cgit v1.2.3-59-g8ed1b From 956d39d631dbcf7b57854873a24e309047f2a7f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:48:30 +0800 Subject: ceph: define 'end/complete' in readdir reply as bit flags Set a flag in readdir request, which indicates that client interprets 'end/complete' as bit flags, so that mds can reply with additional flags in readdir reply.
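Note how the flag values are chosen: the little-endian u16 flag word occupies the same two bytes that previously carried "u8 end; u8 complete;", so bit 0 aliases the old end byte and bit 8 aliases the old complete byte, and old replies decode unchanged. A self-contained decoding sketch (an illustration added here, mirroring the parse_reply_info_dir() change below):

    #include <stdbool.h>
    #include <stdint.h>

    #define CEPH_READDIR_FRAG_END      (1 << 0)
    #define CEPH_READDIR_FRAG_COMPLETE (1 << 8)

    /* p points at the two bytes after the entry count: old clients
     * read them as "u8 end; u8 complete;", new clients read the same
     * bytes as one little-endian u16 of bit flags. */
    static void decode_dir_flags(const uint8_t *p, bool *end, bool *complete)
    {
            uint16_t flags = (uint16_t)p[0] | ((uint16_t)p[1] << 8);

            *end = !!(flags & CEPH_READDIR_FRAG_END);
            *complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
    }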
Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 ++ fs/ceph/mds_client.c | 7 +++++-- fs/ceph/mds_client.h | 2 +- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 68530acea2c8..ebcbd1c946b4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -365,6 +365,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6220d3caf7ab..1c2befcd24fb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,8 +181,11 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0b84f9c0afa3..2a865812a41b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -80,7 +80,7 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - u8 dir_complete, dir_end; + bool dir_complete, dir_end; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bae833d0d055..a811c5e98bfa 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -347,6 +347,17 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REMOVE (1 << 31) +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. 
+ */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -364,6 +375,7 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3-59-g8ed1b From 8974eebd38737c9534d81c4131c5fdb1fe24d3e9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 15:17:40 +0800 Subject: ceph: record 'offset' for each entry of readdir result This is preparation for using hash value as dentry 'offset' Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 83 ++++++++++++++++++++++++++++++++++------------------ fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 2 ++ fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 1 - 5 files changed, 59 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ebcbd1c946b4..6ae635605be5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -277,12 +277,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -294,7 +294,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -304,7 +303,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -320,7 +318,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if (err != -EAGAIN) return err; frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -386,12 +383,12 @@ more: rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, fi->next_offset); } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -399,7 +396,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -421,37 +419,54 @@ more: struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); err = note_last_dentry(fi, rde->name, rde->name_len, - fi->next_offset + rinfo->dir_nr); + fpos_off(rde->offset) + 1); if (err) return err; } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? 
rinfo->dir_entries[0].offset : 0LL); - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_dir_entry *rde = - rinfo->dir_entries + (off - fi->offset); + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, rde->name_len, rde->name, &rde->inode.in); + BUG_ON(!rde->inode.in); ftype = le32_to_cpu(rde->inode.in->mode) >> 12; vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); + if (!dir_emit(ctx, rde->name, rde->name_len, ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } @@ -464,8 +479,7 @@ more: /* more frags? */ if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); - off = 2; - ctx->pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, 2); dout("readdir next frag is %x\n", frag); goto more; } @@ -497,7 +511,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -511,6 +525,23 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + if (new_pos == 0) + return true; + if (fpos_frag(new_pos) != fi->frag) + return true; + rinfo = fi->last_readdir ? 
&fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + return new_pos < rinfo->dir_entries[0].offset;; +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; @@ -539,13 +570,9 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ + if (need_reset_readdir(fi, offset)) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); + reset_readdir(fi); } else if (fpos_cmp(offset, old_offset) > 0) { /* reset dir_release_count if we did a forward seek */ fi->dir_release_count = 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 40d081d7028e..b53c95903aeb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1523,6 +1523,7 @@ retry_lookup: di = dn->d_fsdata; di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + rde->offset = di->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1c2befcd24fb..48def22fc7b9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -214,6 +214,8 @@ static int parse_reply_info_dir(void **p, void *end, err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2a865812a41b..4ce19d852657 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -52,6 +52,7 @@ struct ceph_mds_reply_dir_entry { u32 name_len; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; + loff_t offset; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0ea86406f463..0628099ba1f2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -635,7 +635,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ - unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; -- cgit v1.2.3-59-g8ed1b From 076c40f18d10489e29c515bf5936952830df5e16 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 28 Apr 2016 22:56:44 +0800 Subject: ceph: don't forbid marking directory complete after forward seek Forward seek within same frag does not update fi->last_name and therefore will not affect contents of later readdir reply.
So there is no need to forbid marking directory complete. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6ae635605be5..e954ea2fb710 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -546,7 +546,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -573,10 +572,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (need_reset_readdir(fi, offset)) { dout("dir_llseek dropping %p content\n", file); reset_readdir(fi); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; } } out: -- cgit v1.2.3-59-g8ed1b From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 11:27:30 +0800 Subject: ceph: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry having that hash, to disambiguate hash collisions) This offset is stable across directory fragmentation. This also means there is no need to reset readdir offset if directory gets fragmented in the middle of readdir. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 140 ++++++++++++++++++++++++++++++++----------- fs/ceph/inode.c | 31 ++++++++-- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 4 +- fs/ceph/super.h | 6 +- include/linux/ceph/ceph_fs.h | 1 + 6 files changed, 136 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e954ea2fb710..4850c3624a87 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -69,16 +69,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos.
+ * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -177,7 +203,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -234,7 +260,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -269,6 +295,16 @@ out: return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -276,7 +312,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); int i; int err; u32 ftype; @@ -317,7 +352,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -325,8 +359,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -336,6 +371,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -373,19 +415,23 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - fi->next_offset = req->r_readdir_offset; - /* adjust ctx->pos to beginning of frag */ - ctx->pos = ceph_make_fpos(frag, fi->next_offset); + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; @@ -411,23 +457,25 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - fi->next_offset = 2; - } else { + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(fi, rde->name, rde->name_len, - fpos_off(rde->offset) + 1); + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d pos %llx chunk first %llx\n", - frag, rinfo->dir_nr, ctx->pos, + fi->frag, rinfo->dir_nr, ctx->pos, rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; @@ -470,16 +518,26 @@ more: ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - ctx->pos = ceph_make_fpos(frag, 2); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -532,14 +590,21 @@ static void reset_readdir(struct ceph_file_info *fi) static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) { struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; if (new_pos == 0) return true; - if (fpos_frag(new_pos) != fi->frag) + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sorted in hash order */ + } else if (fi->frag != fpos_frag(new_pos)) { return true; + } rinfo = fi->last_readdir ?
&fi->last_readdir->r_reply_info : NULL; if (!rinfo || !rinfo->dir_nr) return true; - return new_pos < rinfo->dir_entries[0].offset;; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) @@ -562,17 +627,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (need_reset_readdir(fi, offset)) { - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); - } } out: inode_unlock(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b53c95903aeb..f51b6fd5f570 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1387,6 +1387,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,19 +1395,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - req->r_readdir_offset = 2; + if (!rinfo->hash_order) + req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1424,13 +1433,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { @@ -1444,6 +1453,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, 
true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } + retry_lookup: dn = d_lookup(parent, &dname); dout("d_lookup on parent=%p name=%.*s got %p\n", @@ -1521,9 +1542,7 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - rde->offset = di->offset; + ceph_dentry(dn)->offset = rde->offset; update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 48def22fc7b9..7ad31283d510 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -185,6 +185,7 @@ static int parse_reply_info_dir(void **p, void *end, u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4ce19d852657..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -81,7 +81,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0628099ba1f2..c9b671dfff81 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -540,11 +540,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -949,6 +944,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index a811c5e98bfa..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { -- cgit v1.2.3-59-g8ed1b From 209ae762a64397557be2bc48490fd968912e9a71 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 23:40:23 +0800 Subject: ceph: fix inode reference leak Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f51b6fd5f570..4889aaa72acc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1328,8 +1328,8 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; -- cgit v1.2.3-59-g8ed1b From a407846ef7c6e7905ab0d1ebc0c86fbebb065b87 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 3 May 2016 20:55:50 +0800 Subject: ceph: don't assume frag tree splits in mds reply are sorted The algorithm that updates i_fragtree relies on the frag tree splits in the mds reply being in the same order as i_fragtree. This is not true: the current MDS encodes frag tree splits in ascending order of (unsigned)frag_t, but nodes in i_fragtree are sorted according to ceph_frag_compare(). The fix is to sort the frag tree splits first, then update i_fragtree. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4889aaa72acc..d91eb6b49259 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -10,6 +10,7 @@ #include #include #include +#include <linux/sort.h> #include "super.h" #include "mds_client.h" @@ -299,6 +300,13 @@ out: return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) @@ -331,6 +339,11 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { -- cgit v1.2.3-59-g8ed1b From 421721195a68f46a8218c664154076b5b06f5f51 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 3 May 2016 22:33:20 +0800 Subject: ceph: fix dir_auth check in ceph_fill_dirfrag() -1 is CDIR_AUTH_PARENT; it means the dir's auth mds is the same as the inode's auth mds. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d91eb6b49259..562f57d15f46 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -254,6 +254,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ -- cgit v1.2.3-59-g8ed1b From a4b7431f39438f415cef1fa28502ec58016686f7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 May 2016 11:05:10 +0800 Subject: ceph: keep leaf frag when updating fragtree Nodes in i_fragtree are sorted according to ceph_frag_compare(). This means a frag node in i_fragtree always follows its direct parent node. To check whether a leaf node is valid, we just need to check whether it is a child of the previous split node.
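For reference, the ordering and containment rules that this patch and the sort patch above rely on follow from the frag encoding. Below is a simplified standalone sketch of the ceph_frag_* helpers (modeled on include/linux/ceph/ceph_frag.h; the real ceph_frag_make() also masks the value, which is omitted here):

    #include <stdbool.h>
    #include <stdint.h>

    /* A frag_t packs the split depth ("bits") and the high bits of the
     * covered hash range ("value") into one u32: (bits << 24) | value. */
    static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
    static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }
    static uint32_t frag_mask(uint32_t f)
    {
            return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
    }
    static bool frag_contains_value(uint32_t f, uint32_t v)
    {
            return (v & frag_mask(f)) == frag_value(f);
    }

    /* The is_frag_child() test from the patch below: f is a direct child
     * of a split node iff it sits exactly split_by levels deeper and its
     * value falls inside the parent's hash range. */
    static bool is_child(uint32_t f, uint32_t parent, uint32_t split_by)
    {
            if (frag_bits(f) != frag_bits(parent) + split_by)
                    return false;
            return frag_contains_value(parent, frag_value(f));
    }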
Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 562f57d15f46..09713f32ea88 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -310,12 +310,21 @@ static int frag_tree_split_cmp(const void *l, const void *r) return ceph_frag_compare(ls->frag, rs->frag); } +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; int i; u32 id, nsplits; @@ -362,8 +371,12 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } frag = NULL; } if (!frag) { @@ -373,12 +386,17 @@ static int ceph_fill_fragtree(struct inode *inode, } frag->split_by = le32_to_cpu(fragtree->splits[i].by); dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); -- cgit v1.2.3-59-g8ed1b From 1b1bc16d66a7c7af3b4f30d1cf5a363168b217f4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 May 2016 11:40:30 +0800 Subject: ceph: improve fragtree change detection check if number of splits in i_fragtree is equal to number of splits in mds reply Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 24 ++++++++++++++++++++---- fs/ceph/super.h | 1 + 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 09713f32ea88..7ba8e1c2557b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -326,13 +326,15 @@ static int ceph_fill_fragtree(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -360,6 +362,13 @@ static int ceph_fill_fragtree(struct inode *inode, rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; 
while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -375,6 +384,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } frag = NULL; @@ -384,7 +395,9 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); prev_frag = frag; } @@ -395,6 +408,8 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by > 0 || !is_frag_child(frag->frag, prev_frag)) { rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; kfree(frag); } } @@ -546,6 +561,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c9b671dfff81..a268f18d2119 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -297,6 +297,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; -- cgit v1.2.3-59-g8ed1b From 224a7542b8fdde3cc7c600f8b0870c5541a9f678 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 5 May 2016 16:40:17 +0800 Subject: ceph: tolerate bad i_size for symlink inode A mds bug can cause symlink's size to be truncated to zero. Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7ba8e1c2557b..89d08155986d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -582,6 +582,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and atime. 
We have to be * careful because either the client or MDS may have more up to date @@ -604,7 +609,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -863,9 +868,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1629,7 +1638,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -2002,8 +2011,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; -- cgit v1.2.3-59-g8ed1b From 3b33f692c84c28cc8178aaeeb9264d82b48787f1 Mon Sep 17 00:00:00 2001 From: Zhang Zhuoyu Date: Fri, 25 Mar 2016 05:18:39 -0400 Subject: ceph: make logical calculation functions return bool This patch makes several logical calculation functions return bool to improve readability due to these particular functions only using 0/1 as their return value. No functional change.
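A representative before/after pair, taken from the ceph_frag.h hunk in the diff below; the same mechanical conversion applies to every function touched by this patch:

    /* before: a predicate that returns 0/1 through an int */
    static inline int ceph_frag_is_leftmost(__u32 f)
    {
            return ceph_frag_value(f) == 0;
    }

    /* after: the return type states that this is a predicate */
    static inline bool ceph_frag_is_leftmost(__u32 f)
    {
            return ceph_frag_value(f) == 0;
    }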
Signed-off-by: Zhang Zhuoyu --- fs/ceph/cache.c | 2 +- fs/ceph/dir.c | 2 +- include/linux/ceph/ceph_frag.h | 4 ++-- include/linux/ceph/decode.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- net/ceph/ceph_common.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..c052b5bf219b 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && (ci->i_fscache_gen == ci->i_rdcache_gen)); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4850c3624a87..f6279a1bd6ec 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -710,7 +710,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a..146507df8650 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec..19e9932f3e77 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) /* * bounds check input. 
*/ -static inline int ceph_has_room(void **p, void *end, size_t n) +static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 821e16fff39a..ddc426b22d81 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -172,19 +172,19 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && (map->osd_state[osd] & CEPH_OSD_EXISTS); } -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return ceph_osd_exists(map, osd) && (map->osd_state[osd] & CEPH_OSD_UP); } -static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) +static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) { return !ceph_osd_is_up(map, osd); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index dcc18c6f7cf9..55d2bfee16d7 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_and_osd_map(struct ceph_client *client) +static bool have_mon_and_osd_map(struct ceph_client *client) { return client->monc.monmap && client->monc.monmap->epoch && client->osdc.osdmap && client->osdc.osdmap->epoch; -- cgit v1.2.3-59-g8ed1b From 4f7e89f6ace0f6cd2f20110efd2d405e26bcbf31 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 18:40:28 +0800 Subject: ceph: block non-fatal signals for fault/page_mkwrite Fault and page_mkwrite are supposed to be uninterruptible. But they call ceph functions that are interruptible.
So they should block non-fatal signals before calling those functions. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 66 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d52e3bcfda7c..3e204b9ff9f7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1315,6 +1315,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1327,6 +1338,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); @@ -1334,16 +1348,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_restore; } dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1361,7 +1371,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { @@ -1375,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); @@ -1383,7 +1393,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); put_page(page); ret = VM_FAULT_SIGBUS; - goto out; + goto out_inline; } if (ret1 < PAGE_SIZE) zero_user_segment(page, ret1, PAGE_SIZE); @@ -1392,10 +1402,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); return ret; } @@ -1413,11 +1425,14 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_SIGBUS; + ceph_block_sigs(&oldset); + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1444,17 +1459,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; -
while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); @@ -1499,6 +1510,7 @@ out: inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); return ret; -- cgit v1.2.3-59-g8ed1b From 6ce026e411c4f36c9e51189d28a5dd9d08095b9d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 18:59:13 +0800 Subject: ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e204b9ff9f7..d128bb65746d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1351,10 +1351,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) got = 0; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_restore; - } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1392,7 +1391,10 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - ret = VM_FAULT_SIGBUS; + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; goto out_inline; } if (ret1 < PAGE_SIZE) @@ -1408,6 +1410,9 @@ out_inline: } out_restore: ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1429,7 +1434,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; ceph_block_sigs(&oldset); @@ -1442,10 +1447,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } if (off + PAGE_SIZE <= size) @@ -1463,10 +1466,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) got = 0; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); @@ -1475,10 +1477,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { + if ((off > size) || (page->mapping != inode->i_mapping)) { unlock_page(page); + ret = VM_FAULT_NOPAGE; goto out; } @@ -1487,11 +1488,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* success. we'll keep the page locked. 
*/ set_page_dirty(page); ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; } out: if (ret == VM_FAULT_LOCKED || @@ -1512,7 +1508,8 @@ out: out_free: ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } -- cgit v1.2.3-59-g8ed1b From f0b33df57a5f03c637f75ead7cb4d978c59cc63d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 May 2016 19:09:06 +0800 Subject: ceph: handle -EAGAIN returned by ceph_update_writeable_page() When ceph_update_writeable_page() returns -EAGAIN, the caller should lock the page and call ceph_update_writeable_page() again. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d128bb65746d..97ee5d1fbb61 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1475,21 +1475,23 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - if ((off > size) || (page->mapping != inode->i_mapping)) { - unlock_page(page); - ret = VM_FAULT_NOPAGE; - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; -- cgit v1.2.3-59-g8ed1b From a78bbd4b29c29784f0addb5e3b35790c7ed178ae Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 11:30:24 +0800 Subject: ceph: make ceph_update_writeable_page() uninterruptible ceph_update_writeable_page() is used by ceph_write_begin(). It breaks the atomicity of the write operation if it's interruptible. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 97ee5d1fbb61..4aa8e375e648 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1168,7 +1168,7 @@ retry_locked: snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) -- cgit v1.2.3-59-g8ed1b From ad15ec06e51b6eb73981428109e32c75cbad7d3d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 17:29:51 +0800 Subject: ceph: handle interrupted ceph_writepage() writepage() can be interrupted when it's called by the direct memory reclaimer (i.e. when the direct memory reclaimer is killed). To avoid losing data, we redirty the page.
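In rough outline, the new error path in writepage_nounlock() (see the diff below) treats a fatal-signal interruption as "write this page later" rather than as a failure; the OSD write call itself is elided here:

    err = ...;  /* OSD write; returns -ERESTARTSYS if the caller was killed */
    if (err == -ERESTARTSYS) {
            /* the data is still in the page: redirty it so a later
             * writeback pass picks it up, and set no error anywhere */
            redirty_page_for_writepage(wbc, page);
            end_page_writeback(page);
            goto out;
    }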
Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4aa8e375e648..080a9cab3ee1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -544,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -569,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. -- cgit v1.2.3-59-g8ed1b From b109eec6f4332bd517e2f41e207037c4b9065094 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 13 May 2016 17:54:17 +0800 Subject: ceph: SetPageError() for writeback pages if writepages fails Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 080a9cab3ee1..8fa1f91cbf16 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -626,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -661,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); -- cgit v1.2.3-59-g8ed1b From 9abd4db713704aac146395e079224ddd716e9b95 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 20:58:26 +0800 Subject: ceph: don't use truncate_pagecache() to invalidate read cache truncate_pagecache() drops dirty pages, so it's dangerous to use it to invalidate the read cache. Besides, we shouldn't start invalidating the read cache while there are buffer writers, because buffer writers may add dirty pages later. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 8 ++++---- fs/ceph/inode.c | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index fab93c66d879..c17b5d76d75e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages...
*/ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ retry_locked: revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2828,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 89d08155986d..07495ba61fe3 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1728,7 +1728,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if (invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && -- cgit v1.2.3-59-g8ed1b From e536030934aebf049fe6aaebc58dd37aeee21840 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 19 May 2016 19:15:19 +0800 Subject: ceph: fix wake_up_session_cb() We should reset i_requested_max_size before waking the waiters. (A zero i_requested_max_size makes waiters re-request the max size.) Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7ad31283d510..2103b823bec0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1258,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; } -- cgit v1.2.3-59-g8ed1b
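The ordering at the heart of this fix, reduced to a sketch (using the field and lock names from the diff above): state that a woken waiter re-checks must be published before the wakeup, or the waiter can observe the stale value.

    /* buggy order: a waiter woken here can still see the old (non-zero)
     * i_requested_max_size and skip re-requesting the max size */
    wake_up_all(&ci->i_cap_wq);
    ci->i_requested_max_size = 0;

    /* fixed order: publish the reset under the lock, then wake */
    spin_lock(&ci->i_ceph_lock);
    ci->i_wanted_max_size = 0;
    ci->i_requested_max_size = 0;
    spin_unlock(&ci->i_ceph_lock);
    wake_up_all(&ci->i_cap_wq);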